In [68]:
# Importing necessary libraries
# pandas: For data manipulation and analysis
# numpy: For numerical computations
# sklearn.metrics.pairwise: For calculating cosine similarity
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [70]:
#Loading the Transactions dataset
# The dataset contains CustomerID, ProductID, and Quantity columns.
transactions = pd.read_csv('Transactions.csv')

# Pivot the table to create a customer-product matrix
# Rows: CustomerID, Columns: ProductID, Values: Quantity (filled with 0 if no purchase)
customer_product_matrix = transactions.pivot_table(
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    fill_value=0
)

#Computing the cosine similarity matrix
# The similarity is computed between customers based on their purchase behavior.
similarity_matrix = cosine_similarity(customer_product_matrix)

# Converting the similarity matrix to a DataFrame for easy handling
# Rows and columns are labeled with CustomerIDs.
similarity_scores = pd.DataFrame(
    similarity_matrix,
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index
)


In [74]:
#computing lookalikes
#Initializing an empty dictionary to store lookalike results
lookalike_results = {}

#Iterating through each customer to find their top 3 lookalikes
for customer_id in similarity_scores.index:
    # Getting similarity scores for the current customer
    similarity_scores_for_customer = similarity_scores.loc[customer_id].reset_index()
    similarity_scores_for_customer.columns = ['CustomerID', 'Score']
    
    # Excluding the current customer from their own similarity list
    similarity_scores_for_customer = similarity_scores_for_customer[
        similarity_scores_for_customer['CustomerID'] != customer_id
    ]
    
    # Sorting by similarity score in descending order
    top_lookalikes = similarity_scores_for_customer.sort_values(
        by='Score',
        ascending=False
    ).head(3)
    
    # Storing the results as a list of tuples (CustomerID, Score)
    lookalike_results[customer_id] = list(zip(top_lookalikes['CustomerID'], top_lookalikes['Score']))


In [78]:
# Creating and Validating Lookalike Data
# Converting the lookalike results dictionary to a DataFrame
lookalike_df = pd.DataFrame(
    {'CustomerID': list(lookalike_results.keys()),
     'Lookalikes': list(lookalike_results.values())}
)

# Ensuring all required CustomerIDs (C0001 to C0020) are included
# Creating a list of required CustomerIDs here first 20 customers
required_customers = [f"C{str(i).zfill(4)}" for i in range(1, 21)]

# Finding any missing CustomerIDs from the lookalike DataFrame
existing_customers = set(lookalike_df['CustomerID'])
missing_customers = set(required_customers) - existing_customers

# Add missing customers with empty lookalikes
for missing_customer in missing_customers:
    lookalike_df = lookalike_df.append(
        {"CustomerID": missing_customer, "Lookalikes": []},
        ignore_index=True
    )

# Sortingthe DataFrame by CustomerID for consistency
lookalike_df = lookalike_df.sort_values(by='CustomerID').reset_index(drop=True)


In [80]:
#Saving and Previewing the Final Lookalike File
# Saving the finalized Lookalike.csv file
# This file contains CustomerID and their top 3 lookalikes with similarity scores.
lookalike_df.to_csv("Lookalike.csv", index=False)

# Previewing the Lookalike DataFrame
# Displaying the top 20 rows of the Lookalike DataFrame for validation.
print("Lookalike Recommendations for First 20 Customers:")
print(lookalike_df.head(20))


Lookalike Recommendations for First 20 Customers:
   CustomerID                                         Lookalikes
0       C0001  [(C0097, 0.5477225575051661), (C0194, 0.469668...
1       C0002  [(C0091, 0.3801987652174059), (C0030, 0.372821...
2       C0003  [(C0134, 0.5199469468957452), (C0181, 0.517597...
3       C0004  [(C0070, 0.4988876515698588), (C0132, 0.384307...
4       C0005  [(C0096, 0.6482037235521645), (C0055, 0.514495...
5       C0006  [(C0058, 0.6488856845230502), (C0040, 0.580381...
6       C0007  [(C0020, 0.5883484054145521), (C0079, 0.496138...
7       C0008  [(C0144, 0.39223227027636803), (C0088, 0.33968...
8       C0009  [(C0140, 0.560112033611204), (C0162, 0.5132649...
9       C0010  [(C0033, 0.48666426339228763), (C0077, 0.42479...
10      C0011  [(C0135, 0.4841820261350419), (C0120, 0.356820...
11      C0012  [(C0101, 0.4437601569801833), (C0066, 0.396150...
12      C0013  [(C0058, 0.5345224838248488), (C0040, 0.478091...
13      C0014  [(C0128, 0.89442719099991