In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Read the three raw tables (customers, transactions, products) in that order.
customers_df, transactions_df, products_df = map(
    pd.read_csv,
    (
        '/content/Customers.csv',
        '/content/Transactions.csv',
        '/content/Products.csv',
    ),
)

In [25]:
# Preview the first five rows of the customers table.
customers_df.head(5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [26]:
# Preview the first five rows of the products table.
products_df.head(5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [27]:
# Preview the first five rows of the transactions table.
transactions_df.head(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [28]:
# Convert the timestamp strings to datetimes; errors='coerce' turns any
# unparseable value into NaT instead of raising.
parsed_dates = pd.to_datetime(transactions_df['TransactionDate'], errors='coerce')
transactions_df['TransactionDate'] = parsed_dates

In [29]:
# Enrich every transaction with its customer attributes, then its product
# attributes. Left joins keep all transactions even when a key has no match.
# Both the transactions and products tables carry a `Price` column, so the
# result contains `Price_x` (transactions) and `Price_y` (products).
merged_df = (
    transactions_df
    .merge(customers_df, how='left', on='CustomerID')
    .merge(products_df, how='left', on='ProductID')
)
merged_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [30]:
# Per-customer spend profile: total spend, number of transactions, and the
# mean value of a transaction.
spend_aggregations = {
    'total_spend': pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    'total_transactions': pd.NamedAgg(column='TransactionID', aggfunc='count'),
    'avg_purchase_value': pd.NamedAgg(column='TotalValue', aggfunc='mean'),
}
customer_features = (
    merged_df
    .groupby('CustomerID')
    .agg(**spend_aggregations)
    .reset_index()
)

In [31]:
# Attach each customer's region by looking the CustomerID up in the
# customers table (IDs absent from that table map to NaN).
region_map = dict(zip(customers_df['CustomerID'], customers_df['Region']))
customer_ids = customer_features['CustomerID']
customer_features['Region'] = customer_ids.map(region_map)

In [32]:
# Turn the region label into an integer code (one code per distinct region;
# missing values get -1).
# NOTE(review): integer codes impose an arbitrary ordering on a nominal field,
# and this column is later scaled and used in the similarity computation —
# consider one-hot encoding instead; confirm before changing results.
region_categories = pd.Categorical(customer_features['Region'])
customer_features['RegionEncoded'] = region_categories.codes

In [33]:
# Optional category-based features: total quantity bought per product
# category, one row per customer and one column per category (0 when the
# customer bought nothing in that category).
category_pivot = merged_df.pivot_table(
    values='Quantity',
    index='CustomerID',
    columns='Category',
    aggfunc='sum',
    fill_value=0,
)
# Prefix the category names so they are recognisable as features after merging.
prefixed_names = [f"cat_{c}" for c in category_pivot.columns]
category_pivot.columns = prefixed_names

In [34]:
# Add the per-category quantity columns to the customer feature table,
# matching the CustomerID column against the pivot's index.
customer_features = customer_features.merge(
    category_pivot,
    how='left',
    left_on='CustomerID',
    right_index=True,
)

In [35]:
# Replace any remaining missing values with 0. This applies to every column,
# so a missing Region label would also become 0.
customer_features = customer_features.fillna(0)

In [36]:
# Preview the first five rows of the assembled feature table.
customer_features.head(5)

Unnamed: 0,CustomerID,total_spend,total_transactions,avg_purchase_value,Region,RegionEncoded,cat_Books,cat_Clothing,cat_Electronics,cat_Home Decor
0,C0001,3354.52,5,670.904,South America,3,2,0,7,3
1,C0002,1862.74,4,465.685,Asia,0,0,4,0,6
2,C0003,2725.38,4,681.345,South America,3,0,4,4,6
3,C0004,5354.88,8,669.36,South America,3,8,0,6,9
4,C0005,2034.24,3,678.08,Asia,0,0,0,4,3


### Similarity Calculation

In [37]:
# Identifier and raw label columns are excluded from the similarity features;
# every other column of customer_features is used.
non_numeric_cols = ['CustomerID', 'Region']
feature_cols = list(
    filter(lambda name: name not in non_numeric_cols, customer_features.columns)
)

# Standardise the selected features before computing similarities.
scaler = StandardScaler()
feature_frame = customer_features.loc[:, feature_cols]
X = scaler.fit_transform(feature_frame)

In [38]:
# Pairwise cosine similarity between every pair of scaled customer vectors.
similarity_matrix = cosine_similarity(X, X)

# Label both axes with CustomerID so scores can be looked up by customer.
customer_id_labels = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_id_labels,
    columns=customer_id_labels,
)

In [39]:
# ==========================================
# 3. Lookalike Retrieval Function
# ==========================================
def get_top_3_lookalikes(target_customer_id, top_n=3, similarity=None):
    """Return the customers most similar to ``target_customer_id``.

    Parameters
    ----------
    target_customer_id : hashable
        Label of the customer to look up in the similarity matrix.
    top_n : int, default 3
        Maximum number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Similarity matrix labelled by customer ID on both axes. When omitted,
        the notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` similarity scores indexed by customer ID, sorted from
        most to least similar. The target customer itself is excluded.

    Raises
    ------
    KeyError
        If ``target_customer_id`` is not a row label of the similarity matrix.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if similarity is None:
        # Fall back to the matrix built earlier in the notebook.
        similarity = similarity_df
    if target_customer_id not in similarity.index:
        raise KeyError(
            f"Customer {target_customer_id!r} not found in similarity matrix"
        )

    # Row of scores for the target, with the self-similarity entry removed so
    # a customer is never reported as their own lookalike.
    sim_scores = similarity.loc[target_customer_id].drop(
        target_customer_id, errors='ignore'
    )
    # Highest similarity first, truncated to the requested count.
    return sim_scores.sort_values(ascending=False).head(top_n)


In [40]:
# ==========================================
# 4. Generate Lookalike Results for C0001 - C0020
# ==========================================
# Customer IDs C0001 .. C0020, in order.
target_ids = [f"C{str(i).zfill(4)}" for i in range(1, 21)]

lookalike_dict = {}
for target_id in target_ids:
    # IDs missing from the similarity matrix get an empty list of lookalikes.
    if target_id not in similarity_df.index:
        lookalike_dict[target_id] = []
        continue
    matches = get_top_3_lookalikes(target_id)
    # Store as [(cust_id, score), (cust_id, score), (cust_id, score)].
    lookalike_dict[target_id] = list(zip(matches.index, matches.values))

In [41]:
# Show the lookalikes of the first five customers as a sanity check.
preview_items = list(lookalike_dict.items())[:5]
for customer_id, matches in preview_items:
    print("Customer:", customer_id, " -> Lookalikes:", matches)

Customer: C0001  -> Lookalikes: [('C0120', 0.8375700742048803), ('C0091', 0.7528618878928868), ('C0181', 0.7452080176710963)]
Customer: C0002  -> Lookalikes: [('C0178', 0.9102838885020159), ('C0159', 0.9019630398713462), ('C0164', 0.8584002421286306)]
Customer: C0003  -> Lookalikes: [('C0031', 0.820022130943904), ('C0133', 0.8155976927845391), ('C0195', 0.8039389808015782)]
Customer: C0004  -> Lookalikes: [('C0012', 0.9223281934185819), ('C0113', 0.9056976474686702), ('C0065', 0.9045802947871378)]
Customer: C0005  -> Lookalikes: [('C0007', 0.9332052988356209), ('C0197', 0.8868433909134634), ('C0140', 0.8856054904877151)]


In [42]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one row per
# (customer, lookalike) pair; customers with no lookalikes contribute no rows.
rows = [
    (cust_id, lookalike_cust_id, score)
    for cust_id, lookalikes in lookalike_dict.items()
    for lookalike_cust_id, score in lookalikes
]

output_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(rows, columns=output_columns)
# Write the result without the positional index column.
lookalike_df.to_csv('Amisha_Kumari_Lookalike.csv', index=False)