In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the pre-merged dataset (path is relative to the notebook's working directory).
merged = pd.read_csv('merged_data.csv')

In [4]:
# Preview the first rows (DataFrame.head defaults to five) as a quick check of the load.
merged.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [5]:
# Column dtypes and non-null counts. TransactionDate and SignupDate load as
# object (string) columns, so they need pd.to_datetime before any date arithmetic.
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
merged.describe()

Unnamed: 0,Quantity,TotalValue,Price_x,Price_y
count,1000.0,1000.0,1000.0,1000.0
mean,2.537,689.99556,272.55407,272.55407
std,1.117981,493.144478,140.73639,140.73639
min,1.0,16.08,16.08,16.08
25%,2.0,295.295,147.95,147.95
50%,3.0,588.88,299.93,299.93
75%,4.0,1011.66,404.4,404.4
max,4.0,1991.04,497.76,497.76


In [7]:
# Lookalike model: build one feature vector per customer, score every pair of
# customers with cosine similarity, and save the closest matches for the first
# N_TARGET_CUSTOMERS customer IDs.
#
# Note: this cell deliberately has no blanket `try/except Exception`. Printing
# only the message hid the traceback and let later cells run against missing or
# stale variables; any failure now surfaces as a normal cell error.

N_TARGET_CUSTOMERS = 20  # number of customers (in sorted ID order) that get recommendations
N_LOOKALIKES = 3         # lookalikes reported per target customer


def _top_lookalikes(scores, self_pos, ids, top_n):
    """Return the `top_n` most similar customers to `ids[self_pos]`, excluding itself.

    Parameters
    ----------
    scores : 1-D numpy array
        Similarity of the target customer to every customer, aligned with `ids`.
    self_pos : int
        Position of the target customer inside `ids` / `scores`.
    ids : list
        Customer IDs in the same order as `scores`.
    top_n : int
        Maximum number of lookalikes to return (fewer if not enough customers).

    Returns
    -------
    list of (customer_id, score) tuples, highest score first, scores rounded
    to three decimals.
    """
    # Drop the target by position rather than assuming it sorts last: another
    # customer with an identical feature vector (or float rounding) can tie or
    # exceed the self-similarity, which would otherwise list the customer as
    # its own lookalike.
    candidates = np.delete(np.arange(len(ids)), self_pos)
    # Stable sort on negated scores: descending similarity, ties kept in
    # original order so the output is deterministic.
    ranked = candidates[np.argsort(-scores[candidates], kind="stable")][:top_n]
    return [(ids[pos], float(round(scores[pos], 3))) for pos in ranked]


# --- Transaction aggregates per customer -------------------------------------
customer_profile = merged.groupby("CustomerID").agg({
    "TotalValue": ["sum", "mean"],
    "Quantity": ["sum", "mean"],
    "ProductID": "nunique",
    "TransactionID": "count",
}).reset_index()

# agg() yields two-level column labels; replace them with flat names in the
# same order as the aggregation spec above.
customer_profile.columns = [
    "CustomerID",
    "Total_Spend",
    "Avg_Transaction_Value",
    "Total_Quantity",
    "Avg_Quantity",
    "Unique_Products",
    "Transaction_Count",
]

# --- Demographics: region and tenure -----------------------------------------
customer_demographics = merged.groupby("CustomerID").agg({
    "Region": "first",
    "SignupDate": "first",
}).reset_index()

# Tenure in days, measured against a fixed snapshot date so re-running the
# notebook on a later day reproduces the same features.
customer_demographics["SignupDate"] = pd.to_datetime(customer_demographics["SignupDate"])
reference_date = pd.to_datetime("2025-01-27")
customer_demographics["Days_on_Platform"] = (
    reference_date - customer_demographics["SignupDate"]
).dt.days

customer_profile = customer_profile.merge(
    customer_demographics[["CustomerID", "Region", "Days_on_Platform"]],
    on="CustomerID",
)

# --- Category mix --------------------------------------------------------------
# Share of each customer's purchased quantity per category (each row sums to 1);
# categories a customer never bought become 0.
category_pivot = pd.crosstab(
    merged["CustomerID"],
    merged["Category"],
    values=merged["Quantity"],
    aggfunc="sum",
    normalize="index",
).fillna(0)

customer_profile = customer_profile.merge(
    category_pivot,
    left_on="CustomerID",
    right_index=True,
)

# --- Feature matrix ------------------------------------------------------------
# One-hot encode Region; dtype=float keeps the matrix purely numeric regardless
# of the pandas version's default dummy dtype.
customer_profile_encoded = pd.get_dummies(
    customer_profile,
    columns=["Region"],
    dtype=float,
)

features = customer_profile_encoded.drop("CustomerID", axis=1)

# Rescale every feature to [0, 1] so large-magnitude columns (e.g. Total_Spend)
# do not dominate the cosine similarity.
scaler = MinMaxScaler()
features_normalized = scaler.fit_transform(features)

similarity_matrix = cosine_similarity(features_normalized)

# --- Lookalike recommendations ---------------------------------------------------
customer_ids = customer_profile["CustomerID"].tolist()
# Row position of each customer in similarity_matrix (avoids a list scan per target).
position_of = {cust_id: pos for pos, cust_id in enumerate(customer_ids)}

target_customers = sorted(customer_ids)[:N_TARGET_CUSTOMERS]

lookalike_map = {
    cust_id: _top_lookalikes(
        similarity_matrix[position_of[cust_id]],
        position_of[cust_id],
        customer_ids,
        N_LOOKALIKES,
    )
    for cust_id in target_customers
}

lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map.keys()),
    "Lookalikes": list(lookalike_map.values()),
})

# The Lookalikes column holds lists of (id, score) tuples; to_csv writes them
# as their string representation.
lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike recommendations successfully saved to Lookalike.csv")

print("\nFirst few recommendations:")
print(lookalike_df.head())

# Verification summary
print("\nFeatures used in similarity calculation:")
print(features.columns.tolist())
print("\nNumber of features:", features.shape[1])
print("Number of customers processed:", len(target_customers))

Lookalike recommendations successfully saved to Lookalike.csv

First few recommendations:
  CustomerID                                        Lookalikes
0      C0001  [(C0192, 0.967), (C0152, 0.963), (C0184, 0.962)]
1      C0002  [(C0159, 0.986), (C0134, 0.979), (C0106, 0.964)]
2      C0003  [(C0031, 0.974), (C0129, 0.973), (C0195, 0.963)]
3      C0004  [(C0113, 0.992), (C0104, 0.978), (C0102, 0.977)]
4      C0005  [(C0007, 0.994), (C0140, 0.951), (C0159, 0.909)]

Features used in similarity calculation:
['Total_Spend', 'Avg_Transaction_Value', 'Total_Quantity', 'Avg_Quantity', 'Unique_Products', 'Transaction_Count', 'Days_on_Platform', 'Books', 'Clothing', 'Electronics', 'Home Decor', 'Region_Asia', 'Region_Europe', 'Region_North America', 'Region_South America']

Number of features: 15
Number of customers processed: 20
