In [84]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [85]:
# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [86]:
# Convert date columns to datetime format
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'], errors='coerce')  
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'], errors='coerce') 

merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# Feature Engineering

In [87]:
# Aggregate customer-level features
customer_features = merged_data.groupby('CustomerID').agg(
    total_spending=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
    most_purchased_category=('Category', lambda x: x.mode()[0]),
    most_purchased_product=('ProductName', lambda x: x.mode()[0])
).reset_index()

In [88]:
# Add region and signup duration (days since signup)
customer_features = customer_features.merge(customers[['CustomerID', 'Region', 'SignupDate']], on="CustomerID")
customer_features['days_since_signup'] = (pd.Timestamp.now() - customer_features['SignupDate']).dt.days


# Prepare data for similarity calculation

In [89]:
# One-hot encode categorical variables (Region, Most Purchased Category, Most Purchased Product)
ohe = OneHotEncoder()
categorical_data = ohe.fit_transform(customer_features[['Region', 'most_purchased_category', 'most_purchased_product']]).toarray()

In [90]:
# Scale numerical variables (Total Spending, Num Transactions, etc.)
scaler = StandardScaler()
numerical_data = scaler.fit_transform(customer_features[['total_spending', 'num_transactions', 'avg_transaction_value', 'days_since_signup']])


In [91]:
# Combine all features into a single matrix
feature_matrix = np.hstack((numerical_data, categorical_data))


# Similarity Calculation

In [92]:
# Compute cosine similarity for all customers based on the feature matrix
similarity_matrix = cosine_similarity(feature_matrix)


In [93]:
# Find top 3 lookalikes for each customer
lookalike_map = {}
customer_ids = customer_features['CustomerID'].tolist()

for idx, customer_id in enumerate(customer_ids):
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_lookalikes = [customer_ids[i] for i, _ in similarity_scores[1:4]]  
    top_scores = [score for _, score in similarity_scores[1:4]]
    lookalike_map[customer_id] = list(zip(top_lookalikes, top_scores))


In [94]:
# Create Lookalike.csv
lookalike_df = pd.DataFrame([{
    'cust_id': cust_id,
    'lookalikes': lookalikes
} for cust_id, lookalikes in lookalike_map.items()])


In [95]:
# Convert the lookalike data to string format
lookalike_df['lookalikes'] = lookalike_df['lookalikes'].apply(lambda x: str(x))


In [96]:
# Save the lookalike recommendations to a CSV file
lookalike_df.to_csv('Devyansh_Malhotra_Lookalike.csv', index=False)


In [97]:
# Display the top 3 lookalikes for the first 20 customers (C0001 to C0020)
lookalike_subset = lookalike_df[lookalike_df['cust_id'].isin(customer_ids[:20])]
print(lookalike_subset)


   cust_id                                         lookalikes
0    C0001  [('C0184', np.float64(0.8761215291484783)), ('...
1    C0002  [('C0134', np.float64(0.8194219742600629)), ('...
2    C0003  [('C0076', np.float64(0.8773668040707575)), ('...
3    C0004  [('C0165', np.float64(0.8568306366012264)), ('...
4    C0005  [('C0007', np.float64(0.7682562421057125)), ('...
5    C0006  [('C0187', np.float64(0.9174461063161405)), ('...
6    C0007  [('C0140', np.float64(0.859996570815779)), ('C...
7    C0008  [('C0065', np.float64(0.697001918948516)), ('C...
8    C0009  [('C0010', np.float64(0.7861610722440849)), ('...
9    C0010  [('C0198', np.float64(0.7886331415364208)), ('...
10   C0011  [('C0153', np.float64(0.6903934653854227)), ('...
11   C0012  [('C0195', np.float64(0.7591833626184085)), ('...
12   C0013  [('C0087', np.float64(0.7324397704124403)), ('...
13   C0014  [('C0151', np.float64(0.850056310540527)), ('C...
14   C0015  [('C0036', np.float64(0.7743811202826798)), ('...
15   C00