In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Load the three raw input tables from the working directory.
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

In [10]:
# Enrich every transaction with its customer attributes, then its product attributes.
# Both joins use the pandas default (inner) join; columns present on both sides
# receive the default _x/_y suffixes (e.g. Price_x / Price_y).
merged_data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID")
)

In [13]:
# Preview the first five merged rows to sanity-check both joins.
merged_data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [12]:
def _most_frequent(values):
    """Return the first entry of ``values.mode()`` (the most frequent value)."""
    return values.mode()[0]


# One row per customer: spend, activity, volume, region and favourite category.
customer_profiles = (
    merged_data
    .groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),            # total spend
        TransactionID=("TransactionID", "count"),    # number of transactions
        Quantity=("Quantity", "sum"),                # total quantity purchased
        Region=("Region", "first"),                  # region taken from the first row of the group
        Category=("Category", _most_frequent),       # most frequently purchased category
    )
    .reset_index()
)


In [15]:
# Replace each categorical column with one indicator column per level.
# NOTE: this rebinds customer_profiles, so re-running the cell on its own fails
# (Region/Category no longer exist); re-run the aggregation cell first.
categorical_columns = ["Region", "Category"]
customer_profiles = pd.get_dummies(data=customer_profiles, columns=categorical_columns)

In [16]:
# Min-max scale every feature column; CustomerID is an identifier, not a feature,
# so it is excluded before fitting.
feature_frame = customer_profiles.drop(columns="CustomerID")
scaler = MinMaxScaler()
customer_profiles_scaled = scaler.fit_transform(feature_frame)

In [17]:
# Pairwise cosine similarity between all rows of the scaled feature matrix
# (row i / column j compares customer i with customer j).
similarity_matrix = cosine_similarity(X=customer_profiles_scaled)

In [18]:
# Label both axes of the similarity matrix with CustomerID so scores can be
# looked up by customer rather than by position.
customer_ids = customer_profiles["CustomerID"]
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [19]:
# Function to get the top-N most similar customers (3 by default)
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label in ``similarity_df`` identifying the query customer.
    similarity_df : pandas.DataFrame
        Similarity scores; the column ``customer_id`` is read and its index
        labels are used as the candidate customer identifiers.
    top_n : int, default 3
        Maximum number of neighbours to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of tuple
        ``(other_customer_id, similarity_score)`` pairs ordered from most to
        least similar. Fewer than ``top_n`` pairs are returned when there are
        not enough other customers.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    scores = similarity_df[customer_id]

    # Remove the query customer by label instead of assuming it sorts first:
    # another customer with an identical profile also scores 1.0, and a
    # positional ``iloc[1:]`` could then keep the customer itself and drop the
    # real neighbour.
    candidates = scores.drop(labels=customer_id, errors="ignore")

    # A stable sort keeps tied scores in their original index order, so the
    # result is deterministic.
    ranked = candidates.sort_values(ascending=False, kind="mergesort")
    top_matches = ranked.iloc[:max(top_n, 0)]

    return list(zip(top_matches.index, top_matches.values))

In [20]:
# Build the lookalike map for the first 20 customers listed in customer_profiles:
# CustomerID -> list of (similar CustomerID, similarity score) pairs.
first_customer_ids = customer_profiles["CustomerID"].iloc[:20]
lookalike_results = {
    customer_id: get_top_similar_customers(customer_id, similarity_df)
    for customer_id in first_customer_ids
}

In [22]:
# Persist the lookalike map: one row per customer, with its neighbours serialised
# as a list of (CustomerID, score) tuples in the second column.
with open("Deepanshu_Jindal_Lookalike.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["CustomerID", "SimilarCustomers"])
    for customer_id, similar_customers in lookalike_results.items():
        # csv.writer stringifies non-string cells with str(); a list of NumPy
        # scalars is rendered as "np.float64(0.98)" under NumPy >= 2, which
        # pollutes the file. Convert scores to built-in floats so the cell
        # always reads like [('C0002', 0.98), ...].
        plain_pairs = [(other_id, float(score)) for other_id, score in similar_customers]
        writer.writerow([customer_id, plain_pairs])