In [1]:
# Mount Google Drive at /content/drive so the CSV files read in later cells
# (under /content/drive/MyDrive) are accessible from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the three source tables from the mounted Drive folder, in the order
# customers -> products -> transactions, and bind each to its own name.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Customers.csv',
        '/content/drive/MyDrive/Products.csv',
        '/content/drive/MyDrive/Transactions.csv',
    )
)

In [4]:
# Show the first five rows of every table, each under its own heading.
for heading, frame in (
    ("Customers Dataset:", customers),
    ("Products Dataset:", products),
    ("Transactions Dataset:", transactions),
):
    print(heading)
    display(frame.head())

Customers Dataset:


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


Products Dataset:


Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


Transactions Dataset:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


# **Lookalike Model**

Data Preparation

In [5]:
# --- Spending summary per customer ------------------------------------------
# Inner join: only transactions whose CustomerID also appears in `customers`
# are kept, so customers without any transaction get no profile row.
customer_transactions = pd.merge(transactions, customers, on='CustomerID', how='inner')

spend_summary = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        total_spent=('TotalValue', 'sum'),
        total_transactions=('TransactionID', 'count'),
        avg_transaction_value=('TotalValue', 'mean'),
        region=('Region', 'first'),  # first Region seen; assumes one region per customer
    )
    .reset_index()
)

# --- Most purchased category per customer -----------------------------------
# Attach product attributes to every transaction, total the quantity bought per
# (customer, category), then keep the highest-quantity row for each customer:
# sorting Quantity descending within CustomerID makes that row come first, and
# drop_duplicates keeps the first occurrence.
category_data = pd.merge(transactions, products, on='ProductID', how='left')
top_category = (
    category_data
    .groupby(['CustomerID', 'Category'])['Quantity']
    .sum()
    .reset_index()
    .sort_values(['CustomerID', 'Quantity'], ascending=[True, False])
    .drop_duplicates('CustomerID')
)

# Left join so every summarised customer is retained even without a category.
customer_profile = spend_summary.merge(
    top_category[['CustomerID', 'Category']],
    on='CustomerID',
    how='left',
)

print("\nCustomer Profile:")
display(customer_profile.head())



Customer Profile:


Unnamed: 0,CustomerID,total_spent,total_transactions,avg_transaction_value,region,Category
0,C0001,3354.52,5,670.904,South America,Electronics
1,C0002,1862.74,4,465.685,Asia,Home Decor
2,C0003,2725.38,4,681.345,South America,Home Decor
3,C0004,5354.88,8,669.36,South America,Home Decor
4,C0005,2034.24,3,678.08,Asia,Electronics


Feature Engineering

In [6]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns of `customer_profile` feeding each preprocessing step.
categorical_cols = ['region', 'Category']
numeric_cols = ['total_spent', 'total_transactions', 'avg_transaction_value']

# One-hot encode the categorical columns; the encoder output is converted to a
# dense array so it can be stacked next to the scaled numeric block.
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(customer_profile[categorical_cols]).toarray()

# Standardise the numeric columns with StandardScaler.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profile[numeric_cols])

# Final design matrix: numeric columns first, then the one-hot columns.
features = np.hstack([scaled_features, encoded_features])
print("Feature Matrix Shape:", features.shape)


Feature Matrix Shape: (199, 11)


Calculate Similarities

In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Tunables for the recommendation step.
N_TARGET_CUSTOMERS = 20  # recommendations are produced for the first N profile rows
TOP_K = 3                # number of look-alikes reported per customer

# Pairwise cosine similarity between every pair of customer feature vectors.
similarity_matrix = cosine_similarity(features)

# Map each target customer to its TOP_K most similar *other* customers.
customer_ids = customer_profile['CustomerID'].values
lookalike_dict = {}

for i, cust_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Rank all rows by descending similarity. A stable sort keeps equal scores
    # in ascending row order, matching the previous sorted()-based ranking.
    ranked = np.argsort(-similarity_matrix[i], kind='stable')

    # Exclude the customer by row index rather than by assuming it sorts first:
    # another customer with an identical (or collinear) feature vector can tie
    # at ~1.0 and sort ahead of the self-match, which would otherwise leak the
    # customer into its own recommendations and drop a genuine neighbour.
    neighbours = [j for j in ranked if j != i][:TOP_K]

    # Store plain Python floats so the printed / exported values stay
    # "0.968" instead of "np.float64(0.968)" on NumPy >= 2.
    lookalike_dict[cust_id] = [
        (customer_ids[j], round(float(similarity_matrix[i, j]), 3))
        for j in neighbours
    ]

print(f"Lookalike Recommendations for First {N_TARGET_CUSTOMERS} Customers:")
for k, v in lookalike_dict.items():
    print(f"Customer {k}: {v}")


Lookalike Recommendations for First 20 Customers:
Customer C0001: [('C0190', 0.968), ('C0091', 0.88), ('C0192', 0.815)]
Customer C0002: [('C0128', 0.941), ('C0097', 0.899), ('C0178', 0.863)]
Customer C0003: [('C0133', 0.993), ('C0052', 0.985), ('C0152', 0.926)]
Customer C0004: [('C0113', 0.983), ('C0108', 0.981), ('C0012', 0.971)]
Customer C0005: [('C0007', 0.905), ('C0140', 0.87), ('C0130', 0.75)]
Customer C0006: [('C0171', 0.951), ('C0187', 0.945), ('C0168', 0.82)]
Customer C0007: [('C0140', 0.976), ('C0005', 0.905), ('C0080', 0.759)]
Customer C0008: [('C0109', 0.972), ('C0098', 0.937), ('C0156', 0.819)]
Customer C0009: [('C0010', 0.976), ('C0198', 0.952), ('C0062', 0.931)]
Customer C0010: [('C0009', 0.976), ('C0111', 0.971), ('C0062', 0.947)]
Customer C0011: [('C0048', 0.997), ('C0137', 0.961), ('C0169', 0.92)]
Customer C0012: [('C0108', 0.988), ('C0004', 0.971), ('C0104', 0.966)]
Customer C0013: [('C0099', 0.986), ('C0165', 0.939), ('C0141', 0.839)]
Customer C0014: [('C0060', 0.976

 Save Results

In [8]:
# Save the look-alike recommendations to a CSV file with one row per customer:
# column 1 is the CustomerID, column 2 is the list of (CustomerID, score) pairs.
import csv

# Single source of truth for the output location, so the confirmation message
# below always names the file that was actually written.
OUTPUT_PATH = 'Subramanya_Naik_Lookalike.csv'

with open(OUTPUT_PATH, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'Recommendations'])

    for cust_id, recommendations in lookalike_dict.items():
        # csv.writer stringifies the list with str(); coerce the entries to
        # plain str/float first so NumPy scalar reprs such as
        # "np.float64(0.968)" never end up in the exported file.
        plain_recommendations = [
            (str(other_id), float(score)) for other_id, score in recommendations
        ]
        writer.writerow([cust_id, plain_recommendations])

print(f"\nLookalike recommendations saved to {OUTPUT_PATH}")



Lookalike recommendations saved to Lookalike.csv
