In [27]:
#importing the neccessary libraries required

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import os

In [10]:
# #Loading the customer , products and trasaction  data into panda dataframe 
# customers_df = pd.read_csv("../Customers.csv")
# products_df = pd.read_csv("../Products.csv")
# transactions_df = pd.read_csv("../Transactions.csv")

In [11]:
#showing the first 5 result of each dataset
print("Customers Data:")
display(customers_df.head())

print("Products Data:")
display(products_df.head())

print("Transactions Data:")
display(transactions_df.head())

Customers Data:


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


Products Data:


Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


Transactions Data:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [12]:
#changing the data fomrat to more usable format 

customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

In [13]:
customer_transaction_history = transactions_df.groupby('CustomerID').agg(
    transaction_count=('TransactionID', 'count'),
    total_spend=('TotalValue', 'sum'),
    avg_order_value=('TotalValue', 'mean'),
    avg_quantity=('Quantity', 'mean')
).reset_index()

# Merge customer demographics with transaction history
customer_profile = customers_df.merge(customer_transaction_history, on='CustomerID', how='left')

# Display the customer profile
print("Customer Profile:")
display(customer_profile.head())

Customer Profile:


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,transaction_count,total_spend,avg_order_value,avg_quantity
0,C0001,Lawrence Carroll,South America,2022-07-10,5.0,3354.52,670.904,2.4
1,C0002,Elizabeth Lutz,Asia,2022-02-13,4.0,1862.74,465.685,2.5
2,C0003,Michael Rivera,South America,2024-03-07,4.0,2725.38,681.345,3.5
3,C0004,Kathleen Rodriguez,South America,2022-10-09,8.0,5354.88,669.36,2.875
4,C0005,Laura Weber,Asia,2022-08-15,3.0,2034.24,678.08,2.333333


In [15]:
product_categories = products_df[['ProductID', 'Category']].drop_duplicates()
product_categories = product_categories.merge(transactions_df, on='ProductID')
category_purchase_count = product_categories.groupby(['CustomerID', 'Category']).size().unstack(fill_value=0)

# Display category purchase counts
print("Category Purchase Counts:")
display(category_purchase_count.head())

Category Purchase Counts:


Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,1,0,3,1
C0002,0,2,0,2
C0003,0,1,1,2
C0004,3,0,2,3
C0005,0,0,2,1


In [17]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(category_purchase_count)
print("Standardized Category Purchase Data:")
display(pd.DataFrame(scaled_data, columns=category_purchase_count.columns).head())

# Step 6: Similarity Calculation (Cosine Similarity)

# Calculate cosine similarity between customers based on their category purchase data
similarity_matrix = cosine_similarity(scaled_data)

Standardized Category Purchase Data:


Category,Books,Clothing,Electronics,Home Decor
0,-0.321113,-1.041606,1.550878,-0.221044
1,-1.221132,0.776636,-1.148463,0.676665
2,-1.221132,-0.132485,-0.248683,0.676665
3,1.478927,-1.041606,0.651097,1.574374
4,-1.221132,-1.041606,0.651097,-0.221044


In [19]:
print("Cosine Similarity Matrix (First 5 Customers):")
similarity_df = pd.DataFrame(similarity_matrix[:5, :5], columns=category_purchase_count.index[:5], index=category_purchase_count.index[:5])
display(similarity_df)

# Step 7: Generate Lookalike Recommendations

# Map top 3 most similar customers for each of the first 20 customers
top_3_customers = {}

Cosine Similarity Matrix (First 5 Customers):


CustomerID,C0001,C0002,C0003,C0004,C0005
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C0001,1.0,-0.625189,-0.001888,0.268185,0.760936
C0002,-0.625189,1.0,0.760737,-0.469891,-0.062616
C0003,-0.001888,0.760737,1.0,-0.216034,0.529853
C0004,0.268185,-0.469891,-0.216034,1.0,-0.148678
C0005,0.760936,-0.062616,0.529853,-0.148678,1.0


In [20]:
for idx, customer_id in enumerate(customer_profile['CustomerID'].head(20)):
    # Get the similarity scores for the current customer (excluding itself)
    similar_customers = similarity_matrix[idx].argsort()[-4:-1]  # Top 3 excluding the customer itself
    similar_scores = similarity_matrix[idx][similar_customers]
    
    # Store the top 3 similar customers and their similarity scores
    top_3_customers[customer_id] = [(customer_profile['CustomerID'].iloc[i], score) for i, score in zip(similar_customers, similar_scores)]

In [21]:
print("Top 3 Similar Customers for the First 5 Customers:")
for customer_id, recommendations in top_3_customers.items():
    print(f"Customer {customer_id}:")
    for recommended_customer, score in recommendations:
        print(f"  - Similar Customer {recommended_customer} with similarity score: {score:.4f}")

Top 3 Similar Customers for the First 5 Customers:
Customer C0001:
  - Similar Customer C0035 with similarity score: 0.9135
  - Similar Customer C0146 with similarity score: 0.9135
  - Similar Customer C0069 with similarity score: 0.9501
Customer C0002:
  - Similar Customer C0103 with similarity score: 0.8941
  - Similar Customer C0134 with similarity score: 0.9412
  - Similar Customer C0002 with similarity score: 1.0000
Customer C0003:
  - Similar Customer C0003 with similarity score: 1.0000
  - Similar Customer C0031 with similarity score: 1.0000
  - Similar Customer C0158 with similarity score: 1.0000
Customer C0004:
  - Similar Customer C0090 with similarity score: 0.8995
  - Similar Customer C0193 with similarity score: 0.8995
  - Similar Customer C0047 with similarity score: 0.9329
Customer C0005:
  - Similar Customer C0120 with similarity score: 0.8793
  - Similar Customer C0007 with similarity score: 1.0000
  - Similar Customer C0005 with similarity score: 1.0000
Customer C0006

In [28]:
# Step 8: Save the results to Lookalike.csv (with CustomerID, Lookalike CustomerID, and Similarity Score)

# Prepare the data for saving into CSV
lookalike_data = []

for customer_id, recommendations in top_3_customers.items():
    for recommended_customer, score in recommendations:
        lookalike_data.append([customer_id, recommended_customer, score])

# Convert to a DataFrame
lookalike_df = pd.DataFrame(lookalike_data, columns=["CustomerID", "LookalikeCustomerID", "SimilarityScore"])

# Save the results to 'Lookalike.csv'
lookalike_df.to_csv('Lookalike.csv', index=False)

# Inform the user that the CSV has been saved
print("Lookalike recommendations saved to 'Lookalike.csv' with CustomerID, LookalikeCustomerID, and SimilarityScore.")

print("Current working directory:", os.getcwd())


Lookalike recommendations saved to 'Lookalike.csv' with CustomerID, LookalikeCustomerID, and SimilarityScore.
Current working directory: C:\Users\Abhin\OneDrive\Desktop\Workspace
