In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the prepared dataset from final.csv (path is relative to the notebook's
# working directory) into a DataFrame.
df=pd.read_csv("final.csv")

In [None]:
# Preview the first rows to check the available columns and sample values.
df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,ProductName,Category,Price,CustomerName,Region,SignupDate,TotalValue,tnMonth,tWeek,tYear,VariantID
0,T00001,C0199,P067,2024-08-25,1,comfortliving bluetooth speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03,300.68,8,6,2024,1
1,T00166,C0127,P067,2024-04-25,1,comfortliving bluetooth speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04,300.68,4,3,2024,1
2,T00363,C0070,P067,2024-03-21,3,comfortliving bluetooth speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15,902.04,3,3,2024,1
3,T00442,C0188,P067,2024-12-26,1,comfortliving bluetooth speaker,Electronics,300.68,Anna Ball,South America,2022-05-17,300.68,12,3,2024,1
4,T00490,C0195,P067,2024-11-24,3,comfortliving bluetooth speaker,Electronics,300.68,Jeremy Mclaughlin,South America,2024-09-17,902.04,11,6,2024,1


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(799, 16)

In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
TransactionID,0
CustomerID,0
ProductID,0
TransactionDate,0
Quantity,0
ProductName,0
Category,0
Price,0
CustomerName,0
Region,0


In [None]:
# Count rows that are exact duplicates (in every column) of an earlier row.
df.duplicated().sum()

0

In [None]:
# Summarize column dtypes, non-null counts and memory usage.
# NOTE: TransactionDate and SignupDate are reported as object dtype in the
# output below, i.e. they are plain strings; convert them with pd.to_datetime
# before doing any date arithmetic.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799 entries, 0 to 798
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    799 non-null    object 
 1   CustomerID       799 non-null    object 
 2   ProductID        799 non-null    object 
 3   TransactionDate  799 non-null    object 
 4   Quantity         799 non-null    int64  
 5   ProductName      799 non-null    object 
 6   Category         799 non-null    object 
 7   Price            799 non-null    float64
 8   CustomerName     799 non-null    object 
 9   Region           799 non-null    object 
 10  SignupDate       799 non-null    object 
 11  TotalValue       799 non-null    float64
 12  tnMonth          799 non-null    int64  
 13  tWeek            799 non-null    int64  
 14  tYear            799 non-null    int64  
 15  VariantID        799 non-null    int64  
dtypes: float64(2), int64(5), object(9)
memory usage: 100.0+ KB


In [None]:
# Collapse the transaction rows into one feature row per customer:
#   TotalValue - summed over the customer's rows
#   Category   - the distinct categories, joined into one ', '-separated string
#   Quantity   - summed over the customer's rows
#   Price      - mean over the customer's rows
customer_features = (
    df.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),
        Category=("Category", lambda categories: ", ".join(categories.unique())),
        Quantity=("Quantity", "sum"),
        Price=("Price", "mean"),
    )
    .reset_index()
)

In [None]:
# Multi-hot encoding of the 'Category' column: each value is a ', '-separated
# string of categories, so splitting on that separator produces one 0/1
# indicator column per category (a customer can have several indicators set).
category_dummies = customer_features['Category'].str.get_dummies(sep=', ')

In [None]:

# Standardize the numeric features (StandardScaler: zero mean, unit variance
# per column) and write the scaled values back into customer_features.
numeric_cols = ['TotalValue', 'Quantity', 'Price']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numeric_cols])
customer_features[numeric_cols] = scaled_values

In [None]:
# Assemble the final feature table: the scaled numeric columns followed by the
# category indicator columns. The raw 'Category' text column is dropped because
# the indicator columns already represent it.
# NOTE: this cell rebinds customer_features, so re-running it without first
# re-running the cells above fails ('Category' has already been dropped).
customer_features = pd.concat(
    [customer_features.drop(columns=['Category']), category_dummies],
    axis='columns',
)

In [None]:
# Pairwise cosine similarity between customers. The identifier column is
# removed first so that only feature columns enter the computation; row/column
# k of the result corresponds to row k of customer_features.
feature_matrix = customer_features.drop(columns='CustomerID')
similarity_matrix = cosine_similarity(feature_matrix)

In [None]:
# Map every customer to their TOP_N most similar *other* customers, stored as a
# list of (customer_id, similarity_score) tuples ordered from most to least
# similar.
TOP_N = 3
customer_ids = customer_features['CustomerID'].to_numpy()
# A customer cannot be their own lookalike, so at most len - 1 neighbours exist.
n_neighbours = max(min(TOP_N, len(customer_ids) - 1), 0)

lookalike_map = {}
for row, customer in enumerate(customer_ids):
    # Work on a copy: similarity_matrix[row] is a view, so writing the sentinel
    # into it directly would overwrite the diagonal of similarity_matrix itself.
    similarity_scores = similarity_matrix[row].copy()
    # Exclude the customer from their own ranking. Cosine similarity can
    # legitimately reach -1, so -inf is used as the sentinel instead.
    similarity_scores[row] = -np.inf
    # Indices in descending order of similarity, truncated to the neighbours
    # that actually exist (slicing after the reversal also handles n == 0).
    top_indices = similarity_scores.argsort()[::-1][:n_neighbours]

    lookalike_map[customer] = list(
        zip(customer_ids[top_indices], similarity_scores[top_indices])
    )

In [None]:
# Flatten lookalike_map into one record per (customer, lookalike) pair so it
# can be turned into a long-format table.
lookalike_data = [
    {'CustomerID': customer, 'LookalikeCustomerID': lookalike, 'SimilarityScore': score}
    for customer, lookalikes in lookalike_map.items()
    for lookalike, score in lookalikes
]

In [None]:
# Build the long-format lookalike table and write it to Lookalike.csv.
# The columns are listed explicitly so the frame's schema (and the CSV header)
# stays the same even when lookalike_data is empty.
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)
lookalike_df.to_csv("Lookalike.csv", index=False)

In [None]:
# Show the lookalikes for the first 20 customers, i.e. the first 20 rows of
# customer_features (row order, not a ranking by any metric).
top_20_customers = customer_features['CustomerID'].head(20)
is_top_20 = lookalike_df['CustomerID'].isin(top_20_customers)
print(lookalike_df.loc[is_top_20])

   CustomerID LookalikeCustomerID  SimilarityScore
0       C0001               C0152         0.980050
1       C0001               C0064         0.970109
2       C0001               C0174         0.925625
3       C0002               C0062         0.981976
4       C0002               C0159         0.942390
5       C0002               C0025         0.882559
6       C0003               C0199         0.973237
7       C0003               C0197         0.956384
8       C0003               C0069         0.951251
9       C0004               C0075         0.951575
10      C0004               C0175         0.946733
11      C0004               C0041         0.941113
12      C0005               C0031         0.994161
13      C0005               C0140         0.986380
14      C0005               C0199         0.921235
15      C0006               C0079         0.988241
16      C0006               C0139         0.968505
17      C0006               C0196         0.924248
18      C0007               C00