In [1]:
# Importing Libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three source tables (transactions, customers, products) from CSV
transactions, customers, products = (
    pd.read_csv(csv_path)
    for csv_path in ('Transactions.csv', 'Customers.csv', 'Products.csv')
)

In [3]:
# Preview the first five transaction rows
transactions.head(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [4]:
# Preview the first five customer rows
customers.head(5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [5]:
# Preview the first five product rows
products.head(5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [6]:
# Attach product details to every transaction, then attach customer details.
# Both transactions and products carry a `Price` column, so the merge
# suffixes them as `Price_x` (transactions) and `Price_y` (products).
transactions_with_products = pd.merge(transactions, products, on='ProductID')
merged_data = pd.merge(transactions_with_products, customers, on='CustomerID')

In [7]:
# Preview the first five rows of the merged table
merged_data.head(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


# Data Preprocessing

In [8]:
# Parse both date columns from strings into pandas datetimes
for date_column in ('TransactionDate', 'SignupDate'):
    merged_data[date_column] = pd.to_datetime(merged_data[date_column])

In [9]:
# Discard the product-table copy of the price; the transaction price is kept
merged_data = merged_data.drop('Price_y', axis=1)

In [10]:
# Restore the plain `Price` name for the remaining (transaction) price column
merged_data = merged_data.rename({'Price_x': 'Price'}, axis='columns')

In [11]:
# Reorder columns by name: identifiers, dates, customer info, product info,
# then quantities and amounts. Selecting by name (rather than by position)
# keeps this cell readable and makes it safe to re-run: a second execution
# yields the same order instead of permuting the already-permuted frame.
column_order = [
    'TransactionID', 'CustomerID', 'ProductID',
    'TransactionDate', 'SignupDate',
    'CustomerName', 'Region',
    'ProductName', 'Category',
    'Quantity', 'Price', 'TotalValue',
]
merged_data = merged_data[column_order]

In [12]:
# Check the column dtypes after preprocessing (date columns should now be datetime64)
merged_data.dtypes

TransactionID              object
CustomerID                 object
ProductID                  object
TransactionDate    datetime64[ns]
SignupDate         datetime64[ns]
CustomerName               object
Region                     object
ProductName                object
Category                   object
Quantity                    int64
Price                     float64
TotalValue                float64
dtype: object

In [13]:
# Feature extraction: one row of transaction-level aggregates per customer
#   total_spent     - sum of TotalValue over the customer's transactions
#   avg_quantity    - mean Quantity per transaction
#   unique_products - number of distinct ProductID values purchased
customer_aggregations = {
    'total_spent': ('TotalValue', 'sum'),
    'avg_quantity': ('Quantity', 'mean'),
    'unique_products': ('ProductID', 'nunique'),
}
customer_feature = (
    merged_data
    .groupby('CustomerID')
    .agg(**customer_aggregations)
    .reset_index()
)

In [14]:
# One-hot encode the product category of every transaction, then sum the
# indicator columns per customer to get purchase counts per category
category_flags = pd.get_dummies(
    merged_data[['CustomerID', 'Category']],
    columns=['Category'],
)
category_data = category_flags.groupby('CustomerID', as_index=False).sum()

In [15]:
# Preview the per-customer category counts
category_data.head(5)

Unnamed: 0,CustomerID,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,1,0,3,1
1,C0002,0,2,0,2
2,C0003,0,1,1,2
3,C0004,3,0,2,3
4,C0005,0,0,2,1


In [16]:
# Merging customer features with category data.
# This cell overwrites `customer_feature` with a merge of itself, so a plain
# merge is not safe to re-run: a second execution would add duplicated
# `Category_*_x` / `Category_*_y` columns and double-weight the category
# features. Dropping any category columns left over from an earlier run first
# makes the cell idempotent; on a fresh kernel the drop is a no-op.
category_columns = [col for col in category_data.columns if col != 'CustomerID']
customer_feature = (
    customer_feature
    .drop(columns=category_columns, errors='ignore')
    .merge(category_data, on='CustomerID')
)

In [17]:
# Preview the combined per-customer feature table
customer_feature.head(5)

Unnamed: 0,CustomerID,total_spent,avg_quantity,unique_products,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,2.4,5,1,0,3,1
1,C0002,1862.74,2.5,4,0,2,0,2
2,C0003,2725.38,3.5,4,0,1,1,2
3,C0004,5354.88,2.875,8,3,0,2,3
4,C0005,2034.24,2.333333,3,0,0,2,1


In [18]:
# Standardise every numeric feature; the CustomerID key is excluded because
# it is an identifier, not a feature
feature_matrix = customer_feature.drop(columns=['CustomerID'])
scaler = StandardScaler()
features_scaled = scaler.fit_transform(feature_matrix)

# Computing Cosine Similarity

In [19]:
# Pairwise cosine similarity between all customers' scaled feature vectors;
# row i / column j holds the similarity between customer i and customer j
similarity_metrics = cosine_similarity(X=features_scaled)

In [20]:
# Display the similarity matrix (NumPy abbreviates large arrays with "...")
similarity_metrics

array([[ 1.        , -0.54457456, -0.09090343, ...,  0.13763243,
         0.60487487, -0.42736947],
       [-0.54457456,  1.        ,  0.50566263, ...,  0.40535014,
         0.28046726,  0.11661954],
       [-0.09090343,  0.50566263,  1.        , ..., -0.1669998 ,
         0.31734929,  0.51068168],
       ...,
       [ 0.13763243,  0.40535014, -0.1669998 , ...,  1.        ,
         0.49522936, -0.50496471],
       [ 0.60487487,  0.28046726,  0.31734929, ...,  0.49522936,
         1.        , -0.54406583],
       [-0.42736947,  0.11661954,  0.51068168, ..., -0.50496471,
        -0.54406583,  1.        ]])

# Preparing Recommendations

In [21]:
# Customer IDs in the same row order as the feature / similarity matrices
customer_ids = list(customer_feature['CustomerID'])

In [22]:
# Maps each CustomerID to its list of (lookalike CustomerID, score) pairs
recommendation = dict()

In [23]:
# Number of lookalikes kept for each customer
TOP_N = 3

# For every customer, rank all customers by cosine similarity (highest first)
# and keep the TOP_N best matches other than the customer itself.
for idx, customerid in enumerate(customer_ids):
    similarity_scores = list(enumerate(similarity_metrics[idx]))
    similar_customers = sorted(similarity_scores, key=lambda pair: pair[1], reverse=True)
    # `i != idx` skips the customer's own row. Scores are cast to built-in
    # floats: the similarity matrix holds NumPy scalars, and under NumPy >= 2
    # their repr is "np.float64(...)", which would otherwise leak into any
    # str()/repr() of these lists (e.g. when they are written to CSV).
    top_lookalikes = [
        (customer_ids[i], float(score))
        for i, score in similar_customers
        if i != idx
    ][:TOP_N]
    recommendation[customerid] = top_lookalikes

In [24]:
# One record per customer; the lookalike list is stored as its string form
lookalike_data = [
    {"CustomerID": customer, "Lookalikes": str(lookalikes)}
    for customer, lookalikes in recommendation.items()
]

lookalikes_df = pd.DataFrame(lookalike_data)
# Write the lookalike table to Divyansh_Rajput_Lookalikes.csv (no index column)
lookalikes_df.to_csv('Divyansh_Rajput_Lookalikes.csv', index=False)