Data Preprocessing

In [None]:
#Importing required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw input tables (customers, products, transactions)
# from CSV files in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('./Customers.csv', './Products.csv', './Transactions.csv')
)

# Collapse the transaction log to one row per CustomerID:
#   TotalValue -> sum of TotalValue
#   ProductID  -> number of distinct ProductID values
#   Quantity   -> sum of Quantity
customer_transactions = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        ProductID=('ProductID', 'nunique'),
        Quantity=('Quantity', 'sum'),
    )
    .reset_index()
)

# Left join keeps every row of `customers`; a customer with no matching
# transactions gets NaN in the three aggregate columns.
customer_data = pd.merge(customers, customer_transactions, on='CustomerID', how='left')

# Replace Region with indicator columns (drop_first=True omits the first
# category, which is then represented by all indicators being zero), and
# append SignupYear extracted from the parsed SignupDate.
customer_data = (
    pd.get_dummies(customer_data, columns=['Region'], drop_first=True)
    .assign(SignupYear=lambda frame: pd.to_datetime(frame['SignupDate']).dt.year)
)

In [None]:
# Count missing values in each of the numeric feature columns
feature_columns = ['TotalValue', 'ProductID', 'Quantity', 'SignupYear']
missing_per_feature = customer_data[feature_columns].isna().sum()
print(missing_per_feature)


TotalValue    1
ProductID     1
Quantity      1
SignupYear    0
dtype: int64


In [None]:
# Keep only the customers that have all three transaction aggregates;
# rows where any of them is NaN are removed.
transaction_columns = ['TotalValue', 'ProductID', 'Quantity']
customer_data = customer_data.dropna(subset=transaction_columns)

In [None]:
# Confirm that no column has missing values left
remaining_nulls = customer_data.isna().sum()
print(remaining_nulls)

CustomerID              0
CustomerName            0
SignupDate              0
TotalValue              0
ProductID               0
Quantity                0
Region_Europe           0
Region_North America    0
Region_South America    0
SignupYear              0
dtype: int64


Feature Scaling

In [None]:
# Standardise the numeric features so they are on a comparable scale
numeric_columns = ['TotalValue', 'ProductID', 'Quantity', 'SignupYear']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_data[numeric_columns])

# Append the region indicator columns, unscaled, to the right of the
# standardised numeric block to form the final feature matrix.
region_columns = ['Region_Europe', 'Region_North America', 'Region_South America']
region_indicators = customer_data[region_columns].to_numpy()
final_features = np.concatenate((scaled_features, region_indicators), axis=1)

Calculate Similarity

In [None]:
# Pairwise cosine similarity between all rows of the feature matrix
similarity_matrix = cosine_similarity(final_features)

# Label both axes with CustomerID so a score can be looked up by customer pair
customer_ids = customer_data['CustomerID']
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

Top 3 Lookalike Customers

In [None]:
# Build one row per target customer holding its closest lookalikes and their
# similarity scores, then write the table to Lookalike.csv.
N_TARGET_CUSTOMERS = 20  # lookalikes are reported for the first 20 rows of customer_data
TOP_K = 3                # number of lookalikes kept per customer

lookalike_dict = {}

for cust_id in customer_data['CustomerID'].iloc[:N_TARGET_CUSTOMERS]:
    # Remove the customer's own entry by label instead of assuming it sorts
    # first. A customer with an identical feature vector (a tie at the top), or
    # floating-point rounding of the self-similarity, can move the customer's
    # own row away from position 0; skipping position 0 would then drop a real
    # match and report the customer as its own lookalike.
    similar_customers = similarity_df[cust_id].drop(cust_id).nlargest(TOP_K)

    # Store lookalikes and their similarity scores as separate columns:
    # Lookalike_CustomerID_<rank>, Similarity_Score_<rank> for rank 1..TOP_K.
    row = {}
    for rank, (other_id, score) in enumerate(similar_customers.items(), start=1):
        row[f'Lookalike_CustomerID_{rank}'] = other_id
        row[f'Similarity_Score_{rank}'] = score
    lookalike_dict[cust_id] = row

# Convert the lookalike dictionary into a DataFrame (one row per target customer)
lookalike_df = pd.DataFrame.from_dict(lookalike_dict, orient='index')

# Save the results to a CSV file; the row labels are written as the CustomerID column
lookalike_df.to_csv('Lookalike.csv', index_label='CustomerID')

lookalike_df.head()

Unnamed: 0,Lookalike_CustomerID_1,Similarity_Score_1,Lookalike_CustomerID_2,Similarity_Score_2,Lookalike_CustomerID_3,Similarity_Score_3
C0001,C0174,0.988258,C0011,0.987004,C0152,0.981079
C0002,C0027,0.958111,C0159,0.954309,C0005,0.943368
C0003,C0190,0.951502,C0031,0.905475,C0076,0.903023
C0004,C0113,0.988197,C0102,0.970619,C0104,0.970492
C0005,C0159,0.997267,C0007,0.988055,C0002,0.943368
