In [2]:
!pip install --upgrade pandas




In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the three input tables from CSV files located in the working directory.
# The files are read in the order listed and bound to the names on the left.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [5]:
# Join the customer table onto every transaction; the left join keeps all
# transaction rows even when a CustomerID has no match.
merged_dataset = pd.merge(transactions, customers, on='CustomerID', how='left')

# Preview is printed here, i.e. before the product table has been joined.
print(merged_dataset.head())

# Join the product table on ProductID, again keeping every transaction row.
merged_dataset = merged_dataset.merge(products, on='ProductID', how='left')

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price     CustomerName         Region  SignupDate  
0      300.68  300.68   Andrea Jenkins         Europe  2022-12-03  
1      300.68  300.68  Brittany Harvey           Asia  2024-09-04  
2      300.68  300.68  Kathryn Stevens         Europe  2024-04-04  
3      601.36  300.68  Travis Campbell  South America  2024-04-11  
4      902.04  300.68    Timothy Perez         Europe  2022-03-15  


Aggregate data per customer

In [6]:
# Print the first 20 rows of the fully merged table as plain text for a visual check.
print(merged_dataset.head(20))

   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0         T00001      C0199      P067  2024-08-25 12:38:23         1   
1         T00112      C0146      P067  2024-05-27 22:23:54         1   
2         T00166      C0127      P067  2024-04-25 07:38:55         1   
3         T00272      C0087      P067  2024-03-26 22:55:37         2   
4         T00363      C0070      P067  2024-03-21 15:10:10         3   
5         T00442      C0188      P067  2024-12-26 14:40:03         1   
6         T00490      C0195      P067  2024-11-24 11:49:48         3   
7         T00536      C0008      P067  2024-09-22 06:13:59         1   
8         T00564      C0157      P067  2024-12-07 17:57:40         3   
9         T00631      C0130      P067  2024-05-14 23:14:59         2   
10        T00727      C0051      P067  2024-01-20 04:52:14         3   
11        T00729      C0075      P067  2024-10-07 06:38:36         2   
12        T00797      C0155      P067  2024-10-04 20:42:53      

In [7]:
# Print the per-row boolean mask of missing SignupDate values (True marks a missing value).
print(merged_dataset['SignupDate'].isna())


0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: SignupDate, Length: 1000, dtype: bool


In [28]:
# Build one profile row per customer from the transaction-level table:
#   total_spend        - sum of TotalValue
#   purchase_frequency - number of transactions
#   unique_products    - number of distinct ProductID values
# and then attach the customer's Region from the customers table.
customer_profile = (
    merged_dataset
    .groupby('CustomerID')
    .agg(
        total_spend=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        purchase_frequency=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        unique_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
        # Disabled feature (most frequent Category per customer):
        # category_preference=('Category', lambda x: x.value_counts().idxmax() if len(x) > 0 else None)
    )
    .reset_index()
    .merge(customers[['CustomerID', 'Region']], on='CustomerID', how='left')
)


In [29]:
# Customer tenure in days, measured from SignupDate to the moment this cell runs.
# The following cells read merged_dataset['Tenure'], so the column must be created
# here; with this code commented out they raise KeyError on a fresh kernel.
merged_dataset['SignupDate'] = pd.to_datetime(merged_dataset['SignupDate'])
current_data = pd.Timestamp.now()  # reference point; results shift with the run date
merged_dataset['Tenure'] = (current_data - merged_dataset['SignupDate']).dt.days

# Kept disabled on purpose: merged_dataset is transaction-level (a customer can appear
# on several rows), so merging it into customer_profile on CustomerID would repeat
# each customer once per transaction and break the one-row-per-customer profile.
# customer_profile = pd.merge(customer_profile, merged_dataset[['CustomerID', 'Tenure']], on='CustomerID', how='left')

In [30]:
# Number of rows whose Tenure is missing; 0 means every row has a value.
print(merged_dataset['Tenure'].isna().sum())

0


In [31]:
# Print the first five Tenure values as a quick sanity check.
print(merged_dataset['Tenure'].head())

0     786
1     145
2     298
3     291
4    1049
Name: Tenure, dtype: int64


Normalise Numeric Features

In [32]:
# Standardise the numeric profile columns with StandardScaler so that no single
# feature dominates the similarity computation because of its scale.
# The scaled values overwrite the original columns of customer_profile.
numeric_features = ['total_spend', 'purchase_frequency', 'unique_products']
scaler = StandardScaler()
scaler.fit(customer_profile[numeric_features])
customer_profile[numeric_features] = scaler.transform(customer_profile[numeric_features])

We will exclude 'CustomerID', 'Region' and 'Category' from the similarity calculation.

In [33]:
# Feature matrix for the similarity step: only the three scaled numeric columns.
features = customer_profile.loc[
    :, ['total_spend', 'purchase_frequency', 'unique_products']
]

# Pairwise cosine similarity between all customers; entry [i, j] compares
# row i and row j of `features`.
similarity_matrix = cosine_similarity(features)

Top 3 lookalike recommendations for the first 20 customers

In [34]:
# Build the lookalike table: for each of the first N_CUSTOMERS rows of
# customer_profile, keep the TOP_N other customers with the highest cosine similarity.
N_CUSTOMERS = 20  # number of customers (in customer_profile order) that get recommendations
TOP_N = 3         # lookalikes kept per customer

# Row i of similarity_matrix corresponds to row i of customer_profile, because the
# matrix was computed from that frame's feature columns. Work with positions
# throughout instead of mixing index labels and positions.
customer_ids = customer_profile['CustomerID'].tolist()

top_lookalikes = {}
for position, cust_id in enumerate(customer_ids[:N_CUSTOMERS]):
    similarity_scores = similarity_matrix[position]

    # Rank all customers from most to least similar and drop the customer itself
    # by position. This leaves similarity_matrix unmodified (the row is a view, so
    # overwriting the self-score would alter the matrix) and stays correct when
    # other similarities are negative, where a sentinel score of 0 would not.
    ranked_positions = [i for i in similarity_scores.argsort()[::-1] if i != position]

    # Store (CustomerID, score) pairs; float() keeps the text form of the score
    # independent of how the installed NumPy version prints its scalar types.
    top_lookalikes[cust_id] = [
        (customer_ids[i], float(similarity_scores[i]))
        for i in ranked_positions[:TOP_N]
    ]

# Save the results to a CSV file.
# One output row per (customer, lookalike) pair, built in a single DataFrame call
# rather than by concatenating onto a growing frame inside a loop.
lookalike_rows = [
    {'CustomerID': cust_id, 'Lookalikes': recommendation}
    for cust_id, recommendations in top_lookalikes.items()
    for recommendation in recommendations
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])

lookalike_df.to_csv('Lookalike.csv', index=False)
print(lookalike_df.head())


  CustomerID                   Lookalikes
0      C0001  (C0137, 0.9963323425457112)
1      C0001  (C0152, 0.9869046482018122)
2      C0001  (C0056, 0.9304265739853477)
3      C0002  (C0029, 0.9997577919968235)
4      C0002  (C0199, 0.9993469209744867)
