In [3]:
#TASK 2: Lookalike Model
#importing libraries like pandas, numpy, sklearn and date time for data manipulation, numerical calculations, machine learning and handline date-time operations respectively.

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

In [4]:
#Loading the dataset into variables

customers=pd.read_csv('../Customers.csv')
products=pd.read_csv('../Products.csv')
transactions=pd.read_csv('../Transactions.csv')

In [5]:
#Merging the dataset
#Merging transactions with customers based on CustomerID
merged_data = pd.merge(transactions, customers, on="CustomerID")

#Merging merged_data with products based on ProductID
merged_data = pd.merge(merged_data, products, on="ProductID", suffixes=('_transaction','_product'))

print(merged_data)

    TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0          T00001      C0199      P067  2024-08-25 12:38:23         1   
1          T00112      C0146      P067  2024-05-27 22:23:54         1   
2          T00166      C0127      P067  2024-04-25 07:38:55         1   
3          T00272      C0087      P067  2024-03-26 22:55:37         2   
4          T00363      C0070      P067  2024-03-21 15:10:10         3   
..            ...        ...       ...                  ...       ...   
995        T00496      C0118      P037  2024-10-24 08:30:27         1   
996        T00759      C0059      P037  2024-06-04 02:15:24         3   
997        T00922      C0018      P037  2024-04-05 13:05:32         4   
998        T00959      C0115      P037  2024-09-29 10:16:02         2   
999        T00992      C0024      P037  2024-04-21 10:52:24         1   

     TotalValue  Price_transaction          CustomerName         Region  \
0        300.68             300.68        Andrea

In [6]:
#Applying feature engineering which helps in transforming raw data into features that better represent underlying problems to ML models. This steps results in giving more accurate results.

#Creating customer level features
customer_features = merged_data.groupby("CustomerID").agg(
    total_spending=("TotalValue", "sum"),  # It will give total spending per customer
    num_transactions=("TransactionID", "nunique"),  # It will give total Number of transactions
    avg_transaction_value=("TotalValue", "mean"),  # It will give average transaction value
    preferred_category=("Category", lambda x: x.mode()[0]),  #It will give  Most purchased category
    days_since_signup=("SignupDate", lambda x: (datetime.now() - pd.to_datetime(x)).dt.days.mean())  #It will give number of Days since signup
).reset_index()


#Using One-Hot encoding for the preferred category
customer_features = pd.get_dummies(customer_features, columns=["preferred_category"], drop_first=True)

  days_since_signup=("SignupDate", lambda x: (datetime.now() - pd.to_datetime(x)).dt.days.mean())  #It will give number of Days since signup
  days_since_signup=("SignupDate", lambda x: (datetime.now() - pd.to_datetime(x)).dt.days.mean())  #It will give number of Days since signup
  days_since_signup=("SignupDate", lambda x: (datetime.now() - pd.to_datetime(x)).dt.days.mean())  #It will give number of Days since signup
  days_since_signup=("SignupDate", lambda x: (datetime.now() - pd.to_datetime(x)).dt.days.mean())  #It will give number of Days since signup
  days_since_signup=("SignupDate", lambda x: (datetime.now() - pd.to_datetime(x)).dt.days.mean())  #It will give number of Days since signup
  days_since_signup=("SignupDate", lambda x: (datetime.now() - pd.to_datetime(x)).dt.days.mean())  #It will give number of Days since signup
  days_since_signup=("SignupDate", lambda x: (datetime.now() - pd.to_datetime(x)).dt.days.mean())  #It will give number of Days since signup
  days_since_

In [7]:
#Normalizing Features using MinMaxScaler. MinMaxScaler is a normalization technique that scales and transforms each feature to a given range (by default, between 0 and 1).
scaler = MinMaxScaler()
feature_columns = [col for col in customer_features.columns if col != "CustomerID"]
customer_features_scaled = scaler.fit_transform(customer_features[feature_columns])

print(customer_features_scaled)

[[0.30894178 0.4        0.47433644 ... 0.         1.         0.        ]
 [0.16809501 0.3        0.30894039 ... 1.         0.         0.        ]
 [0.24954138 0.3        0.48275135 ... 0.         0.         1.        ]
 ...
 [0.08020292 0.1        0.30912576 ... 1.         0.         0.        ]
 [0.17909816 0.3        0.33242172 ... 0.         1.         0.        ]
 [0.44150834 0.4        0.7006598  ... 1.         0.         0.        ]]


In [8]:
#computing similarity scores using cosine similarity between customers based on their scaled features and organizing the results into similarity matrix. Cosine similarity computes how similar two sets of data are regardless of their magnitude.
similarity_matrix = cosine_similarity(customer_features_scaled)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"]
)

In [9]:
#Finding top 3 similar customers for C0001-C0020 as mentioned in the task.
target_customers = [f"C{i:04d}" for i in range(1, 21)] 
lookalike_results = {}

for customer_id in target_customers:
    if customer_id in similarity_df.index:
        similarities = similarity_df[customer_id]
        similarities = similarities.drop(customer_id) 
        top_3_similar = similarities.nlargest(3).reset_index().values.tolist()
        lookalike_results[customer_id] = top_3_similar
    else:
        # Handle missing customers (e.g., no transactions)
        lookalike_results[customer_id] = [["None", 0.0]] * 3 

In [10]:
# Step 7: Saving the results in a Aaditya_Vijayvargiya_Lookalike.csv
lookalike_df = pd.DataFrame({
    "CustomerID": lookalike_results.keys(),
    "SimilarCustomers": lookalike_results.values()
})
lookalike_df.to_csv("Aaditya_Vijayvargiya_Lookalike.csv", index=False)

print("Aaditya_Vijayvargiya_Lookalike.csv has been generated successfully!")


Aaditya_Vijayvargiya_Lookalike.csv has been generated successfully!
