# Task 2: Lookalike Model


In [457]:
#Importing Libraries
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [458]:
#Load Data Sets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

###### Data Preprocessing

In [460]:
#merge all the datasets into single dataframe
customer_transaction=pd.merge(transactions, customers, on='CustomerID', how='left')
full_data=pd.merge(customer_transaction, products, on='ProductID', how='left')

In [461]:
full_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [462]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


In [463]:
full_data.isnull().sum()

TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price_x            0
CustomerName       0
Region             0
SignupDate         0
ProductName        0
Category           0
Price_y            0
dtype: int64

###### Type conversions

In [465]:
full_data['TransactionDate']=pd.to_datetime(full_data['TransactionDate'])
full_data['SignupDate']=pd.to_datetime(full_data['SignupDate'])
full_data['Region']=full_data['Region'].astype('category')
full_data['Category']=full_data['Category'].astype('category')
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   TransactionID    1000 non-null   object        
 1   CustomerID       1000 non-null   object        
 2   ProductID        1000 non-null   object        
 3   TransactionDate  1000 non-null   datetime64[ns]
 4   Quantity         1000 non-null   int64         
 5   TotalValue       1000 non-null   float64       
 6   Price_x          1000 non-null   float64       
 7   CustomerName     1000 non-null   object        
 8   Region           1000 non-null   category      
 9   SignupDate       1000 non-null   datetime64[ns]
 10  ProductName      1000 non-null   object        
 11  Category         1000 non-null   category      
 12  Price_y          1000 non-null   float64       
dtypes: category(2), datetime64[ns](2), float64(3), int64(1), object(5)
memory usage: 88.4+ KB


###### Feature Engineering

In [467]:
# Pivot the data to create a customer-product matrix
customer_product_matrix = full_data.pivot_table(index="CustomerID", columns="ProductID", values="Quantity", fill_value=0)


In [468]:
# Extract demographic features (e.g., SignupYear and Region) and encode categorical features
customers['SignupYear'] = pd.to_datetime(customers['SignupDate']).dt.year
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'], errors='coerce')

demographics = customers.set_index("CustomerID")[["SignupYear", "Region"]]
demographics["Region"] = demographics["Region"].astype("category").cat.codes

# Combine the customer-product matrix with demographic features
combined_data = customer_product_matrix.join(demographics, how="inner")

In [469]:
# Normalize the combined data
scaler = StandardScaler()
normalized_data = scaler.fit_transform(combined_data)


###### Model Selection

In [471]:
# Fit the KNN model
knn = NearestNeighbors(n_neighbors=20, metric="cosine")  # 4 because we exclude the input customer from results
knn.fit(normalized_data)

In [472]:
# Find the top 3 lookalikes for the first 20 customers
lookalikes = {}
for customer_id in customers["CustomerID"][:20]:
    if customer_id in combined_data.index:
        distances, indices = knn.kneighbors([normalized_data[combined_data.index.get_loc(customer_id)]])
        similar_customers = combined_data.index[indices.flatten()[1:]].tolist()  # Exclude the customer itself
        similarity_scores = [round(1 - dist, 2) for dist in distances.flatten()[1:4]]  # Convert distances to similarity
        lookalikes[customer_id] = list(zip(similar_customers, similarity_scores))

In [473]:
# Convert the lookalikes dictionary to a DataFrame
lookalikes_df = pd.DataFrame({"CustomerID": list(lookalikes.keys()), "Lookalikes": list(lookalikes.values())})

# Save the lookalikes to a CSV file
lookalikes_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model created and Lookalike.csv file generated.")
lookalikes_df.head(20)





Lookalike model created and Lookalike.csv file generated.


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0194, 0.38), (C0104, 0.37), (C0199, 0.34)]"
1,C0002,"[(C0091, 0.41), (C0030, 0.38), (C0173, 0.32)]"
2,C0003,"[(C0181, 0.48), (C0134, 0.43), (C0144, 0.42)]"
3,C0004,"[(C0070, 0.38), (C0175, 0.3), (C0025, 0.26)]"
4,C0005,"[(C0023, 0.48), (C0096, 0.46), (C0119, 0.36)]"
5,C0006,"[(C0058, 0.4), (C0040, 0.38), (C0196, 0.34)]"
6,C0007,"[(C0079, 0.61), (C0118, 0.45), (C0020, 0.4)]"
7,C0008,"[(C0144, 0.31), (C0028, 0.28), (C0165, 0.26)]"
8,C0009,"[(C0140, 0.54), (C0162, 0.48), (C0072, 0.44)]"
9,C0010,"[(C0094, 0.48), (C0143, 0.37), (C0092, 0.34)]"


In [474]:
import pandas as pd

# Define the function to process the lookalikes
def process_lookalikes(row):
    lookalike_list = row  # Already in list format
    customer_ids = [lookalike[0] for lookalike in lookalike_list]
    similarity_scores = [lookalike[1] for lookalike in lookalike_list]
    return pd.Series(customer_ids + similarity_scores)

# Assuming the 'lookalikes_df' has been created already
lookalikes_df[['Lookalike1_ID', 'Lookalike2_ID', 'Lookalike3_ID', 'Lookalike1_Score', 
               'Lookalike2_Score', 'Lookalike3_Score']] = lookalikes_df['Lookalikes'].apply(process_lookalikes)

# Drop the original 'Lookalikes' column
lookalikes_df.drop(columns=['Lookalikes'], inplace=True)

# Save the updated dataframe to CSV
lookalikes_df.to_csv('Updated_Lookalikes.csv', index=False)

print("Updated Lookalikes CSV file saved successfully!")
lookalikes_df.head(20)

Updated Lookalikes CSV file saved successfully!


Unnamed: 0,CustomerID,Lookalike1_ID,Lookalike2_ID,Lookalike3_ID,Lookalike1_Score,Lookalike2_Score,Lookalike3_Score
0,C0001,C0194,C0104,C0199,0.38,0.37,0.34
1,C0002,C0091,C0030,C0173,0.41,0.38,0.32
2,C0003,C0181,C0134,C0144,0.48,0.43,0.42
3,C0004,C0070,C0175,C0025,0.38,0.3,0.26
4,C0005,C0023,C0096,C0119,0.48,0.46,0.36
5,C0006,C0058,C0040,C0196,0.4,0.38,0.34
6,C0007,C0079,C0118,C0020,0.61,0.45,0.4
7,C0008,C0144,C0028,C0165,0.31,0.28,0.26
8,C0009,C0140,C0162,C0072,0.54,0.48,0.44
9,C0010,C0094,C0143,C0092,0.48,0.37,0.34
