# IMPORTING TO BE USEFUL LIBRARIES

In [1]:
# importing useful libraries at once
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# importing all provided files at once
customers = pd.read_csv(r"C:\Users\Acer\OneDrive\Desktop\ZEOTAP ASSIGNMENT\Customers (1).csv")
products = pd.read_csv(r"C:\Users\Acer\OneDrive\Desktop\ZEOTAP ASSIGNMENT\Products (1).csv")
transactions = pd.read_csv(r"C:\Users\Acer\OneDrive\Desktop\ZEOTAP ASSIGNMENT\Transactions.csv")

In [3]:
# merging two files transactions and customers
customers_transactions = pd.merge(customers, transactions, on='CustomerID', how='inner')

In [4]:
# merging preveosly merged files with remaining file
full_data = pd.merge(customers_transactions, products, on='ProductID', how='inner')

In [5]:
# renaming all merged data for easier coding
df = full_data

In [6]:
# removing any signs in column names
customers.columns = customers.columns.str.lower().str.replace(' ', '_')
products.columns = products.columns.str.lower().str.replace(' ', '_')
transactions.columns = transactions.columns.str.lower().str.replace(' ', '_')

In [7]:
# dropping price_y
df = df.drop(columns=['Price_y'])

In [8]:
# renaming Price_x
df = df.rename(columns={'Price_x': 'Price'})

In [9]:
# changing to date time format
df['SignupDate'] = pd.to_datetime(df['SignupDate'])
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   CustomerID       1000 non-null   object        
 1   CustomerName     1000 non-null   object        
 2   Region           1000 non-null   object        
 3   SignupDate       1000 non-null   datetime64[ns]
 4   TransactionID    1000 non-null   object        
 5   ProductID        1000 non-null   object        
 6   TransactionDate  1000 non-null   datetime64[ns]
 7   Quantity         1000 non-null   int64         
 8   TotalValue       1000 non-null   float64       
 9   Price            1000 non-null   float64       
 10  ProductName      1000 non-null   object        
 11  Category         1000 non-null   object        
dtypes: datetime64[ns](2), float64(2), int64(1), object(7)
memory usage: 101.6+ KB


In [11]:
#changing data types 
df['Price'] = df['Price'].astype(int)
df['Quantity'] = df['Quantity'].astype(float)
df['TotalValue'] = df['TotalValue'].astype(float)

In [12]:
df.head(25)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price,ProductName,Category
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2.0,114.6,57,SoundWave Cookbook,Books
1,C0019,Brandon Rodriguez,Europe,2023-01-12,T00423,P054,2024-11-08 10:22:51,3.0,171.9,57,SoundWave Cookbook,Books
2,C0038,Jeffrey Perkins,North America,2022-04-16,T00395,P054,2024-03-14 08:10:08,3.0,171.9,57,SoundWave Cookbook,Books
3,C0039,Angela Harris,South America,2024-10-13,T00053,P054,2024-09-30 14:42:16,3.0,171.9,57,SoundWave Cookbook,Books
4,C0047,Samantha Frank,North America,2024-03-22,T00784,P054,2024-08-23 09:06:06,3.0,171.9,57,SoundWave Cookbook,Books
5,C0073,Heidi Johnson,Europe,2022-08-20,T00634,P054,2024-03-20 02:37:13,4.0,229.2,57,SoundWave Cookbook,Books
6,C0083,Christina Stark,South America,2022-04-07,T00581,P054,2024-11-13 04:56:27,2.0,114.6,57,SoundWave Cookbook,Books
7,C0086,Stephanie Peterson,Europe,2022-09-18,T00873,P054,2024-08-02 11:13:00,4.0,229.2,57,SoundWave Cookbook,Books
8,C0124,Lindsay Perez,Europe,2024-08-26,T00203,P054,2024-03-31 16:50:02,4.0,229.2,57,SoundWave Cookbook,Books
9,C0143,Brian Parker,Asia,2024-05-27,T00262,P054,2024-01-30 22:40:57,1.0,57.3,57,SoundWave Cookbook,Books


# CREATING THE LOOKALIKE

In [13]:
# Aggregating transaction data by customer
customer_features = df.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Price': 'mean'
}).reset_index()
customer_features = pd.merge(customer_features, df, on='CustomerID', how='inner')

In [14]:
# Encoding categorical features (e.g., Region)
customer_features_encoded = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)


In [15]:
# Assuming df is your DataFrame
scaler = MinMaxScaler()

# Selecting numerical columns to normalize
numerical_columns = ['TotalValue', 'Quantity', 'Price']

# Normalizing the columns
df_normalized = df.copy()
df_normalized[numerical_columns] = scaler.fit_transform(df[numerical_columns])


In [16]:
# Computing similarity matrix for normalized numerical features
similarity_matrix = cosine_similarity(df_normalized[numerical_columns])

# Converting similarity matrix to a DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=df['CustomerID'], columns=df['CustomerID'])


In [17]:
# Filtering the first 20 customers
first_20_customers = df['CustomerID'].iloc[0:20]
lookalike_map = {}


In [18]:
# Getting Top 3 Lookalikes for Each Customer
def get_top_lookalikes(customer_id, similarity_df, top_n=20):
    scores = similarity_df.loc[customer_id]
    top_similar = scores.sort_values(ascending=False).iloc[1:top_n+1]
    return list(zip(top_similar.index, top_similar.values))


In [19]:
# Generating top 3 lookalikes for first 20 customers
lookalikes = {}
for idx, customer_id in enumerate(customer_features['CustomerID'][:20]):
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_3 = [customer_features['CustomerID'][i] for i, score in similarity_scores[1:4]]  # Exclude the customer itself
    top_3_scores = [score for i, score in similarity_scores[1:4]]
    lookalikes[customer_id] = list(zip(top_3, top_3_scores))

In [20]:
# Creating Lookalike.csv
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': lookalike_list}
    for cust_id, lookalike_list in lookalikes.items()
])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike Model generated successfully.")

Lookalike Model generated successfully.


In [21]:
lookalike_df.head(4)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0001, 0.9999999999999999), (C0001, 0.999999..."
1,C0002,"[(C0002, 0.9999999999999999), (C0002, 0.999999..."
2,C0003,"[(C0002, 1.0), (C0003, 1.0), (C0003, 1.0)]"
3,C0004,"[(C0004, 1.0000000000000002), (C0004, 1.000000..."


In [22]:
print(lookalike_df)

  CustomerID                                         Lookalikes
0      C0001  [(C0001, 0.9999999999999999), (C0001, 0.999999...
1      C0002  [(C0002, 0.9999999999999999), (C0002, 0.999999...
2      C0003         [(C0002, 1.0), (C0003, 1.0), (C0003, 1.0)]
3      C0004  [(C0004, 1.0000000000000002), (C0004, 1.000000...
