# Task 2: Building a Lookalike Model

In [27]:
import pandas as pd
import csv
import shutil
from IPython.display import FileLink
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [3]:
# Feature Engineering
# Build a customer x product matrix of total units purchased.
# aggfunc='sum' is set explicitly: pivot_table aggregates with the mean by
# default, which would average (instead of add up) the quantities whenever a
# customer bought the same product in more than one transaction.
# Customer/product pairs with no purchase are filled with 0.
customer_product_matrix = pd.pivot_table(
    transactions,
    values='Quantity',
    index='CustomerID',
    columns='ProductID',
    aggfunc='sum',
    fill_value=0,
)


In [4]:
# Encode categorical features
# Fit the encoder on the Region labels, then store each customer's integer
# region code in a new RegionEncoded column.
customer_encoder = LabelEncoder().fit(customers['Region'])
customers['RegionEncoded'] = customer_encoder.transform(customers['Region'])

In [5]:
# Inspect the integer region codes assigned to each customer.
customers['RegionEncoded']

0      3
1      0
2      3
3      3
4      0
      ..
195    1
196    1
197    1
198    1
199    0
Name: RegionEncoded, Length: 200, dtype: int32

In [6]:
# Encode each product's Category label as an integer code.
# NOTE(review): CategoryEncoded is not referenced by any later cell shown in
# this notebook, so it currently does not feed into the customer profiles.
product_encoder = LabelEncoder()
products['CategoryEncoded'] = product_encoder.fit_transform(products['Category'])

In [8]:
# Attach each customer's per-product quantities to their profile row and key
# the resulting table by CustomerID.
# NOTE(review): the merge uses pandas' default join type, so customers with no
# row in customer_product_matrix are presumably dropped here; confirm.
customer_profiles = (
    customers
    .merge(customer_product_matrix, on='CustomerID')
    .set_index('CustomerID')
)
customer_profiles.head()

Unnamed: 0_level_0,CustomerName,Region,SignupDate,RegionEncoded,P001,P002,P003,P004,P005,P006,...,P091,P092,P093,P094,P095,P096,P097,P098,P099,P100
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,Lawrence Carroll,South America,2022-07-10,3,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
C0002,Elizabeth Lutz,Asia,2022-02-13,0,0,0,0,4,0,0,...,0,0,0,0,2,0,0,0,0,0
C0003,Michael Rivera,South America,2024-03-07,3,0,4,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
C0004,Kathleen Rodriguez,South America,2022-10-09,3,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,0,0
C0005,Laura Weber,Asia,2022-08-15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Keep only the numeric columns of the profile table (the region code and the
# per-product quantity columns); name, region text and signup date are excluded.
numerical_features = customer_profiles.select_dtypes(include='number')
numerical_features

Unnamed: 0_level_0,RegionEncoded,P001,P002,P003,P004,P005,P006,P007,P008,P009,...,P091,P092,P093,P094,P095,P096,P097,P098,P099,P100
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
C0002,0,0,0,0,4,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
C0003,3,0,4,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0004,3,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,3,0,0,0
C0005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0197,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0198,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0199,1,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Feature Scaling
# Fit the scaler on the numeric profile columns, transform them, and wrap the
# resulting array back into a DataFrame that keeps the original CustomerID
# index and column labels.
scaler = StandardScaler().fit(numerical_features)
scaled_features = scaler.transform(numerical_features)
numerical_features = pd.DataFrame(
    data=scaled_features,
    columns=numerical_features.columns,
    index=numerical_features.index,
)


In [15]:
# Inspect the raw array returned by the scaler.
scaled_features

array([[ 1.23740234, -0.19155441, -0.19831279, ..., -0.17354437,
        -0.12925978, -0.14746422],
       [-1.41989693, -0.19155441, -0.19831279, ..., -0.17354437,
        -0.12925978, -0.14746422],
       [ 1.23740234, -0.19155441,  6.11596653, ..., -0.17354437,
        -0.12925978, -0.14746422],
       ...,
       [-0.53413051, -0.19155441, -0.19831279, ..., -0.17354437,
        -0.12925978, -0.14746422],
       [-0.53413051, -0.19155441, -0.19831279, ..., -0.17354437,
        -0.12925978, -0.14746422],
       [-1.41989693, -0.19155441, -0.19831279, ..., -0.17354437,
        -0.12925978, -0.14746422]])

In [16]:
# Inspect the scaled features as a DataFrame indexed by CustomerID.
numerical_features

Unnamed: 0_level_0,RegionEncoded,P001,P002,P003,P004,P005,P006,P007,P008,P009,...,P091,P092,P093,P094,P095,P096,P097,P098,P099,P100
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.237402,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,2.779355,-0.200844,-0.173544,-0.12926,-0.147464
C0002,-1.419897,-0.191554,-0.198313,-0.234031,7.479386,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,3.552778,-0.259610,-0.200844,-0.173544,-0.12926,-0.147464
C0003,1.237402,-0.191554,6.115967,-0.234031,-0.173045,-0.201759,3.835813,-0.189473,-0.220871,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.259610,-0.200844,-0.173544,-0.12926,-0.147464
C0004,1.237402,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,4.405786,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.259610,4.595304,-0.173544,-0.12926,-0.147464
C0005,-1.419897,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.259610,-0.200844,-0.173544,-0.12926,-0.147464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.534131,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.259610,-0.200844,-0.173544,-0.12926,-0.147464
C0197,-0.534131,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.259610,-0.200844,-0.173544,-0.12926,-0.147464
C0198,-0.534131,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.259610,-0.200844,-0.173544,-0.12926,-0.147464
C0199,-0.534131,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,4.405786,-0.172791,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.259610,-0.200844,-0.173544,-0.12926,-0.147464


In [17]:
#Cosine Similarity
# Pairwise cosine similarity between all customer rows, wrapped in a DataFrame
# labelled by CustomerID on both axes so it can be looked up by customer.
similarity_matrix = cosine_similarity(numerical_features)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    columns=numerical_features.index,
    index=numerical_features.index,
)


In [18]:
#Lookalike Function
def find_lookalikes(customer_id, top_n=3, similarity=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : label
        Row/column label of the customer to look up in the similarity table.
    top_n : int, default 3
        Maximum number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Square customer-by-customer similarity table. When omitted, the
        module-level ``similarity_df`` is used.

    Returns
    -------
    list of (label, score) tuples
        Other customers ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a row label of the similarity table.
    """
    if similarity is None:
        similarity = similarity_df

    scores = similarity.loc[customer_id]
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: another customer with an identical score could otherwise take
    # the first slot, leaving the customer listed as its own lookalike.
    scores = scores.drop(labels=customer_id, errors='ignore')
    top_matches = scores.sort_values(ascending=False).head(top_n)
    return [(index, score) for index, score in top_matches.items()]

In [19]:
# Compute the lookalike list for the first 20 customers in the customers table,
# keyed by CustomerID.
lookalike_map = {}
for customer_id in customers['CustomerID'].iloc[:20]:
    lookalike_map.update({customer_id: find_lookalikes(customer_id)})

In [23]:
# Write one row per customer: the CustomerID and its list of
# (lookalike id, score) pairs, preceded by a header row.
with open('Lookalike.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['CustomerID', 'Lookalikes'])
    writer.writerows(
        [cust_id, matches] for cust_id, matches in lookalike_map.items()
    )

In [24]:
# Dump the full CustomerID -> [(lookalike id, score), ...] mapping for a quick check.
print(lookalike_map)


{'C0001': [('C0194', 0.4036419572576301), ('C0020', 0.3715836667078235), ('C0104', 0.3540113475205357)], 'C0002': [('C0091', 0.413460676620656), ('C0030', 0.39411064756416886), ('C0173', 0.3142274408430567)], 'C0003': [('C0181', 0.4826867670748109), ('C0134', 0.4472958478735025), ('C0144', 0.4098029908125773)], 'C0004': [('C0070', 0.3773202173901969), ('C0175', 0.29648487891261494), ('C0105', 0.2647192943679412)], 'C0005': [('C0023', 0.4696970738642301), ('C0096', 0.4556866813617802), ('C0055', 0.36341198848103484)], 'C0006': [('C0040', 0.41376927983237477), ('C0058', 0.38910209649126054), ('C0196', 0.36929366062739916)], 'C0007': [('C0079', 0.6061147560305713), ('C0020', 0.4402088092383971), ('C0118', 0.43519306448563627)], 'C0008': [('C0144', 0.3062437113540315), ('C0028', 0.2792056289411528), ('C0165', 0.26752517497068146)], 'C0009': [('C0140', 0.540574816715776), ('C0162', 0.4781376425463588), ('C0072', 0.4423041833766839)], 'C0010': [('C0094', 0.4834860591492268), ('C0143', 0.3788

In [25]:
# Spot-check the result for a single customer.
customer_id = 'C0001'
# .get returns None instead of raising if the customer is not in the map.
lookalikes = lookalike_map.get(customer_id)
print(f"Lookalikes for {customer_id}: {lookalikes}")

Lookalikes for C0001: [('C0194', 0.4036419572576301), ('C0020', 0.3715836667078235), ('C0104', 0.3540113475205357)]


In [28]:
# Lookalike.csv is written straight to the working directory by the CSV cell,
# so no move is needed (moving a file onto its own path is at best a no-op and
# can fail on some platforms). Just expose a download link to it.
FileLink('Lookalike.csv')