In [34]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


In [35]:
# Load the three input tables from the working directory, in the order
# customers -> products -> transactions.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Preview the first rows of each table to sanity-check the columns.
for frame in (customers, products, transactions):
    print(frame.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [36]:
# Collapse the transaction log to one row per customer:
#   TotalSpending   - sum of TotalValue
#   TotalQuantity   - sum of Quantity
#   NumTransactions - count of TransactionID entries
per_customer = transactions.groupby('CustomerID')
transaction_features = per_customer.agg(
    TotalSpending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    TotalQuantity=pd.NamedAgg(column='Quantity', aggfunc='sum'),
    NumTransactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
)
# Turn the CustomerID group key back into an ordinary column.
transaction_features = transaction_features.reset_index()

print(transaction_features.head())


  CustomerID  TotalSpending  TotalQuantity  NumTransactions
0      C0001        3354.52             12                5
1      C0002        1862.74             10                4
2      C0003        2725.38             14                4
3      C0004        5354.88             23                8
4      C0005        2034.24              7                3


In [37]:
# Replace the raw SignupDate column with a numeric tenure feature.
# `pop` removes SignupDate from `customers` in place and hands the column back,
# so the frame ends up with DaysSinceSignup in its place.
signup_dates = pd.to_datetime(customers.pop('SignupDate'))
# NOTE: measured against the current wall-clock time, so the printed values
# change from run to run.
elapsed = pd.Timestamp.now() - signup_dates
customers['DaysSinceSignup'] = elapsed.dt.days

print(customers.head())


  CustomerID        CustomerName         Region  DaysSinceSignup
0      C0001    Lawrence Carroll  South America              929
1      C0002      Elizabeth Lutz           Asia             1076
2      C0003      Michael Rivera  South America              323
3      C0004  Kathleen Rodriguez  South America              838
4      C0005         Laura Weber           Asia              893


In [38]:
# Left join keeps every customer, including those with no rows in
# transaction_features; their aggregate columns come back as NaN.
data = customers.merge(transaction_features, on='CustomerID', how='left')

# Treat "no transactions" as zero spending / quantity / count.
zero_defaults = {
    'TotalSpending': 0,
    'TotalQuantity': 0,
    'NumTransactions': 0,
}
data = data.fillna(zero_defaults)

print(data.head())


  CustomerID        CustomerName         Region  DaysSinceSignup  \
0      C0001    Lawrence Carroll  South America              929   
1      C0002      Elizabeth Lutz           Asia             1076   
2      C0003      Michael Rivera  South America              323   
3      C0004  Kathleen Rodriguez  South America              838   
4      C0005         Laura Weber           Asia              893   

   TotalSpending  TotalQuantity  NumTransactions  
0        3354.52           12.0              5.0  
1        1862.74           10.0              4.0  
2        2725.38           14.0              4.0  
3        5354.88           23.0              8.0  
4        2034.24            7.0              3.0  


In [39]:
# One-hot encode Region (first level dropped).
# NOTE(review): the resulting Region_* columns are not listed in `features`
# below, so they do not reach the scaled matrix - confirm whether that is intended.
data = pd.get_dummies(data, columns=['Region'], drop_first=True)

# Numeric columns that are rescaled to the [0, 1] range.
features = ['TotalSpending', 'TotalQuantity', 'NumTransactions', 'DaysSinceSignup']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[features])
scaled_features = pd.DataFrame(scaled_values, columns=features)

# Put CustomerID back in front of the scaled columns (aligned on the row index).
data_scaled = pd.concat([data['CustomerID'], scaled_features], axis=1)

print(data_scaled.head())


  CustomerID  TotalSpending  TotalQuantity  NumTransactions  DaysSinceSignup
0      C0001       0.314274        0.37500         0.454545         0.842204
1      C0002       0.174514        0.31250         0.363636         0.979458
2      C0003       0.255332        0.43750         0.363636         0.276377
3      C0004       0.501681        0.71875         0.727273         0.757236
4      C0005       0.190581        0.21875         0.272727         0.808590


In [40]:
# Pairwise cosine similarity between customers, computed on every column of
# data_scaled except the leading CustomerID column.
feature_matrix = data_scaled.iloc[:, 1:]
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so scores can be looked up by customer.
customer_ids = data['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.979079  0.876199  0.949281  0.977401  0.899147   
C0002       0.979079  1.000000  0.775564  0.872770  0.997354  0.793512   
C0003       0.876199  0.775564  1.000000  0.982234  0.759670  0.976013   
C0004       0.949281  0.872770  0.982234  1.000000  0.862698  0.979253   
C0005       0.977401  0.997354  0.759670  0.862698  1.000000  0.793198   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.979250  0.800828  0.969749  0.991004  ...  0.862612  0.991270   
C0002       0.994364  0.680713  0.979502  0.981261  ...  0.749103  0.995565   
C0003       0.769414  0.958622  0.766840  0.878047  ...  0.980063  0.810363   
C0004       0.869371  0.935458  0.870487  0.943591  ...  0.971459  0.902858   
C0005  

In [41]:
# Build the top-3 lookalike list for the first 20 customers.
# Each entry maps CustomerID -> [(lookalike_id, similarity_score), ...],
# ordered from most to least similar.
lookalike_map = {}
for customer_id in data['CustomerID'][:20]:
    # Remove the customer's own entry by label before ranking. Slicing [1:4]
    # after sorting assumed the self-similarity always sorts first, which is
    # not guaranteed: another customer can tie at 1.0, and an all-zero feature
    # vector has a self-similarity of 0.
    candidates = similarity_df[customer_id].drop(customer_id)
    similar_customers = candidates.sort_values(ascending=False).head(3)
    # Store plain Python floats so the tuples serialise as "('C0125', 0.99...)"
    # rather than "np.float64(...)" when written to CSV under NumPy 2.
    lookalike_map[customer_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])
print(lookalike_df)


                        Lookalike1                   Lookalike2  \
C0001  (C0125, 0.9996682100889586)  (C0063, 0.9982592304498485)   
C0002  (C0029, 0.9997068482550491)  (C0062, 0.9987776005665663)   
C0003   (C0075, 0.998731598604879)  (C0110, 0.9931645622444079)   
C0004  (C0175, 0.9991876508394796)  (C0165, 0.9984434913312458)   
C0005   (C0112, 0.999520405397267)  (C0159, 0.9995202179359585)   
C0006  (C0170, 0.9989884425797053)  (C0163, 0.9980175365428656)   
C0007  (C0080, 0.9994021824291331)  (C0074, 0.9992279323167407)   
C0008  (C0047, 0.9985100976458368)  (C0194, 0.9960128360274051)   
C0009  (C0103, 0.9951732890322363)  (C0014, 0.9900004782420874)   
C0010   (C0134, 0.998801336349185)  (C0160, 0.9970686861496437)   
C0011   (C0122, 0.999476653644304)  (C0107, 0.9994238776112156)   
C0012  (C0065, 0.9996872235330286)  (C0057, 0.9993561319202272)   
C0013   (C0059, 0.999906080335531)  (C0085, 0.9989533925327435)   
C0014  (C0025, 0.9968919556173951)  (C0199, 0.9966433250916736

In [42]:
# Persist the lookalike table; the frame's index is written out under the
# header "CustomerID".
output_path = 'Lookalike.csv'
lookalike_df.to_csv(output_path, index_label='CustomerID')
print("Lookalike.csv saved successfully!")

Lookalike.csv saved successfully!
