In [2]:
#loading datasets and merging them together
import pandas as pd

#load
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Merge - transactions -> products -> customers
combined = transactions.merge(products, on='ProductID').merge(customers, on='CustomerID')
print(combined.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

In [3]:
#preprocessing and feature engineering
customer_features = combined.groupby('CustomerID').agg({
    'TotalValue': 'sum', 
    'Quantity': 'sum',    
    'Category': lambda x: ','.join(x.unique()),  
}).reset_index()

print("Customer Features:")
print(customer_features.head())  

Customer Features:
  CustomerID  TotalValue  Quantity                         Category
0      C0001     3354.52        12     Books,Home Decor,Electronics
1      C0002     1862.74        10              Home Decor,Clothing
2      C0003     2725.38        14  Home Decor,Clothing,Electronics
3      C0004     5354.88        23     Books,Home Decor,Electronics
4      C0005     2034.24         7           Home Decor,Electronics


In [4]:
# Encoding
customer_features['Category'] = customer_features['Category'].astype('category').cat.codes
print("Customer Features after Encoding Categories:")
print(customer_features.head())  

Customer Features after Encoding Categories:
  CustomerID  TotalValue  Quantity  Category
0      C0001     3354.52        12        14
1      C0002     1862.74        10        50
2      C0003     2725.38        14        52
3      C0004     5354.88        23        14
4      C0005     2034.24         7        53


In [5]:
from sklearn.preprocessing import StandardScaler

# Normalizing numerical features
scaler = StandardScaler()
customer_features[['TotalValue', 'Quantity']] = scaler.fit_transform(customer_features[['TotalValue', 'Quantity']])
print("Customer Features after Normalization:")
print(customer_features.head())  

Customer Features after Normalization:
  CustomerID  TotalValue  Quantity  Category
0      C0001   -0.061701 -0.122033        14
1      C0002   -0.877744 -0.448000        50
2      C0003   -0.405857  0.203934        52
3      C0004    1.032547  1.670787        14
4      C0005   -0.783929 -0.936951        53


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate similarity matrix 
similarity_matrix = cosine_similarity(customer_features[['TotalValue', 'Quantity', 'Category']])

# Converting matrix into a DataFrame 
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])
print("Cosine Similarity Matrix (Example for first few customers):")
print(similarity_df.head())  

Cosine Similarity Matrix (Example for first few customers):
CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.999914  0.999914  0.988903  0.999906  0.999237   
C0002       0.999914  1.000000  0.999870  0.987769  0.999958  0.998638   
C0003       0.999914  0.999870  1.000000  0.990158  0.999742  0.999000   
C0004       0.988903  0.987769  0.990158  1.000000  0.986870  0.990985   
C0005       0.999906  0.999958  0.999742  0.986870  1.000000  0.998750   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.999906  0.999181  0.995836  0.723385  ...  0.998516  0.999778   
C0002       0.999928  0.998887  0.996585  0.732355  ...  0.998943  0.999915   
C0003       0.999707  0.999518  0.995121  0.724622  ...  0.998072  0.999588   
C0004       0.986777  0.99

In [7]:
# Generate top 3 lookalikes for customers C0001 to C0020
lookalikes = {}
for customer_id in customer_features['CustomerID'][:20]:
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]  
    lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))
    
# Print top 3 lookalikes for a few customers
print("Top 3 Lookalikes for Customers C0001 to C0020:")
for cust_id, lookalike in lookalikes.items():
    print(f"Customer {cust_id}: {lookalike}")

Top 3 Lookalikes for Customers C0001 to C0020:
Customer C0001: [('C0164', 0.9999994990303347), ('C0127', 0.9999980128582096), ('C0184', 0.9999945535029922)]
Customer C0002: [('C0031', 0.9999975753411989), ('C0197', 0.999995303479012), ('C0166', 0.9999907110058006)]
Customer C0003: [('C0160', 0.9999990501953097), ('C0190', 0.9999978447367519), ('C0106', 0.9999929246163431)]
Customer C0004: [('C0109', 0.9995206560178361), ('C0136', 0.9994815948467616), ('C0041', 0.9993756063666132)]
Customer C0005: [('C0159', 0.999997473571035), ('C0146', 0.9999954043317109), ('C0007', 0.9999949103943028)]
Customer C0006: [('C0079', 0.999999403565313), ('C0168', 0.9999650442922436), ('C0126', 0.999921847269312)]
Customer C0007: [('C0146', 0.9999998594297274), ('C0092', 0.9999986553454548), ('C0089', 0.9999983227376269)]
Customer C0008: [('C0124', 0.9999769627813653), ('C0090', 0.999968304426297), ('C0084', 0.9999361187675729)]
Customer C0009: [('C0177', 0.9999325544624587), ('C0130', 0.9998036888759837),

In [8]:
# Saving the results to Lookalike.csv
lookalike_df = pd.DataFrame([(key, *val) for key, values in lookalikes.items() for val in values],
                            columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv
