In [7]:
#import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Load the three source tables (customers, products, transactions) from CSV
# files addressed relative to the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [13]:
# Preview the first rows of each table; a blank-line separator is printed
# between consecutive tables.
for table in (customers, products):
    print(table.head())
    print("\n")
print(transactions.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15


  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3   

In [15]:
# Report the number of missing values per column for each table, in the
# order customers, products, transactions.
for table in (customers, products, transactions):
    print(table.isnull().sum())

CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [17]:
# Prepare data: attach customer attributes, then product attributes, to every
# transaction row.
# - Both merges use pandas' default inner join, so transactions without a
#   matching customer/product (and customers with no transactions) are absent
#   from the result.
# - `Price` exists in both transactions and products, so the merged frame
#   carries the suffixed pair `Price_x` / `Price_y` rather than one column.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged = pd.merge(transactions_with_customers, products, on='ProductID')


In [33]:
# Feature Engineering
# Collapse the transaction-level table to one row per customer:
#   TotalValue     - sum of TotalValue over the customer's transactions
#   Quantity       - sum of Quantity over the customer's transactions
#   UniqueProducts - number of distinct ProductID values
#   Region         - Region taken from the customer's first row
per_customer = merged.groupby('CustomerID').agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    UniqueProducts=('ProductID', 'nunique'),
    Region=('Region', 'first'),
).reset_index()

# One-hot encode Region so that every feature column is numeric.
customer_features = pd.get_dummies(per_customer, columns=['Region'])


In [37]:
# Normalize Data
# Standardize every feature column (everything except the CustomerID key)
# with StandardScaler, which by default centers to zero mean and scales to
# unit variance.
feature_columns = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
features_scaled = scaler.fit_transform(feature_columns)

In [41]:
# Compute pairwise cosine similarity between the scaled feature rows.
# similarity[i][j] is the score between the customers at row positions i and
# j of customer_features; lookalikes() indexes the matrix that way.
similarity = cosine_similarity(features_scaled)

In [43]:
# Recommend Lookalikes
def lookalikes(index, top_n=3):
    """Return the ``top_n`` customers most similar to the customer at ``index``.

    Parameters
    ----------
    index : int
        Row position of the query customer in ``customer_features`` (and
        therefore in the ``similarity`` matrix).
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (CustomerID, score) tuples
        Ordered from most to least similar; ties keep ascending row order.
        The query customer itself is never included. Fewer than ``top_n``
        entries are returned when there are not enough other customers.
    """
    row = similarity[index]
    # Normalise negative positions so the self-exclusion below still matches.
    self_position = index % len(row)

    # Exclude the query customer by position rather than by assuming it sorts
    # first: another customer with an identical feature vector also scores
    # 1.0 (and floating-point rounding can leave the self-score marginally
    # below a neighbour's), in which case dropping "the first entry" would
    # discard a genuine lookalike and return the customer as its own match.
    candidates = [(i, score) for i, score in enumerate(row) if i != self_position]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    customer_ids = customer_features['CustomerID']
    return [(customer_ids.iloc[i], score) for i, score in candidates[:top_n]]
    

In [47]:
# Generate Recommendations
# Build one output row per customer: its own ID followed by the IDs and
# similarity scores of its three closest lookalikes.
results = []
for position, customer_id in enumerate(customer_features['CustomerID']):
    matches = lookalikes(position)
    first, second, third = matches[0], matches[1], matches[2]
    results.append({
        'CustomerID': customer_id,
        'LookalikeID1': first[0], 'Score1': first[1],
        'LookalikeID2': second[0], 'Score2': second[1],
        'LookalikeID3': third[0], 'Score3': third[1],
    })

In [53]:
# Save to CSV
# Turn the list of per-customer result dicts into a table and write it to
# Lookalike.csv without the DataFrame's row index.
lookalike_df = pd.DataFrame(results)
lookalike_df.to_csv('Lookalike.csv', index=False)

In [55]:
# Read the saved Lookalike.csv back from disk and print its first rows.
output = pd.read_csv('Lookalike.csv')
print(output.head())

  CustomerID LookalikeID1    Score1 LookalikeID2    Score2 LookalikeID3  \
0      C0001        C0107  0.996725        C0137  0.996063        C0191   
1      C0002        C0142  0.994534        C0177  0.986784        C0178   
2      C0003        C0133  0.966381        C0190  0.966171        C0174   
3      C0004        C0113  0.988351        C0102  0.972712        C0104   
4      C0005        C0186  0.998580        C0159  0.998260        C0007   

     Score3  
0  0.990618  
1  0.978913  
2  0.952801  
3  0.972457  
4  0.992426  
