In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three raw input tables (customers, products, transactions)
# from the local data/ directory, in that order.
customers, products, transactions = map(
    pd.read_csv,
    ('data/Customers.csv', 'data/Products.csv', 'data/Transactions.csv'),
)

In [3]:
# Normalise headers on every table: trim surrounding whitespace and
# lower-case them so later merges can use keys like 'customerid'.
for table in (customers, products, transactions):
    table.columns = table.columns.str.strip().str.lower()


In [4]:
# Force 'price' to a numeric dtype; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
products = products.assign(
    price=pd.to_numeric(products['price'], errors='coerce')
)


In [5]:
# Keep only the products that have a price (rows with a missing price
# are discarded).
has_price = products['price'].notna()
products = products[has_price]


In [6]:
# Quick look at the cleaned products table: heading, first rows, column names.
print("Products DataFrame:", products.head(), products.columns, sep="\n")

Products DataFrame:
  productid              productname     category   price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
Index(['productid', 'productname', 'category', 'price'], dtype='object')


In [7]:
# Attach customer attributes and product attributes to every transaction.
# Both joins are inner joins (the pandas default), so a transaction is kept
# only when its customerid and productid are found in the lookup tables.
merged_data = (
    transactions
    .merge(customers, on='customerid', how='inner')
    .merge(products, on='productid', how='inner')
)


In [8]:
# The merge suffixed the two colliding 'price' columns as price_x / price_y;
# give them descriptive names (left side = transactions, right = products).
price_column_names = {
    'price_x': 'transaction_price',
    'price_y': 'product_price',
}
merged_data = merged_data.rename(columns=price_column_names)


In [9]:
# Quick look at the merged table: heading, first rows, column names.
print("Merged DataFrame:", merged_data.head(), merged_data.columns, sep="\n")

Merged DataFrame:
  transactionid customerid productid      transactiondate  quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   totalvalue  transaction_price     customername         region  signupdate  \
0      300.68             300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68             300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68             300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36             300.68  Travis Campbell  South America  2024-04-11   
4      902.04             300.68    Timothy Perez         Europe  2022-03-15   

                       productname     category  product_price  
0  Comfor

In [10]:
# One row per customer with three behavioural features:
#   totalvalue    - sum of totalvalue over the customer's transactions
#   quantity      - sum of quantity over the customer's transactions
#   product_price - mean product_price over the customer's transactions
# reset_index() moves customerid back into an ordinary first column.
per_customer = merged_data.groupby('customerid')
user_profiles = per_customer.agg(
    totalvalue=('totalvalue', 'sum'),
    quantity=('quantity', 'sum'),
    product_price=('product_price', 'mean'),
).reset_index()


In [11]:
# Standardise the feature columns (everything after the first column, which
# holds customerid) so each feature contributes on a comparable scale.
scaler = StandardScaler()
profile_features = user_profiles.iloc[:, 1:]
user_profiles_scaled = scaler.fit(profile_features).transform(profile_features)


In [None]:
# Pairwise cosine similarity between all scaled customer profiles, wrapped
# in a DataFrame labelled by customerid on both axes for lookup by id.
similarities = cosine_similarity(user_profiles_scaled)
id_labels = user_profiles['customerid']
similarity_df = pd.DataFrame(
    data=similarities,
    index=id_labels,
    columns=id_labels,
)


In [None]:
# Build {customerid: [(lookalike_id, similarity), ...]} with the three most
# similar other customers for each of the first 20 customers in user_profiles.
lookalike_results = {}
for customer_id in user_profiles['customerid'][:20]:
    # Remove the customer's own entry by label rather than assuming it sorts
    # first: another customer can tie at similarity 1.0 (or floating-point
    # rounding can nudge the self-score just below a neighbour's), in which
    # case slicing [1:4] would keep the customer as their own lookalike and
    # silently drop a genuine match.
    top_matches = (
        similarity_df[customer_id]
        .drop(customer_id)
        .sort_values(ascending=False)
        .head(3)
    )
    # Store plain Python floats so the list renders as bare numbers when it
    # is later stringified (NumPy 2 scalars repr as "np.float64(...)").
    lookalike_results[customer_id] = [
        (other_id, float(score)) for other_id, score in top_matches.items()
    ]


In [None]:
# Two-column result table: one row per customer, with that customer's list
# of (lookalike_id, similarity) pairs. The dict views are materialised as
# lists so both columns are built from ordinary sequences.
result_ids = list(lookalike_results.keys())
result_matches = list(lookalike_results.values())
lookalike_df = pd.DataFrame({
    'customerid': result_ids,
    'lookalikes': result_matches,
})


In [None]:
# Write the recommendations to Lookalike.csv in the working directory
# (without the DataFrame index), then confirm on stdout.
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)
print("Lookalike recommendations saved to 'Lookalike.csv'.")