In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Directory holding the three source CSVs. Kept in one place so the notebook
# can be pointed at another copy of the data by editing a single line.
DATA_DIR = "/content/drive/MyDrive/Zeotap"

# Raw input tables: customer profiles, the product catalogue and the
# transaction log.
customer = pd.read_csv(f"{DATA_DIR}/Customers.csv")
product = pd.read_csv(f"{DATA_DIR}/Products.csv")
transaction = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [3]:
# Preview the first five customer rows.
customer.head(n=5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
# Preview the first five product rows.
product.head(n=5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
# Preview the first five transaction rows.
transaction.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


# Data Preparation

In [7]:
# Attach product attributes to every transaction. The result gets its own name
# so `transaction` keeps the raw table and this cell can be re-run safely
# (merging an already-merged frame would pile up extra suffixed columns).
# Both inputs carry a `Price` column; pandas' default suffixes rename them to
# `Price_x` (transaction side) and `Price_y` (product side).
# validate='many_to_one' raises MergeError if a key is duplicated in the lookup
# table, instead of silently multiplying transaction rows.
transaction_products = transaction.merge(product, on='ProductID', validate='many_to_one')

# Attach customer demographics to every transaction row.
customer_transactions = transaction_products.merge(customer, on='CustomerID', validate='many_to_one')

In [14]:
# Print the merged frame's column names, non-null counts and dtypes.
customer_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   ProductName      1000 non-null   object 
 8   Category         1000 non-null   object 
 9   Price_y          1000 non-null   float64
 10  CustomerName     1000 non-null   object 
 11  Region           1000 non-null   object 
 12  SignupDate       1000 non-null   object 
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


# Feature Engineering

In [16]:
# Collapse the transaction-level frame to one row per customer.
#   Total_Spend          - sum of TotalValue over the customer's transactions
#   Products_Purchased   - every ProductID bought, one entry per transaction row
#   Average_Price        - mean of Price_x, the transaction-side price column
#   Categories_Purchased - distinct categories, sorted so the list order is
#                          identical on every run (plain set() order varies
#                          between kernel sessions)
features = customer_transactions.groupby('CustomerID').agg(
    Total_Spend=('TotalValue', 'sum'),
    Products_Purchased=('ProductID', list),
    Average_Price=('Price_x', 'mean'),
    Categories_Purchased=('Category', lambda s: sorted(s.dropna().unique())),
)

# Join the aggregated features onto customer demographics. This is an inner
# join, so customers without any transaction do not appear in the result.
customer_features = customer.merge(features, on='CustomerID')


# Model Development

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

# Numeric columns fed into the similarity model, and the IDs used to label it.
feature_columns = ['Total_Spend', 'Average_Price']
customer_ids = customer_features['CustomerID']

# Standardise each feature to zero mean / unit variance so neither column
# dominates purely because of its scale.
scaler = StandardScaler()
numerical_features = scaler.fit_transform(customer_features[feature_columns])

# Pairwise cosine similarity between every pair of customers.
# NOTE(review): with only two features this compares direction, not magnitude.
similarity_matrix = cosine_similarity(numerical_features)

# Label rows and columns with CustomerID so scores can be looked up by ID.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


# Generate Recommendations

In [18]:
def get_top_matches(customer_id, similarity_df, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to find lookalikes for; must be a column of
        ``similarity_df``.
    similarity_df : pandas.DataFrame
        Similarity matrix whose index and columns are customer IDs.
    top_n : int, default 3
        Maximum number of matches to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by customer ID, highest first, with
        ``customer_id`` itself excluded. Customers with equal scores keep the
        order in which they appear in ``similarity_df``'s index.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    if top_n < 0:
        # head(-n) would silently return "all but the last n" rows.
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if customer_id not in similarity_df.columns:
        raise KeyError(f"customer_id {customer_id!r} is not a column of similarity_df")

    scores = similarity_df[customer_id]
    # Remove self-similarity from consideration.
    scores = scores[scores.index != customer_id]
    # A stable sort keeps tied scores in a reproducible order across runs.
    return scores.sort_values(ascending=False, kind='mergesort').head(top_n)

# Example for customer C0001
top_matches = get_top_matches('C0001', similarity_df)

# Top matches for the first 20 customers in customer_features, stored as
# {customer_id: {lookalike_id: similarity_score}}.
lookalikes = {
    cust_id: get_top_matches(cust_id, similarity_df).to_dict()
    for cust_id in customer_features['CustomerID'][:20]
}

# Flatten to one row per (customer, lookalike) pair. Building the frame with
# DataFrame.from_dict(..., orient='index') on the nested dicts would instead
# give one column per matched customer ID, i.e. a wide, mostly-NaN matrix
# whose columns change with the data.
lookalike_df = pd.DataFrame(
    [
        (cust_id, match_id, score)
        for cust_id, matches in lookalikes.items()
        for match_id, score in matches.items()
    ],
    columns=['CustomerID', 'LookalikeID', 'SimilarityScore'],
)
lookalike_df.to_csv('Lookalike.csv', index=False)
