In [2]:
import pandas as pd

# Read the three source tables; paths are relative to the notebook's working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [3]:
# Preview the leading rows of each table, in load order
for table in (customers, products, transactions):
    print(table.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [4]:
# Report the number of null entries per column for each table
for table in (customers, products, transactions):
    print(table.isna().sum())


CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [5]:
# Remove fully duplicated rows from each table (no separate duplicate check is performed)
customers = customers.drop_duplicates()
products = products.drop_duplicates()
transactions = transactions.drop_duplicates()

In [6]:
# Attach customer and product attributes to every transaction (default inner joins)
merged_data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID")
)

# Collapse to one row per customer: region, purchased product/category lists, and totals
profile_aggregations = {
    'Region': 'first',
    'ProductName': list,
    'Category': list,
    'TotalValue': 'sum',
    'Quantity': 'sum',
}
customer_profile = (
    merged_data
    .groupby("CustomerID")
    .agg(profile_aggregations)
    .reset_index()
)

# One-hot encode the region column
customer_profile = pd.get_dummies(customer_profile, columns=["Region"])


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build one text "document" per customer from the names of the products they bought.
# NOTE: only ProductName feeds the similarity model; the Category lists collected in
# customer_profile are not used here.
customer_profile['Preferences'] = customer_profile['ProductName'].apply(' '.join)

# Bag-of-words vectors: one row per customer, one column per product-name token
vectorizer = CountVectorizer()
preference_vectors = vectorizer.fit_transform(customer_profile['Preferences'])

# Pairwise cosine similarity between all customers (square matrix, row i vs. column j)
similarity_matrix = cosine_similarity(preference_vectors)

# Number of lookalikes kept per customer
TOP_N = 3

# Positional array of IDs so matrix indices map to customers regardless of the frame's index labels
customer_ids = customer_profile['CustomerID'].to_numpy()

# Map each customer ID to its TOP_N most similar *other* customers as (CustomerID, score) pairs
similarity_scores = {}
for idx, customer_id in enumerate(customer_ids):
    # Exclude the customer by position instead of assuming they sort first: another
    # customer with an identical vector ties at 1.0 and may be ordered ahead of them.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # list.sort is stable, so ties keep their original (lower index first) order
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Store built-in floats so the scores serialise cleanly later on
    similarity_scores[customer_id] = [
        (customer_ids[other_idx], float(score))
        for other_idx, score in candidates[:TOP_N]
    ]


In [9]:
import csv

# Restrict the export to customers C0001..C0020; IDs without a profile row are simply absent
first_20_ids = [f"C{i:04d}" for i in range(1, 21)]
first_20_customers = customer_profile[customer_profile['CustomerID'].isin(first_20_ids)]

# Write one row per customer: the ID and its list of (lookalike ID, score) pairs
with open('Lookalike.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'Lookalikes'])

    for customer_id in first_20_customers['CustomerID']:
        # The list is written via str(), which uses each element's repr. Convert scores to
        # built-in floats so NumPy scalars do not appear as "np.float64(...)" in the file.
        lookalikes = [
            (other_id, float(score))
            for other_id, score in similarity_scores[customer_id]
        ]
        writer.writerow([customer_id, lookalikes])
