# Task 2: Lookalike Model

### 2.1 Data Preprocessing

In [1]:
# Importing necessary libraries
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load datasets.
# The base folder defaults to the original download location, but it can be
# redirected with the LOOKALIKE_DATA_DIR environment variable so the notebook
# can run on a machine other than the one it was written on.
DATA_DIR = Path(os.environ.get('LOOKALIKE_DATA_DIR', r'C:\Users\hp\Downloads'))

# Each CSV lives in its own sub-folder of DATA_DIR.
customers = pd.read_csv(DATA_DIR / 'Assignment_datasets_Customers' / 'Customers.csv')
products = pd.read_csv(DATA_DIR / 'Assignment_datasets_Products' / 'Products.csv')
transactions = pd.read_csv(DATA_DIR / 'Assignment_datasets_Transactions' / 'Transactions.csv')



In [3]:
# Attach the customer record and then the product record to every transaction.
# Both joins are inner joins, so only transactions whose CustomerID and
# ProductID both resolve are kept. The second join meets a 'Price' column on
# both sides, which pandas disambiguates as Price_x (left side) and Price_y
# (products).
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner')
)

# Preview the first rows of the combined frame
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

### 2.2 Feature Engineering

In [4]:
# Build one row per customer: total spend, total units bought, and the mean
# of the Price_x column across that customer's transactions.
per_customer = merged_data.groupby('CustomerID')
customer_profile = per_customer.agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    Price_x=('Price_x', 'mean'),
).reset_index()  # turn CustomerID back into a regular column

print(customer_profile.head())


  CustomerID  TotalValue  Quantity     Price_x
0      C0001     3354.52        12  278.334000
1      C0002     1862.74        10  208.920000
2      C0003     2725.38        14  195.707500
3      C0004     5354.88        23  240.636250
4      C0005     2034.24         7  291.603333


### 2.3 Building the Lookalike Model

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Standardize the three numeric profile columns so that none of them dominates
# the similarity purely because of its scale.
profile_features = customer_profile[['TotalValue', 'Quantity', 'Price_x']]
scaler = StandardScaler().fit(profile_features)
customer_profile_scaled = scaler.transform(profile_features)

# Pairwise cosine similarity between every pair of customer rows;
# entry [i, j] compares row i with row j of customer_profile.
similarity_matrix = cosine_similarity(customer_profile_scaled)

# Create a function to get top 3 lookalike customers for a given customer
def get_lookalike_customers(customer_id, top_n=3):
    """Return the `top_n` customers most similar to `customer_id`.

    Reads the module-level `customer_profile` frame and `similarity_matrix`.
    Row/column i of the matrix is taken to describe the i-th row (by position)
    of `customer_profile`.

    Args:
        customer_id: Value to look up in `customer_profile['CustomerID']`.
        top_n: Maximum number of lookalikes to return (default 3).

    Returns:
        A list of `(CustomerID, similarity_score)` tuples ordered from most to
        least similar; ties keep profile row order. The queried customer is
        never part of the result.

    Raises:
        IndexError: If `customer_id` does not occur in `customer_profile`.
    """
    # Locate the customer by position, not by index label, so the lookup stays
    # aligned with the matrix even if the frame's index is not 0..n-1.
    is_target = (customer_profile['CustomerID'] == customer_id).to_numpy()
    positions = np.flatnonzero(is_target)
    if positions.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_profile")
    customer_pos = positions[0]

    # Drop the queried customer explicitly. Skipping "the first sorted entry"
    # is wrong when another customer ties with the self-similarity score and
    # appears earlier in the frame: the stable sort would put that customer
    # first and the queried customer would be reported as its own lookalike.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[customer_pos])
        if pos != customer_pos
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    customer_ids = customer_profile['CustomerID'].to_numpy()
    return [(customer_ids[pos], score) for pos, score in candidates[:max(top_n, 0)]]

# Collect the lookalikes of the first 20 customers, keyed by CustomerID
first_customers = customer_profile['CustomerID'].head(20)
lookalike_results = {cid: get_lookalike_customers(cid) for cid in first_customers}

# Print one line per customer followed by its list of (CustomerID, score) pairs
for customer, matches in lookalike_results.items():
    print(f"Customer {customer}: {matches}")


Customer C0001: [('C0103', 0.9975729385618538), ('C0092', 0.9968787968825864), ('C0135', 0.9927364238882177)]
Customer C0002: [('C0029', 0.9998543931340029), ('C0077', 0.9961038168882547), ('C0157', 0.9954784900159904)]
Customer C0003: [('C0111', 0.9984874468302141), ('C0190', 0.9966561574371822), ('C0038', 0.9901332836738033)]
Customer C0004: [('C0165', 0.9983897071764074), ('C0162', 0.9980867096016259), ('C0075', 0.996932345616167)]
Customer C0005: [('C0167', 0.9999721868436701), ('C0020', 0.99971426883456), ('C0128', 0.9987615592886807)]
Customer C0006: [('C0168', 0.9976122332196319), ('C0196', 0.9950250564515252), ('C0187', 0.9947524750205508)]
Customer C0007: [('C0125', 0.9998486580402707), ('C0089', 0.99834375759003), ('C0085', 0.9960335186380587)]
Customer C0008: [('C0084', 0.9960866913262758), ('C0113', 0.9958170325568012), ('C0017', 0.993173208985394)]
Customer C0009: [('C0130', 0.9999651017117013), ('C0128', 0.9985963548763069), ('C0192', 0.9985908489461927)]
Customer C0010: 

### 2.4 Saving Results

In [6]:
import csv

# Write the lookalike results to Lookalike.csv: a header row, then one row per
# customer holding the CustomerID and the string form of its lookalike list.
with open('Lookalike.csv', 'w', newline='') as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(['CustomerID', 'Lookalike_Customers_and_Similarity_Scores'])
    csv_writer.writerows(
        [customer, matches] for customer, matches in lookalike_results.items()
    )
