<a href="https://colab.research.google.com/github/ArpitRawat07/Zeotap_Assignment/blob/main/Arpit_Rawat_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gdown



In [2]:
# Download Customers.csv from Google Drive (by file id) into the working directory;
# -O sets the local output filename that the load cell reads later.
!gdown "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE" -O Customers.csv

Downloading...
From: https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE
To: /content/Customers.csv
  0% 0.00/8.54k [00:00<?, ?B/s]100% 8.54k/8.54k [00:00<00:00, 20.7MB/s]


In [3]:
# Download Products.csv from Google Drive (by file id) into the working directory;
# -O sets the local output filename that the load cell reads later.
!gdown "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0" -O Products.csv

Downloading...
From: https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0
To: /content/Products.csv
  0% 0.00/4.25k [00:00<?, ?B/s]100% 4.25k/4.25k [00:00<00:00, 17.0MB/s]


In [4]:
# Download Transactions.csv from Google Drive (by file id) into the working directory;
# -O sets the local output filename that the load cell reads later.
!gdown "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF" -O Transactions.csv

Downloading...
From: https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF
To: /content/Transactions.csv
  0% 0.00/54.7k [00:00<?, ?B/s]100% 54.7k/54.7k [00:00<00:00, 56.3MB/s]


In [10]:
# Importing required libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [6]:
# Read the three downloaded CSV files into DataFrames.
# The tuple order matches the unpacking order on the left-hand side.
customers, transactions, products = map(
    pd.read_csv,
    ("Customers.csv", "Transactions.csv", "Products.csv"),
)

In [7]:
# Parse the date columns of each frame into datetime64 values, in place.
for frame, date_col in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_col] = pd.to_datetime(frame[date_col])

## Data Preprocessing

In [8]:
# Summarise each customer's transactions: total spend, mean spend per
# transaction, and number of transactions (all derived from TotalValue).
value_by_customer = transactions.groupby('CustomerID')['TotalValue']
transaction_data = value_by_customer.agg(
    TotalSpent='sum',
    AvgSpent='mean',
    PurchaseCount='count',
).reset_index()

## Feature Engineering

In [12]:
# One-hot encode the 'Region' column: one 0/1 indicator column per distinct region.
encoder = OneHotEncoder(sparse_output=False)
region_encoded = encoder.fit_transform(customers[['Region']])

# Carry over customers' index so a later column-wise concat aligns row-for-row
# even if `customers` no longer has a default 0..n-1 index (a fresh RangeIndex
# would otherwise misalign rows and introduce NaNs).
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(['Region']),
    index=customers.index,
)

In [13]:
# Append the one-hot Region columns to the customer profile.
# This cell overwrites `customers`, so encoded columns left over from a previous
# run are dropped first; re-running the cell then never duplicates them.
customers = pd.concat(
    [customers.drop(columns=region_df.columns, errors='ignore'), region_df],
    axis=1,
)

In [15]:
# Join each customer's profile with their aggregated transaction statistics.
# Inner join (the pandas default): customers with no row in transaction_data are dropped.
customer_features = customers.merge(transaction_data, how='inner', on='CustomerID')

## Compute Cosine Similarity

In [16]:
# Feature matrix = every column except identifiers and the raw (un-encoded) profile
# fields, i.e. the one-hot Region columns plus the transaction aggregates.
non_feature_columns = ('CustomerID', 'CustomerName', 'SignupDate', 'Region')
feature_columns = list(
    filter(lambda name: name not in non_feature_columns, customer_features.columns)
)
X = customer_features.loc[:, feature_columns].to_numpy()

In [17]:
# Standardize every feature column to zero mean and unit variance so that
# large-valued columns (e.g. TotalSpent) do not dominate the cosine similarity.
# StandardScaler is imported with the other libraries at the top of the notebook.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [18]:
# Pairwise cosine similarity between all customers:
# entry [i, j] compares row i of X_scaled with row j.
similarity_matrix = cosine_similarity(X_scaled, Y=X_scaled)

In [19]:
# Build the lookalike map: for each of the first N_TARGET_CUSTOMERS rows of
# customer_features, keep the TOP_K most similar *other* customers as
# (CustomerID, similarity score) tuples, best match first.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

lookalike_map = {}
all_customer_ids = customer_features['CustomerID'].to_numpy()
n_customers = len(all_customer_ids)

# min() guards against an IndexError when the frame has fewer rows than targets.
for idx in range(min(N_TARGET_CUSTOMERS, n_customers)):
    customer_id = all_customer_ids[idx]
    similarity_scores = similarity_matrix[idx]

    # Exclude the customer by position instead of assuming its own score sorts
    # last: another customer with identical features ties with it, rounding can
    # reorder near-ties, and an all-zero feature row has self-similarity 0.
    # In any of those cases slicing off "the last element" would drop a real
    # neighbour and could return the customer as its own lookalike.
    candidate_indices = np.delete(np.arange(n_customers), idx)

    # Rank the remaining customers by descending similarity and keep the top TOP_K.
    ranked = np.argsort(similarity_scores[candidate_indices], kind='stable')[::-1]
    similar_indices = candidate_indices[ranked[:TOP_K]]

    lookalike_map[customer_id] = [
        (all_customer_ids[sim_idx], similarity_scores[sim_idx])
        for sim_idx in similar_indices
    ]

## Results

In [21]:
# Output layout: the customer, then (lookalike ID, score) pairs in rank order.
column_names = ['CustomerID', 'Lookalike_1', 'Score_1', 'Lookalike_2', 'Score_2', 'Lookalike_3', 'Score_3']

# One flat row per customer: [cust_id, id_1, score_1, id_2, score_2, id_3, score_3]
lookalike_list = [
    [cust_id] + [field for rec in recommendations for field in (rec[0], rec[1])]
    for cust_id, recommendations in lookalike_map.items()
]

# Build the table, label its columns, and write it out without the index column.
lookalike_df = pd.DataFrame(lookalike_list).set_axis(column_names, axis=1)
lookalike_df.to_csv("Lookalike.csv", index=False)