In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from datetime import datetime

In [3]:
from google.colab import files

# Prompt for a manual upload; the returned mapping is keyed by file name
uploaded = files.upload()

# Confirm each uploaded file by name
for file_name in uploaded:
    print(f"Uploaded file: {file_name}")

Saving Customers.csv to Customers.csv
Uploaded file: Customers.csv


In [4]:
from google.colab import files

# Prompt for a manual upload; the returned mapping is keyed by file name
uploaded = files.upload()

# Confirm each uploaded file by name
for file_name in uploaded:
    print(f"Uploaded file: {file_name}")

Saving Products.csv to Products.csv
Uploaded file: Products.csv


In [5]:
from google.colab import files

# Prompt for a manual upload; the returned mapping is keyed by file name
uploaded = files.upload()

# Confirm each uploaded file by name
for file_name in uploaded:
    print(f"Uploaded file: {file_name}")

Saving Transactions.csv to Transactions.csv
Uploaded file: Transactions.csv


In [6]:
# Read the three uploaded CSV files into DataFrames, in this order:
# customers, products, transactions
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

Task 2: Lookalike Model


In [12]:
# Re-read the two tables used by the lookalike model so this cell runs on its own
customers = pd.read_csv("Customers.csv")        # one row per customer
transactions = pd.read_csv("Transactions.csv")  # one row per transaction

# Show the first rows of each table, one frame after the other
print(customers.head(), transactions.head(), sep="\n")


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  


In [14]:
class LookalikeFinder:
    """Find customers that resemble a given customer.

    Each customer is described by a feature vector made of signup recency,
    per-customer transaction aggregates and a one-hot encoded region. The
    vectors are mean-imputed, standardised, and compared by cosine similarity.

    Attributes set by the constructor:
        customers: private copy of the customer table (plus a ``SignupDays``
            column added during preprocessing).
        products: the product table, stored as passed in.
        transactions: the transaction table, stored as passed in.
        customer_features: one row per customer that has at least one
            transaction, with a fresh 0..n-1 index.
        scaler: the fitted ``StandardScaler``.
        X_scaled: scaled feature matrix; row ``i`` describes row ``i`` of
            ``customer_features``.
    """

    def __init__(self, customers, products, transactions):
        """Store the input tables and build the scaled feature matrix.

        Args:
            customers: DataFrame with ``CustomerID``, ``CustomerName``,
                ``Region`` and ``SignupDate`` columns.
            products: product DataFrame (kept on the instance as-is).
            transactions: DataFrame with ``CustomerID``, ``TotalValue`` and
                ``Quantity`` columns.
        """
        # Copy because preprocessing adds a column to the customer table.
        self.customers = customers.copy()
        self.products = products
        self.transactions = transactions

        # Preprocess data
        self.preprocess_data()

    def preprocess_data(self):
        """Build ``customer_features`` and the scaled matrix ``X_scaled``."""
        # Convert SignupDate to days since first signup
        signup_dates = pd.to_datetime(self.customers['SignupDate'])
        self.customers['SignupDays'] = (signup_dates - signup_dates.min()).dt.days

        # Aggregate transaction data per customer
        customer_transactions = self.transactions.groupby('CustomerID').agg({
            'TotalValue': ['sum', 'mean', 'count'],
            'Quantity': ['sum', 'mean']
        }).reset_index()
        customer_transactions.columns = ['CustomerID', 'TotalValue_Sum', 'TotalValue_Mean',
                                         'Transaction_Count', 'Quantity_Sum', 'Quantity_Mean']

        # Merge transaction data with customer data. The default inner join
        # drops customers that have no transactions, so the merged frame can
        # be shorter than self.customers; give it a clean positional index.
        merged = self.customers.merge(
            customer_transactions, on='CustomerID'
        ).reset_index(drop=True)

        # One-hot encode Region FROM THE MERGED FRAME. Building the dummies
        # from self.customers and concatenating by index would attach regions
        # to the wrong customers (and add an all-NaN row) whenever the merge
        # dropped someone.
        region_dummies = pd.get_dummies(merged['Region'], prefix='Region', dtype=float)
        self.customer_features = pd.concat([merged, region_dummies], axis=1)

        # Select features
        feature_columns = [
            'SignupDays', 'TotalValue_Sum', 'TotalValue_Mean',
            'Transaction_Count', 'Quantity_Sum', 'Quantity_Mean'
        ] + list(region_dummies.columns)

        # Prepare data for scaling
        X = self.customer_features[feature_columns]

        # Impute missing values with mean
        imputer = SimpleImputer(strategy='mean')
        X_imputed = imputer.fit_transform(X)

        # Scale features
        self.scaler = StandardScaler()
        self.X_scaled = self.scaler.fit_transform(X_imputed)

    def find_lookalikes(self, customer_id, top_n=3):
        """Return the ``top_n`` customers most similar to ``customer_id``.

        Args:
            customer_id: ID of the target customer (compared as a string).
            top_n: maximum number of lookalikes to return.

        Returns:
            DataFrame with ``CustomerID``, ``CustomerName``, ``Region`` and
            ``SimilarityScore`` columns, most similar first. The target is
            never included. If the customer is not present in
            ``customer_features`` a message is printed and an empty
            DataFrame is returned.
        """
        # Locate the target by POSITION so it lines up with X_scaled rows.
        id_matches = (
            self.customer_features['CustomerID'].astype(str) == str(customer_id)
        ).to_numpy()
        match_positions = np.flatnonzero(id_matches)

        if len(match_positions) == 0:
            print(f"Customer {customer_id} not found.")
            return pd.DataFrame()

        target_pos = match_positions[0]

        # Compute cosine similarity of the target against every customer
        similarities = cosine_similarity(
            self.X_scaled[target_pos].reshape(1, -1), self.X_scaled
        )[0]

        # Rank best-first and drop the target explicitly. A -1 sentinel is not
        # safe here: -1 is a valid cosine similarity, so the target could tie
        # with a real candidate or reappear when top_n is large.
        ranked = np.argsort(similarities)[::-1]
        top_positions = ranked[ranked != target_pos][:max(top_n, 0)]

        # Look the winners up in the same frame X_scaled was built from.
        lookalikes = self.customer_features.iloc[top_positions][
            ['CustomerID', 'CustomerName', 'Region']
        ].copy()
        lookalikes['SimilarityScore'] = similarities[top_positions]

        return lookalikes[['CustomerID', 'CustomerName', 'Region', 'SimilarityScore']]

    def generate_lookalike_map(self, start_id='C0001', end_id='C0020'):
        """Collect lookalikes for every customer whose ID is in a range.

        The range test is a lexicographic string comparison, so it matches
        numeric order only when IDs share a fixed-width format.

        Args:
            start_id: lowest customer ID to include (inclusive).
            end_id: highest customer ID to include (inclusive).

        Returns:
            DataFrame with one row per (source, lookalike) pair and the
            columns ``SourceCustomer``, ``LookalikeCustomer`` and
            ``SimilarityScore``. The columns are present even when no pair
            was found.
        """
        lookalike_map = []

        # Compare IDs as strings without rewriting the stored customer table.
        customer_ids = self.customers['CustomerID'].astype(str)
        ids_in_range = customer_ids[(customer_ids >= start_id) & (customer_ids <= end_id)]

        for cust_id in ids_in_range:
            lookalikes = self.find_lookalikes(cust_id)
            if lookalikes.empty:
                # Customer missing from the feature table; already reported.
                continue
            for match_id, score in zip(lookalikes['CustomerID'],
                                       lookalikes['SimilarityScore']):
                lookalike_map.append({
                    'SourceCustomer': cust_id,
                    'LookalikeCustomer': match_id,
                    'SimilarityScore': score
                })

        return pd.DataFrame(
            lookalike_map,
            columns=['SourceCustomer', 'LookalikeCustomer', 'SimilarityScore']
        )

In [15]:
# Build the finder (its constructor runs the preprocessing step), then
# generate the lookalike table using the method's default ID range
lookalike_finder = LookalikeFinder(
    customers=customers,
    products=products,
    transactions=transactions,
)
lookalike_result = lookalike_finder.generate_lookalike_map()

In [17]:
# Save results. The path is defined once so the confirmation message always
# names the file that was actually written (the message previously reported
# a different file name than the one passed to to_csv).
OUTPUT_PATH = 'Suprith_Shettigar_Lookalike.csv'
lookalike_result.to_csv(OUTPUT_PATH, index=False)
print(f"Lookalike model completed. Results saved to {OUTPUT_PATH}")

Lookalike model completed. Results saved to Anushka_Gupta_Lookalike.csv
