<a href="https://colab.research.google.com/github/Amruth-varsh/Data-Science-Assignment-eCommerce-Transactions-Dataset-/blob/main/Pettem_Amruthvarsh_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
# read csv file
df_customers = pd.read_csv('/content/Customers.csv')
df_products = pd.read_csv('/content/Products.csv')
df_transactions = pd.read_csv('/content/Transactions.csv')
df_customers.head()
df_products.head()
df_transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [2]:
# Lookalike modeling
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load and prepare the data
def load_data():
    customers = pd.read_csv('Customers.csv')
    products = pd.read_csv('Products.csv')
    transactions = pd.read_csv('Transactions.csv')
    return customers, products, transactions

# 2. Feature Engineering
def create_customer_features(customers, transactions, products):
    # Customer Transaction Aggregates
    customer_transactions = transactions.groupby('CustomerID').agg({
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean', 'std'],
        'Quantity': ['sum', 'mean', 'std']
    }).fillna(0)

    # Flatten column names
    customer_transactions.columns = ['_'.join(col).strip() for col in customer_transactions.columns.values]

    # Customer Product Category Preferences
    product_categories = pd.merge(transactions, products[['ProductID', 'Category']], on='ProductID')
    category_pivot = pd.crosstab(product_categories['CustomerID'], product_categories['Category'])
    category_pivot = category_pivot.div(category_pivot.sum(axis=1), axis=0)

    # Customer Recency, Frequency, Monetary (RFM) Features
    current_date = pd.to_datetime('2025-01-27')  # Using the provided current date
    customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
    customers['account_age'] = (current_date - customers['SignupDate']).dt.days

    last_transaction_dates = transactions.groupby('CustomerID')['TransactionDate'].max()
    last_transaction_dates = pd.to_datetime(last_transaction_dates)
    recency = (current_date - last_transaction_dates).dt.days

    # Combine all features
    customer_features = pd.concat([
        customer_transactions,
        category_pivot,
        pd.DataFrame({'account_age': customers['account_age'],
                      'recency': recency}),
        pd.get_dummies(customers['Region'])
    ], axis=1).fillna(0)

    return customer_features

# 3. Calculate Similarity Scores
def calculate_similarity(customer_features):
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(customer_features)
    similarity_matrix = cosine_similarity(scaled_features)
    return similarity_matrix

# 4. Get Top Lookalikes
def get_top_lookalikes(customer_id, similarity_matrix, customer_features, n=3):
    customer_index = customer_features.index.get_loc(customer_id)
    similarities = similarity_matrix[customer_index]

    # Get top n similar customers (excluding self)
    similar_indices = np.argsort(similarities)[::-1][1:n+1]
    similar_scores = similarities[similar_indices]
    similar_customers = customer_features.index[similar_indices]

    return list(zip(similar_customers, similar_scores))

In [4]:
# Load data
customers, products, transactions = load_data()

# Create features
customer_features = create_customer_features(customers, transactions, products)

# Calculate similarity matrix
similarity_matrix = calculate_similarity(customer_features)

# Generate lookalikes for first 20 customers
lookalike_results = {}
for cust_id in customers['CustomerID'][:20]:
    lookalikes = get_top_lookalikes(cust_id, similarity_matrix, customer_features)
    lookalike_results[cust_id] = [
        {'customer_id': cust, 'similarity_score': float(score)}
        for cust, score in lookalikes
    ]

# Create and save the results DataFrame
results_list = []
for cust_id, lookalikes in lookalike_results.items():
    row = {
        'customer_id': cust_id,
        'lookalike_1': lookalikes[0]['customer_id'],
        'score_1': round(lookalikes[0]['similarity_score'], 4),
        'lookalike_2': lookalikes[1]['customer_id'],
        'score_2': round(lookalikes[1]['similarity_score'], 4),
        'lookalike_3': lookalikes[2]['customer_id'],
        'score_3': round(lookalikes[2]['similarity_score'], 4)
    }
    results_list.append(row)

results_df = pd.DataFrame(results_list)
results_df.to_csv('Lookalike.csv', index=False)
print(results_df)

   customer_id lookalike_1  score_1 lookalike_2  score_2 lookalike_3  score_3
0        C0001       C0035   0.9318       C0069   0.9116       C0146   0.8787
1        C0002       C0133   0.9477       C0134   0.9294       C0106   0.8571
2        C0003       C0166   0.8962       C0129   0.8848       C0106   0.8591
3        C0004       C0017   0.9740       C0075   0.9540       C0122   0.9535
4        C0005       C0197   0.9475       C0069   0.9359       C0007   0.8904
5        C0006       C0135   0.9693       C0187   0.9288       C0153   0.9106
6        C0007       C0199   0.9055       C0026   0.8994       C0035   0.8920
7        C0008       C0162   0.9571       C0113   0.9345       C0181   0.9252
8        C0009       C0033   0.8652       C0150   0.8419       C0058   0.7955
9        C0010       C0030   0.9411       C0034   0.9173       C0061   0.9150
10       C0011       C0126   0.9754       C0027   0.9230       C0064   0.9200
11       C0012       C0065   0.9588       C0104   0.9536       C