In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw input tables from the current working directory.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Parse the date columns so the .dt accessor can be used on them below.
for frame, date_column in (
    (customers_df, 'SignupDate'),
    (transactions_df, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

# Fill missing cells: a text placeholder for the customer and product
# tables, zero for the transaction table.
customers_df = customers_df.fillna("Unknown")
products_df = products_df.fillna("Unknown")
transactions_df = transactions_df.fillna(0)

# Replace the signup timestamp with numeric year / month / day columns.
# NOTE(review): this relies on SignupDate still being datetime-typed after
# the fill above, i.e. on the column having no missing values — confirm.
signup_dates = customers_df.pop('SignupDate')
customers_df['SignupYear'] = signup_dates.dt.year
customers_df['SignupMonth'] = signup_dates.dt.month
customers_df['SignupDay'] = signup_dates.dt.day

# Per-customer spending profile: number of transactions, total and mean
# transaction value, and total quantity bought.
customer_transaction_summary = transactions_df.groupby('CustomerID').agg(
    TransactionCount=('TransactionID', 'count'),
    TotalPurchaseValue=('TotalValue', 'sum'),
    AvgTransactionValue=('TotalValue', 'mean'),
    TotalItemsPurchased=('Quantity', 'sum')
).reset_index()

# Number of distinct product categories each customer has bought from.
# The category is attached to every transaction with one left merge and then
# counted per customer, instead of re-filtering the whole product table once
# per customer inside groupby.apply. Transactions whose ProductID is not in
# products_df get a NaN category, which nunique() ignores, so a customer with
# only unknown products still appears with a diversity of 0.
product_category_diversity = (
    transactions_df[['CustomerID', 'ProductID']]
    .merge(products_df[['ProductID', 'Category']], on='ProductID', how='left')
    .groupby('CustomerID')['Category']
    .nunique()
    .reset_index(name='ProductCategoryDiversity')
)

# Inner-join transactions with their customer and product rows; transactions
# that reference an unknown customer or product are dropped here.
merged_df = pd.merge(
    pd.merge(transactions_df, customers_df, on='CustomerID'),
    products_df,
    on='ProductID',
)

# Wide table: one row per customer, one column per (metric, product) pair.
# Pairs the customer never bought are filled with 0.
per_pair_totals = merged_df.groupby(['CustomerID', 'ProductID']).agg(
    Quantity=('Quantity', 'sum'),
    TotalValue=('TotalValue', 'sum')
)
customer_product_summary = per_pair_totals.unstack(fill_value=0)
# Flatten the two-level column index into "<metric>_<product id>" names.
customer_product_summary.columns = [
    f"{metric}_{product_id}"
    for metric, product_id in customer_product_summary.columns
]

# Customer attributes keyed by CustomerID; the name is dropped from the features.
customer_features = customers_df.set_index('CustomerID').drop(columns=['CustomerName'])

# One-hot encode Region, dropping the first level.
encoder = OneHotEncoder(sparse_output=False, drop='first')
region_column = customer_features[['Region']]
categorical_encoded = encoder.fit_transform(region_column)
region_feature_names = encoder.get_feature_names_out(['Region'])
categorical_df = pd.DataFrame(
    categorical_encoded,
    index=customer_features.index,
    columns=region_feature_names
)

# Assemble the final feature matrix. Each merge is an inner join on the
# CustomerID index; the encoded region columns are attached last with a
# left join.
final_df = customer_product_summary
for side_table in (
    customer_features.drop(columns=['Region']),
    customer_transaction_summary.set_index('CustomerID'),
    product_category_diversity.set_index('CustomerID'),
):
    final_df = final_df.merge(side_table, left_index=True, right_index=True)
final_df = final_df.join(categorical_df)

# Standardize every feature column before comparing customers. The scaler is
# fitted and applied on the same table, which is what fit_transform does.
scaler = StandardScaler()
scaler.fit(final_df)
scaled_features = scaler.transform(final_df)

# Square matrix of pairwise cosine similarities; row i / column j compares
# the i-th and j-th rows of final_df.
similarity_matrix = cosine_similarity(scaled_features)

# How many customers (in final_df row order) get recommendations, and how
# many lookalikes are reported for each.
MAX_CUSTOMERS = 20
TOP_N = 3

customer_ids = final_df.index
lookalike_results = {}

for i, customer_id in enumerate(customer_ids[:MAX_CUSTOMERS]):
    # Exclude the customer's own row by position rather than assuming it
    # sorts first: another customer tied at the same score and sitting
    # earlier in the table would otherwise push the customer into their own
    # lookalike list and drop a real match.
    ranked = sorted(
        (
            (j, score)
            for j, score in enumerate(similarity_matrix[i])
            if j != i
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Convert to a built-in float before rounding so the string written to
    # the CSV reads "0.1234" and not "np.float64(0.1234)" (NumPy >= 2 repr).
    lookalike_results[customer_id] = [
        (customer_ids[j], round(float(score), 4))
        for j, score in ranked[:TOP_N]
    ]

# One row per customer; the lookalike list is stored as its string form.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results.keys()),
    'Lookalikes': [str(value) for value in lookalike_results.values()]
})

# Save to CSV
lookalike_df.to_csv('Akshay_Shetty_Lookalike.csv', index=False)

print("Lookalike analysis completed successfully.")


Lookalike analysis completed successfully.
