Task 2: Lookalike Model

In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Read the three raw tables from CSV files in the working directory
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use the default inner join, so a transaction is kept only when
# its CustomerID and its ProductID each have a match.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

# Feature Engineering
# Collapse the transaction-level rows into one row per customer.
aggregation_rules = {
    'Region': 'first',                                       # value from the customer's first row
    'Category': lambda categories: ' '.join(categories),     # all purchased categories as one string
    'TotalValue': 'sum',                                     # total spending
    'Quantity': 'sum',                                       # total quantity purchased
}
transactions_by_customer = merged_data.groupby('CustomerID')
customer_features = transactions_by_customer.agg(aggregation_rules)
# Turn the CustomerID index back into an ordinary column
customer_features = customer_features.reset_index()

# Rescale the numeric features with MinMaxScaler (default range) so that
# spending and quantity are on a comparable scale.
numeric_columns = ['TotalValue', 'Quantity']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_features[numeric_columns])
customer_features[numeric_columns] = scaled_values

# Build one text document per customer: the region followed by the
# space-separated product categories.
region_text = customer_features['Region']
category_text = customer_features['Category']
customer_features['CombinedText'] = region_text + ' ' + category_text

# Vectorize the combined text (Region + Product Categories) with TF-IDF,
# using the vectorizer's default settings.
tfidf = TfidfVectorizer()
combined_text = customer_features['CombinedText']
tfidf_matrix = tfidf.fit_transform(combined_text)

# Place the scaled numeric columns next to the TF-IDF columns so that each
# customer is described by a single feature row.
from scipy.sparse import hstack
numeric_block = customer_features[['TotalValue', 'Quantity']]
feature_blocks = [tfidf_matrix, numeric_block]
combined_features = hstack(feature_blocks)

# Pairwise cosine similarity: entry [i, j] compares customer row i with row j
similarity_matrix = cosine_similarity(combined_features)

# Recommend the top 3 most similar customers for the first 20 rows of
# customer_features. Selection is positional: these rows are C0001-C0020 only
# if every one of those customers appears in customer_features (i.e. has at
# least one merged transaction) -- verify against the data.
lookalike_results = {}

# Materialise the IDs once so that row positions in similarity_matrix map to
# customer IDs by position rather than by index label.
customer_ids = customer_features['CustomerID'].tolist()

for idx, customer_id in enumerate(customer_ids[:20]):
    # Pair every *other* customer with its similarity to the current one.
    # Scores are cast to built-in float: the results are later serialised
    # with str(), and NumPy >= 2 renders np.float64 scalars inside
    # containers as "np.float64(...)", which would corrupt the output.
    candidates = [
        (customer_ids[other_idx], float(score))
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Highest similarity first; the sort is stable, so ties keep row order
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Keep the 3 best matches as (customer_id, score) tuples
    lookalike_results[customer_id] = candidates[:3]

# Flatten the results into one record per customer; the list of
# (lookalike_id, score) tuples is stored as its string representation.
lookalike_rows = []
for cust_id, lookalikes in lookalike_results.items():
    lookalike_rows.append({'cust_id': cust_id, 'lookalikes': str(lookalikes)})
lookalike_df = pd.DataFrame(lookalike_rows)

# Write the table to CSV without the DataFrame index and report completion
lookalike_df.to_csv('Chukka_DhanyaDeepika_Lookalike.csv', index=False)
print("Lookalike results saved to Chukka_DhanyaDeepika_Lookalike.csv.")

Lookalike results saved to Chukka_DhanyaDeepika_Lookalike.csv.
