In [None]:
# Approach for the Lookalike Model implementation

# Step 1: Load the datasets

# Step 2: Merge the datasets
#   Merge Transactions.csv with Customers.csv using CustomerID
#   Merge the result with Products.csv using ProductID to get complete data

# Step 3: Feature Engineering
#   Calculate TotalSpending (sum of TotalValue), AvgTransactionValue (mean of TotalValue),
#   and PurchaseFrequency (count of transactions) for each customer
#   Extract MostPurchasedCategory (mode of product category for each customer)
#   Add Region from Customers.csv
#   Encode categorical variables like Region and MostPurchasedCategory using one-hot encoding

# Step 4: Normalize features
#   Use MinMaxScaler to scale numerical features: TotalSpending, AvgTransactionValue, and PurchaseFrequency

# Step 5: Compute Similarity Scores
#   Create a feature matrix excluding the CustomerID column
#   Compute pairwise cosine similarity between all customers using sklearn's cosine_similarity

# Step 6: Generate Recommendations
#   For the first 20 customers (CustomerID: C0001 to C0020):
#     Retrieve similarity scores from the similarity matrix
#     Sort scores in descending order (excluding the customer's self-similarity)
#     Select the top 3 most similar customers and their similarity scores
#   Save the recommendations in a CSV file in the format: Map<cust_id, List<cust_id, score>>

# Final Output:
#   A CSV file named 'Lookalike.csv' containing the top 3 lookalike customers for the first 20 customers.

In [4]:
import os
from pathlib import Path

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Folder holding the three input CSVs. It defaults to the original author's
# location so existing runs keep working, but can be redirected without editing
# the notebook by setting the LOOKALIKE_DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get("LOOKALIKE_DATA_DIR", "/Users/abhay/Desktop/Zeotap"))

# Load the raw datasets.
customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")


# Attach customer attributes to every transaction. validate= makes the merge
# fail loudly if Customers.csv contains a duplicated CustomerID, which would
# otherwise silently duplicate transactions and inflate the spending totals.
merged_data = transactions.merge(
    customers, on='CustomerID', how='left', validate='many_to_one'
)

# Attach product attributes (including Category) the same way.
merged_data = merged_data.merge(
    products, on='ProductID', how='left', validate='many_to_one'
)

# Feature engineering: one row per customer with total spending, average
# transaction value, purchase frequency and most purchased category.
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],
    'TransactionID': 'count',
    # Most purchased category. mode() ignores NaN and is empty when every
    # Category for the customer is missing (e.g. unmatched ProductID), so guard
    # against indexing an empty result. This must stay a lambda: the rename step
    # later in this cell expects the flattened label 'Category_<lambda>'.
    'Category': lambda x: x.mode().iloc[0] if x.notna().any() else None
}).reset_index()

# Flatten the column names
def flatten_columns(df):
    """Flatten multi-level column labels of ``df`` into single strings.

    Tuple labels (as produced by ``groupby().agg()`` with several functions per
    column) are joined with ``'_'`` and stripped of leading/trailing
    underscores, so ``('TotalValue', 'sum')`` becomes ``'TotalValue_sum'`` and
    ``('CustomerID', '')`` becomes ``'CustomerID'``. Labels that are not tuples
    (already-flat columns) are left untouched instead of being split into
    characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be flattened. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient chaining.
    """
    def _flatten(label):
        # Only tuples come from a MultiIndex; str() guards non-string levels.
        if isinstance(label, tuple):
            return '_'.join(str(part) for part in label).strip('_')
        return label

    df.columns = [_flatten(label) for label in df.columns.values]
    return df

# Collapse the two-level aggregation header into flat names, then give the
# aggregates descriptive names. rename() returns a new frame, so no in-place
# mutation is needed.
customer_features = flatten_columns(customer_features).rename(columns={
    'TotalValue_sum': 'TotalSpending',
    'TotalValue_mean': 'AvgTransactionValue',
    'TransactionID_count': 'PurchaseFrequency',
    'Category_<lambda>': 'MostPurchasedCategory'
})

# Add Region as a categorical feature. validate= guards against duplicated
# CustomerIDs in the customers table producing duplicate feature rows.
customer_features = customer_features.merge(
    customers[['CustomerID', 'Region']],
    on='CustomerID',
    how='left',
    validate='many_to_one'
)

# One-hot encode the categorical variables (Region, MostPurchasedCategory).
# Every level is kept (no drop_first): dropping a reference level gives those
# customers an all-zero block, so their cosine similarity would depend on which
# level happens to sort first. dtype=float keeps the feature matrix numeric.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region', 'MostPurchasedCategory'],
    dtype=float
)

# Normalize the numerical features to [0, 1] so they share the scale of the
# one-hot columns.
scaler = MinMaxScaler()
numerical_cols = ['TotalSpending', 'AvgTransactionValue', 'PurchaseFrequency']
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

# Feature matrix: every column except the identifier.
feature_matrix = customer_features.drop(columns=['CustomerID'])

# Pairwise cosine similarity; row/column i corresponds to row i of
# customer_features.
similarity_matrix = cosine_similarity(feature_matrix)

# Generate recommendations: the top 3 lookalikes for customers C0001..C0020.
TOP_N = 3
TARGET_CUSTOMER_IDS = [f"C{n:04d}" for n in range(1, 21)]

# Positional lookup between CustomerID and its row in similarity_matrix.
customer_ids = customer_features['CustomerID'].tolist()
row_of = {cust_id: pos for pos, cust_id in enumerate(customer_ids)}

recommendations = {}
for customer_id in TARGET_CUSTOMER_IDS:
    row = row_of.get(customer_id)
    if row is None:
        # Customers without any transaction have no feature vector.
        print(f"Skipping {customer_id}: no feature row available")
        continue

    # Exclude the customer itself by position rather than assuming it sorts
    # first: another customer can tie at similarity 1.0. float() converts the
    # numpy scalar so the CSV holds plain numbers, not 'np.float64(...)'.
    candidates = [
        (pos, float(score))
        for pos, score in enumerate(similarity_matrix[row])
        if pos != row
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Store the top N similar customers together with their scores.
    recommendations[customer_id] = [
        (customer_ids[pos], score) for pos, score in candidates[:TOP_N]
    ]

# Convert to the required CSV layout: one row per customer, with the list of
# (lookalike_id, score) pairs in a single column.
lookalike_list = [
    {'CustomerID': cust_id, 'Lookalike': similar_customers}
    for cust_id, similar_customers in recommendations.items()
]

lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'Lookalike'])

# Save to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model recommendations saved to 'Lookalike.csv'")

Lookalike model recommendations saved to 'Lookalike.csv'
