# Task 2: Lookalike Model

The goal of the lookalike model is to identify similar customers based on their profile and transaction history

# Step 1: Data Preprocessing

In [3]:
import pandas as pd

customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')


#  Cleaning Data

Check for missing values, duplicate records, or invalid data (e.g., negative prices or quantities

In [4]:
customers_df.isnull().sum()
products_df.isnull().sum()
transactions_df.isnull().sum()

customers_df.drop_duplicates(inplace=True)
products_df.drop_duplicates(inplace=True)
transactions_df.drop_duplicates(inplace=True)


#  Merging DataFrames

Merging the datasets based on relevant keys (e.g., CustomerID, ProductID) to create a unified DataFrame that includes customer information, transaction details, and product details.

In [5]:
merged_df = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')
merged_df = pd.merge(merged_df, products_df, on='ProductID', how='left')


# step 2 : Feature Engineering

To compare customers, we need to define useful features that summarize their behavior and profile. These features will be used to compute the similarity between customers.

 # Customer Profiling

Calculating metrics such as the total value of purchases, frequency of purchases, and average transaction amount.

In [6]:
customer_profile = merged_df.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean')
).reset_index()


# Product Preferences

creating features based on the products customers buy, such as the most frequently purchased product category or total value spent on different categories.



In [8]:
customer_product_preference = merged_df.groupby(['CustomerID', 'Category']).agg(
    total_spent_on_category=('TotalValue', 'sum')
).unstack().fillna(0)


# Step 3: Calculating Similarity

# Combining Features

Combining customer profile data with product preferences data.

In [16]:
# Rename columns to ensure they are properly labeled
customer_product_preference.columns = ['CustomerID', 'EmptyColumn', 'Books', 'Clothing', 'Electronics', 'Home Decor']

# Drop the 'EmptyColumn' as it seems unnecessary
customer_product_preference = customer_product_preference.drop(columns=['EmptyColumn'])

# Now check if columns are renamed correctly
print(customer_product_preference.columns)


Index(['CustomerID', 'Books', 'Clothing', 'Electronics', 'Home Decor'], dtype='object')


In [18]:
# Convert 'CustomerID' to string in both dataframes
customer_profile['CustomerID'] = customer_profile['CustomerID'].astype(str)
customer_product_preference['CustomerID'] = customer_product_preference['CustomerID'].astype(str)

# Now perform the merge
final_features = pd.merge(customer_profile, customer_product_preference, on='CustomerID')

# Check the merged result
print(final_features.head())


Empty DataFrame
Columns: [CustomerID, total_spent, transaction_count, avg_transaction_value, Books, Clothing, Electronics, Home Decor]
Index: []


# Compute Similarity

Using  a similarity measure such as Cosine Similarity  to calculating the similarity between customers.

In [26]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")
products = pd.read_csv("Products.csv")

# Step 1: Create customer profile
customer_profile = transactions.groupby('CustomerID').agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean')
).reset_index()

# Step 2: Create customer product preferences
customer_product_preference = transactions.merge(products, on='ProductID').pivot_table(
    index='CustomerID',
    columns='Category',
    values='Quantity',
    aggfunc='sum',
    fill_value=0
).reset_index()

# Step 3: Merge customer profile with product preferences
final_features = pd.merge(customer_profile, customer_product_preference, on='CustomerID')

# Step 4: Normalize the features
features_to_normalize = final_features.columns.difference(['CustomerID'])
scaler = StandardScaler()
final_features[features_to_normalize] = scaler.fit_transform(final_features[features_to_normalize])

# Step 5: Compute similarity using cosine similarity
similarity_matrix = cosine_similarity(final_features[features_to_normalize])
similarity_df = pd.DataFrame(similarity_matrix, index=final_features['CustomerID'], columns=final_features['CustomerID'])



# generating looklike csv for first 20 customers

In [27]:
# Step 6: Generate Lookalike recommendations
lookalikes = {}
for customer_id in final_features['CustomerID'][:20]:  # Top 20 customers
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    lookalikes[customer_id] = [(idx, score) for idx, score in similar_customers.items()]

# Step 7: Save lookalike results to a CSV file
lookalike_results = [
    {"CustomerID": cust_id, "Lookalikes": str(recommendations)}
    for cust_id, recommendations in lookalikes.items()
]
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model completed. Results saved to 'Lookalike.csv'.")


Lookalike model completed. Results saved to 'Lookalike.csv'.
