## Preparing a Lookalike Model 

#### Loading the datasets

In [None]:
import pandas as pd
# Locations of the three input CSV files, relative to the working directory.
customers_file = './customers.csv'
products_file = './products.csv'
transactions_file = './transaction.csv'

# Read every file into its own DataFrame, in the same order as the paths above.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (customers_file, products_file, transactions_file)
)

# Preview the first rows of each dataset (shown as a tuple of three frames).
customers.head(), products.head(), transactions.head()


#### Preprocessing data and merge datasets

In [None]:
# Parse the date columns with their explicit formats. Because of
# errors="coerce", any value that does not match the format becomes NaT
# instead of raising.
customers["SignupDate"] = pd.to_datetime(
    customers["SignupDate"], format="%d-%m-%Y", errors="coerce"
)
transactions["TransactionDate"] = pd.to_datetime(
    transactions["TransactionDate"], format="%d-%m-%Y %H:%M", errors="coerce"
)

# Left joins keep every transaction row even when the product or customer
# lookup finds no match; the unmatched columns are then NaN.
transactions_products = transactions.merge(products, on="ProductID", how="left")
full_data = transactions_products.merge(customers, on="CustomerID", how="left")

# Preview the merged dataset.
full_data.head()


#### Feature engineering

Feature engineering is the process of transforming raw data into meaningful features that improve the performance of machine learning models. It involves selecting, modifying, or creating new features from existing data to make patterns and insights more accessible to algorithms. This process is critical for ensuring that the model can effectively learn from the data.

In [None]:
# Per-customer spending profile: total spend, number of transactions and
# average transaction value.
customer_spending = (
    full_data.groupby("CustomerID")
    .agg(
        total_spent=("TotalValue", "sum"),
        num_transactions=("TransactionID", "count"),
        avg_transaction_value=("TotalValue", "mean"),
    )
    .reset_index()
)

# Preferred product categories: share of each customer's purchases that falls
# in each category (normalize="index" normalizes within each customer row).
category_preferences = pd.crosstab(
    full_data["CustomerID"], full_data["Category"], normalize="index"
)

# Merging spending and category preferences into a single dataset.
customer_features = pd.merge(
    customer_spending, category_preferences, on="CustomerID", how="left"
)

# The crosstab skips transactions whose Category is missing (possible after
# the left join on products). A customer with only such transactions has no
# crosstab row, so the left merge above leaves NaN in the category columns.
# Treat that as a zero share so downstream scaling/similarity never sees NaN.
customer_features = customer_features.fillna(
    {category: 0.0 for category in category_preferences.columns}
)

# Displaying the processed customer features
customer_features.head()


#### Why MinMaxScaler?

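MinMaxScaler is chosen because it rescales every feature to a fixed range ([0, 1] by default), so the spending totals and the category proportions contribute on comparable scales when cosine similarity is computed. Other approaches have drawbacks for this specific task: standardization introduces negative values, and leaving the features unscaled lets large-magnitude columns such as total spend dominate the similarity. Note that MinMaxScaler is itself sensitive to outliers, because a single extreme value stretches a feature's range and compresses all other values.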

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Rescale every feature column; the first column (CustomerID) is an
# identifier, not a feature, so it is left out.
feature_values = customer_features.iloc[:, 1:]
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(feature_values)

# Pairwise cosine similarity between all customers (square matrix).
similarity_matrix = cosine_similarity(normalized_features)

# Label both axes with CustomerID so similarities can be looked up by ID.
customer_ids = customer_features["CustomerID"]
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


In [None]:
# Get the top 3 similar customers for each of the first 20 customers.
top_20_customers = customer_features["CustomerID"][:20]
lookalike_results = {}
for cust_id in top_20_customers:
    similar_customers = (
        similarity_df[cust_id]
        # Remove the customer itself by label. Skipping the first sorted
        # position is not safe: with ties (another customer at similarity 1.0)
        # or rounding, the customer is not guaranteed to sort first.
        .drop(labels=cust_id)
        .sort_values(ascending=False)
        .head(3)
    )
    # Store plain Python floats so that str() of the list below renders bare
    # numbers rather than "np.float64(...)" under NumPy >= 2.
    lookalike_results[cust_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

# Convert the results to a two-column table ready for CSV export: the customer
# and the string form of its [(lookalike_id, score), ...] list.
lookalike_output = pd.DataFrame({
    "CustomerID": list(lookalike_results),
    "Top_Lookalikes": [str(matches) for matches in lookalike_results.values()],
})
lookalike_output.head()


#### Calculating the average similarity 

In [None]:
# Mean similarity score of the 3 best lookalikes, per customer.
average_similarity = []
for cust_id in top_20_customers:
    similar_customers = (
        similarity_df[cust_id]
        # Exclude the customer itself by label; relying on it being the first
        # entry after sorting breaks when another customer ties at 1.0.
        .drop(labels=cust_id)
        .sort_values(ascending=False)
        .head(3)
    )
    average_similarity.append(similar_customers.mean())

# Display the overall average similarity
overall_average_similarity = np.mean(average_similarity)
print(f"Overall Average Similarity for Top 3 Recommendations: {overall_average_similarity:.2f}")


#### Saving the output in a new csv file

In [None]:
# Write the lookalike table to disk; index=False keeps the DataFrame's row
# index out of the file.
output_file = "Fayaz_Hussain_Lookalike.csv"
lookalike_output.to_csv(path_or_buf=output_file, index=False)

# Confirm where the results were written.
print(f"Lookalike results saved to {output_file}")