In [1]:
#importing the necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

## Loading the data

In [2]:
# Load the three raw tables from CSV files.
customers_df = pd.read_csv('/content/Customers.csv')
products_df = pd.read_csv('/content/Products.csv')
transactions_df = pd.read_csv('/content/Transactions.csv')

# Attach customer and product attributes to every transaction (inner joins on
# the shared keys CustomerID and ProductID).
# validate='many_to_one' makes the merge fail loudly if a key is duplicated in
# one of the lookup tables, instead of silently duplicating transaction rows
# and inflating the per-customer aggregates computed later.
# Note: both Transactions and Products carry a 'Price' column, so the merged
# frame holds 'Price_x' (from transactions) and 'Price_y' (from products).
merged_data = (
    transactions_df
    .merge(customers_df, on='CustomerID', validate='many_to_one')
    .merge(products_df, on='ProductID', validate='many_to_one')
)

In [3]:
# Inspect the column dtypes of the customers table as loaded from CSV.
customers_df.dtypes

Unnamed: 0,0
CustomerID,object
CustomerName,object
Region,object
SignupDate,object


In [4]:
# Inspect the column dtypes of the products table as loaded from CSV.
products_df.dtypes

Unnamed: 0,0
ProductID,object
ProductName,object
Category,object
Price,float64


In [5]:
# Inspect the column dtypes of the transactions table as loaded from CSV.
transactions_df.dtypes

Unnamed: 0,0
TransactionID,object
CustomerID,object
ProductID,object
TransactionDate,object
Quantity,int64
TotalValue,float64
Price,float64


## Feature Engineering

We are aggregating the transaction data for each customer:
- TotalTransactionValue: Sum of transaction values
- TransactionCount: Count of transactions


In [6]:
# Per-customer spending summary built from the merged transaction table:
#   TotalTransactionValue - sum of 'TotalValue' over the customer's transactions
#   TransactionCount      - number of 'TransactionID' entries for the customer
# as_index=False keeps 'CustomerID' as a regular column on a fresh integer index.
customer_features = merged_data.groupby('CustomerID', as_index=False).agg(
    TotalTransactionValue=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    TransactionCount=pd.NamedAgg(column='TransactionID', aggfunc='count'),
)

print(customer_features)

    CustomerID  TotalTransactionValue  TransactionCount
0        C0001                3354.52                 5
1        C0002                1862.74                 4
2        C0003                2725.38                 4
3        C0004                5354.88                 8
4        C0005                2034.24                 3
..         ...                    ...               ...
194      C0196                4982.88                 4
195      C0197                1928.65                 3
196      C0198                 931.83                 2
197      C0199                1979.28                 4
198      C0200                4758.60                 5

[199 rows x 3 columns]


## One-Hot Encoding

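We want to transform categorical variables (like Category and Region) into a numerical format by creating a binary column for each unique category, where 1 indicates the presence of a category and 0 its absence. For product categories, these indicators are later summed per customer, so each category column ends up holding the number of purchases the customer made in that category.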

In [7]:
# Indicator columns for the product category of each purchase
# (one row per row of merged_data, still keyed by CustomerID).
purchase_categories = merged_data.loc[:, ['CustomerID', 'Category']]
category_features = pd.get_dummies(purchase_categories, columns=['Category'])

# Indicator columns for the region recorded for each customer
# (one row per row of customers_df).
customer_regions = customers_df.loc[:, ['CustomerID', 'Region']]
region_features = pd.get_dummies(customer_regions, columns=['Region'])

In [8]:
# Assemble the per-customer feature table in two joins on CustomerID.

# Step 1: collapse the per-purchase category indicators into per-customer totals.
category_totals = category_features.groupby('CustomerID').sum()

# Step 2: spending aggregates + category totals, then add the region indicators.
spend_and_categories = customer_features.merge(category_totals, on='CustomerID')
final_features = spend_and_categories.merge(region_features, on='CustomerID')

## Normalization

In [9]:
# Standardise every feature column (mean = 0, standard deviation = 1).
# 'CustomerID' is moved into the index first so that the scaled values, the
# column labels and the row labels all come from the same frame. This keeps
# the labels correct wherever 'CustomerID' sits among the columns, rather than
# assuming it is the first column.
feature_matrix = final_features.set_index('CustomerID')
scaler = StandardScaler()
normalized_features = pd.DataFrame(
    scaler.fit_transform(feature_matrix),
    columns=feature_matrix.columns,  # feature names, 'CustomerID' excluded
    index=feature_matrix.index,      # rows labelled by 'CustomerID'
)

## Cosine Similarity Calculation

In [10]:
# Pairwise cosine similarity between the customers' standardized feature vectors.
similarity_matrix = cosine_similarity(normalized_features)

# Label both axes with CustomerID so that scores can be looked up by customer.
customer_ids = final_features['CustomerID']
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

## Recommendation Generation

In [11]:
# Recommend the top 3 lookalikes for the first 20 customers.
# Result: {CustomerID: [(LookalikeID, similarity score), ...]} with scores in
# descending order.
lookalike_results = {}
for customer_id in customers_df['CustomerID'][:20]:  # first 20 customers in the customers table
    if customer_id not in similarity_df.columns:
        # similarity_df only covers customers that appear in the merged
        # transaction data; a customer without transactions has no feature
        # vector, so record an empty list instead of raising KeyError.
        lookalike_results[customer_id] = []
        continue

    top_3 = (
        similarity_df[customer_id]   # similarity of every customer to the current one
        .drop(labels=customer_id)    # exclude the customer by label: with tied scores
                                     # the self-match is not guaranteed to sort first
        .nlargest(3)                 # three highest scores, descending
    )
    # tolist() yields plain Python floats rather than NumPy scalars, which keeps
    # the later string/CSV representation free of NumPy-specific reprs.
    lookalike_results[customer_id] = list(zip(top_3.index, top_3.tolist()))

In [13]:
# Build the export table: one row per customer, with the lookalikes serialised
# as a string of the form [(LookalikeID, Score), ...].
# Scores are cast to built-in floats before str(): with NumPy >= 2 the repr of
# a NumPy scalar is 'np.float64(...)', which would otherwise leak into the CSV.
lookalike_df = pd.DataFrame(
    [
        {
            'CustomerID': cust_id,
            'Lookalikes': str(
                [(lookalike_id, float(score)) for lookalike_id, score in lookalikes]
            ),
        }
        for cust_id, lookalikes in lookalike_results.items()
    ],
    # Explicit columns so the CSV still gets its header if there are no results.
    columns=['CustomerID', 'Lookalikes'],
)
# Save the results as a CSV file (no index column).
lookalike_df.to_csv('AryanSai_Arvapelly_Lookalike.csv', index=False)

print("Lookalike.csv has been generated!")

Lookalike.csv has been generated!
