### Importing the libraries

In [51]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity


### Load the datasets


In [54]:
# Read the three input tables from the working directory, in the same order
# as before: customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


### Feature Engineering



### Aggregate transaction-level data into customer-level data


In [109]:
def _most_frequent(values):
    """Return the first mode of ``values``, or None when there is no mode."""
    modes = values.mode()
    return None if modes.empty else modes[0]


# Collapse the transaction rows into one row per CustomerID:
#   PurchaseFrequency    - count of TransactionID values
#   TotalSpend           - sum of TotalValue
#   MostPurchasedProduct - first mode of ProductID (None if no mode exists)
customer_features = (
    transactions.groupby("CustomerID")
    .agg(
        PurchaseFrequency=("TransactionID", "count"),
        TotalSpend=("TotalValue", "sum"),
        MostPurchasedProduct=("ProductID", _most_frequent),
    )
    .reset_index()
)

### Merge with customer demographic data


In [61]:
# Left join keeps every aggregated customer row and attaches the matching
# columns from the customers table by CustomerID.
customer_features = customer_features.merge(
    customers,
    how="left",
    on="CustomerID",
)




### One-hot encode the 'Region' column



In [64]:
# One-hot encode the Region column; the encoder returns a sparse matrix, which
# is densified before being wrapped in a DataFrame.
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customer_features[["Region"]]).toarray()
region_encoded_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(["Region"]),
    # Reuse the source frame's index so the later column-wise concat lines up
    # row-for-row even if customer_features does not carry a default RangeIndex.
    index=customer_features.index,
)


### Concatenate with original features

In [67]:
# Append the one-hot Region columns, then discard the raw Region column and the
# CustomerName / SignupDate columns.
customer_features = pd.concat(
    [customer_features, region_encoded_df],
    axis=1,
).drop(columns=["Region", "CustomerName", "SignupDate"])

### Normalize numerical columns

In [70]:
# Min-max scale the two numeric columns; the scaled values are kept in their
# own frame under "Norm"-prefixed column names.
scaler = MinMaxScaler()
numeric_inputs = customer_features[["PurchaseFrequency", "TotalSpend"]]
normalized_features = scaler.fit(numeric_inputs).transform(numeric_inputs)
normalized_features_df = pd.DataFrame(
    data=normalized_features,
    columns=["NormPurchaseFrequency", "NormTotalSpend"],
)


### Add normalized columns to the feature set

In [73]:
# Reset the index first so rows pair up by position with the freshly built
# (default-indexed) normalized frame.
customer_features = pd.concat(
    objs=[
        customer_features.reset_index(drop=True),
        normalized_features_df,
    ],
    axis=1,
)



### Computing Similarities


### Selecting only numerical features for similarity computation


In [76]:
# Leave out the identifier, the unscaled numeric columns, and the product
# column; the remaining columns are what the similarity step consumes.
excluded_columns = [
    "CustomerID",
    "PurchaseFrequency",
    "TotalSpend",
    "MostPurchasedProduct",
]
similarity_data = customer_features.drop(columns=excluded_columns)

### Compute cosine similarity


In [80]:
# Pairwise cosine similarity between every pair of rows in similarity_data.
similarity_matrix = cosine_similarity(X=similarity_data)

### Convert the similarity matrix to a DataFrame for easier interpretation


In [83]:
# Label both axes with CustomerID so a customer's scores can be looked up by ID.
customer_ids = customer_features["CustomerID"]
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)



### Generate Lookalike Recommendations


In [86]:
# Maps a target CustomerID to its list of (lookalike CustomerID, score) pairs.
lookalike_map = dict()

### Find top 3 most similar customers for the first 20 customers


In [111]:
# For each of the first 20 rows of customer_features, keep the three other
# customers with the highest cosine similarity.
for customer_id in customer_features["CustomerID"][:20]:
    # Remove the customer's own entry by label instead of skipping position 0:
    # an equally similar customer (score tied at ~1.0, or ahead by float
    # rounding) can sort in front of the self-match, which would put the
    # customer in its own lookalike list and drop a genuine lookalike.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .sort_values(ascending=False)
        .head(3)
    )
    lookalike_map[customer_id] = list(similar_customers.items())



### Preparing Lookalike.csv


In [92]:
# Flatten the map into one record per (target, lookalike) pair, then write the
# records to Lookalike.csv without the DataFrame index.
lookalike_rows = [
    {
        "TargetCustomerID": target_id,
        "LookalikeCustomerID": match_id,
        "SimilarityScore": match_score,
    }
    for target_id, matches in lookalike_map.items()
    for match_id, match_score in matches
]

lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv("Lookalike.csv", index=False)

