# Lookalike Model

In [16]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whichever pip is first on the shell PATH.
%pip install pandas scikit-learn




[notice] A new release of pip is available: 23.2.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Load the Data

In [17]:
import pandas as pd

# Read the three raw CSV files from the current working directory.
customers = pd.read_csv("./Customers.csv")
products = pd.read_csv("./Products.csv")
transactions = pd.read_csv("./Transactions.csv")

# Enrich each transaction row in two steps: first attach the customer's
# attributes (joined on CustomerID), then the product's (joined on ProductID).
# Both merges use pandas' default inner join.
transactions = transactions.merge(customers, on="CustomerID")
transactions = transactions.merge(products, on="ProductID")

## Feature Engineering

In [18]:
# Build one row per customer in a single chain:
#   1. summarise the transaction rows (total spend, total units, transaction count),
#   2. attach Region and SignupDate from the customer profile table,
#   3. derive SignupYear from SignupDate and discard the raw date.
customer_features = (
    transactions
    .groupby("CustomerID")
    .agg(
        total_spending=("TotalValue", "sum"),
        total_quantity=("Quantity", "sum"),
        num_transactions=("TransactionID", "count"),
    )
    .reset_index()
    .merge(customers[["CustomerID", "Region", "SignupDate"]], on="CustomerID")
    .assign(SignupYear=lambda frame: pd.to_datetime(frame["SignupDate"]).dt.year)
    .drop(columns=["SignupDate"])
)

In [19]:
from sklearn.preprocessing import StandardScaler

# Standardise every numeric feature (zero mean, unit variance) so that each one
# contributes on a comparable scale to the cosine similarity computed later.
#
# SignupYear has to be part of this list: it is a raw calendar year, so its
# magnitude (in the thousands) would otherwise dwarf the standardised columns
# and the 0/1 Region indicators. Every customer vector would then point in
# almost the same direction and all pairwise cosine similarities would be ~1.0.
numerical_cols = ["total_spending", "total_quantity", "num_transactions", "SignupYear"]
scaler = StandardScaler()
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

In [20]:
# One-hot encode Region (dropping the first level). If the column is absent --
# presumably because this cell was already run and Region has been replaced by
# its indicator columns -- report that and leave the frame untouched.
if "Region" not in customer_features.columns:
    print("Region column not found. Skipping one-hot encoding.")
else:
    customer_features = pd.get_dummies(customer_features, columns=["Region"], drop_first=True)


## Calculate Similarity

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Feature matrix: every column of customer_features except the identifier.
features = customer_features.drop("CustomerID", axis=1)

# Pairwise cosine similarity between the customers' feature vectors.
similarity_matrix = cosine_similarity(features)

# Label both axes with CustomerID so a score can be looked up by customer ID.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)

## Generate Lookalikes

In [22]:
# How many customers (taken from the top of the customers table) to score, and
# how many nearest neighbours to keep for each of them.
NUM_TARGET_CUSTOMERS = 20
NUM_LOOKALIKES = 3

# Target customers: the first NUM_TARGET_CUSTOMERS IDs in file order.
target_customers = customers["CustomerID"].iloc[:NUM_TARGET_CUSTOMERS]

# Generate lookalike recommendations as {cust_id: [(lookalike_id, score), ...]}.
lookalikes = {}
for cust_id in target_customers:
    if cust_id not in similarity_df.columns:
        # similarity_df only covers customers that have a row in
        # customer_features (a customer with no transactions has none).
        # Record an empty result instead of failing with a KeyError.
        lookalikes[cust_id] = []
        continue

    # Remove the customer's own entry by label rather than assuming it sorts
    # first: ties or floating-point error can rank another customer ahead of
    # the self-similarity, which would leak cust_id into its own lookalikes.
    scores = similarity_df[cust_id].drop(labels=cust_id)
    top_matches = scores.nlargest(NUM_LOOKALIKES)

    # Cast to a built-in float so the stringified list contains plain numbers
    # (e.g. 0.9876) instead of NumPy scalar reprs such as np.float64(0.9876).
    lookalikes[cust_id] = [
        (sim_cust, round(float(score), 4)) for sim_cust, score in top_matches.items()
    ]

# One row per target customer; the list of (id, score) pairs is stored as its
# string form in the "lookalikes" column of Lookalike.csv.
lookalike_df = pd.DataFrame([
    {"cust_id": cust_id, "lookalikes": str(matches)}
    for cust_id, matches in lookalikes.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)