In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
#Load datasets
# Read the three raw CSV files (customers, products, transactions) into
# DataFrames, in that order, using pandas' default parsing options.
customers_df, products_df, transactions_df = map(
    pd.read_csv,
    (
        '/content/Customers.csv',
        '/content/Products.csv',
        '/content/Transactions.csv',
    ),
)

In [None]:
#Merge datasets
# Attach customer attributes to every transaction, then attach product
# attributes to the result. Both joins are inner joins, so a transaction is
# kept only if its CustomerID and its ProductID both have a match.
merged_df = pd.merge(
    pd.merge(transactions_df, customers_df, on="CustomerID", how="inner"),
    products_df,
    on="ProductID",
    how="inner",
)

In [None]:
#Feature Engineering: Build Customer Profiles
# Reference point for recency: the most recent transaction in the data.
# It is computed once, outside the per-customer aggregation, so every customer
# is measured against the same instant and the result does not depend on the
# wall-clock time at which the notebook happens to be run.
snapshot_date = pd.to_datetime(merged_df["TransactionDate"]).max()

# One row per CustomerID:
#   total_spent / avg_transaction_value - sum and mean of TotalValue
#   total_transactions                  - number of distinct TransactionIDs
#   recency                             - whole days between snapshot_date and
#                                         the customer's latest transaction
#   frequency                           - count of non-null TransactionDate values
#   categories_bought / products_bought - comma-joined distinct Category /
#                                         ProductID values (split again later
#                                         for one-hot encoding)
#   region                              - first Region value seen for the customer
customer_profiles = merged_df.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    total_transactions=("TransactionID", "nunique"),
    avg_transaction_value=("TotalValue", "mean"),
    recency=("TransactionDate", lambda x: (snapshot_date - pd.to_datetime(x).max()).days),
    frequency=("TransactionDate", "count"),
    categories_bought=("Category", lambda x: ','.join(x.unique())),
    region=("Region", "first"),
    products_bought=("ProductID", lambda x: ','.join(x.unique()))
).reset_index()  # turn CustomerID back into a column with a 0..n-1 row index


In [None]:
#One-hot encode categories, products, and region for similarity calculation
def _multi_hot(joined_values):
    """Turn a Series of comma-joined tokens into one indicator column per token.

    Each string is split on ',', exploded to one row per token (keeping the
    original row index), one-hot encoded, and then summed back per original
    row index so the result has one row per input row.
    """
    tokens = joined_values.str.split(',').explode()
    return pd.get_dummies(tokens).groupby(level=0).sum()


categories_encoded = _multi_hot(customer_profiles['categories_bought'])
products_encoded = _multi_hot(customer_profiles['products_bought'])
# Region holds a single value per customer, so a plain one-hot encoding is enough.
region_encoded = pd.get_dummies(customer_profiles['region'])

# Combine features into a single dataframe: numeric profile columns first,
# followed by the category, product and region indicator columns.
numeric_columns = [
    'total_spent',
    'total_transactions',
    'avg_transaction_value',
    'recency',
    'frequency',
]
feature_blocks = [
    customer_profiles[numeric_columns],
    categories_encoded,
    products_encoded,
    region_encoded,
]
customer_features = pd.concat(feature_blocks, axis=1)

In [None]:
#Normalize the features for similarity computation
# Fit the scaler on the full feature table, then apply it to that same table,
# so every feature column is standardized before similarities are computed.
scaler = StandardScaler()
scaler.fit(customer_features)
normalized_features = scaler.transform(customer_features)

# Compute similarity matrix: pairwise cosine similarity between all rows of the
# normalized features; row/column i corresponds to row i of customer_features.
similarity_matrix = cosine_similarity(normalized_features)

In [None]:
#Generate Lookalike recommendations for the first 20 customers
N_TARGET_CUSTOMERS = 20   # customers C0001..C0020 receive recommendations
TOP_K = 3                 # number of lookalikes reported per customer

target_ids = [f'C{str(i).zfill(4)}' for i in range(1, N_TARGET_CUSTOMERS + 1)]
first_20_customers = customer_profiles[customer_profiles['CustomerID'].isin(target_ids)]

# CustomerIDs in row order. Positions in this array are used to index
# similarity_matrix, so lookups are positional and do not depend on the
# DataFrame's index labels.
all_customer_ids = customer_profiles['CustomerID'].to_numpy()

# One record per (customer, recommended customer) pair.
lookalike_map = []

for customer_id in first_20_customers['CustomerID']:
    #Get the position of the current customer in the similarity matrix
    customer_idx = np.flatnonzero(all_customer_ids == customer_id)[0]
    similarity_scores = similarity_matrix[customer_idx]

    #Get top TOP_K similar customers (excluding the input customer itself).
    # The customer is removed by position rather than by assuming it is ranked
    # first: another customer with an equal score could otherwise take the
    # first slot and the customer would be recommended to itself.
    ranked_idx = np.argsort(similarity_scores)[::-1]
    similar_customers_idx = ranked_idx[ranked_idx != customer_idx][:TOP_K]

    #Add to lookalike map
    # IDs and scores are read straight from the arrays; no column is assigned
    # on a DataFrame slice, so pandas emits no SettingWithCopyWarning.
    for neighbour_idx in similar_customers_idx:
        lookalike_map.append({
            'cust_id': customer_id,
            'recommended_cust_id': all_customer_ids[neighbour_idx],
            'score': similarity_scores[neighbour_idx]
        })


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['similarity_score'] = similarity_scores[similar_customers_idx]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['similarity_score'] = similarity_scores[similar_customers_idx]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['similarity_score'] = sim

In [None]:
#Convert lookalike map to DataFrame
# Each collected record becomes one row; the dict keys become the columns.
lookalike_df = pd.DataFrame(data=lookalike_map)
print(lookalike_df)

#Save to CSV
# index=False keeps the positional row index out of the written file.
output_path = 'Aditya_Patil_Lookalike.csv'
lookalike_df.to_csv(path_or_buf=output_path, index=False)

# Blank line followed by the confirmation message.
print("", f"Lookalike recommendations saved to {output_path}", sep="\n")


   cust_id recommended_cust_id     score
0    C0001               C0190  0.311246
1    C0001               C0154  0.254891
2    C0001               C0104  0.234517
3    C0002               C0109  0.360432
4    C0002               C0134  0.306205
5    C0002               C0071  0.296128
6    C0003               C0181  0.394981
7    C0003               C0134  0.353398
8    C0003               C0144  0.345211
9    C0004               C0053  0.337378
10   C0004               C0075  0.290650
11   C0004               C0065  0.267472
12   C0005               C0096  0.368890
13   C0005               C0119  0.287815
14   C0005               C0149  0.259673
15   C0006               C0171  0.457663
16   C0006               C0196  0.293947
17   C0006               C0058  0.258782
18   C0007               C0140  0.534186
19   C0007               C0020  0.435014
20   C0007               C0031  0.291241
21   C0008               C0091  0.288453
22   C0008               C0002  0.257701
23   C0008      