In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [5]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default (inner) merge on the shared key column.
transactions_with_customers = transactions.merge(customers, on="CustomerID")
merged_data = transactions_with_customers.merge(products, on="ProductID")

In [9]:
# Collapse the merged transactions to one row per customer:
# total spend, total units bought, and mean value per transaction row.
customer_features = (
    merged_data
    .groupby('CustomerID', as_index=False)
    .agg(
        TotalSpending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        TotalQuantity=pd.NamedAgg(column='Quantity', aggfunc='sum'),
        AvgOrderValue=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    )
)

In [15]:
# One-hot encode each customer's Region into a dense matrix
# (`sparse_output` is the newer spelling of the old `sparse` argument).
region_encoder = OneHotEncoder(sparse_output=False)
region_matrix = region_encoder.fit_transform(customers[['Region']])
region_column_names = region_encoder.get_feature_names_out(['Region'])

region_encoded = pd.DataFrame(region_matrix, columns=region_column_names)
# Rows are in the same order as `customers`, so the IDs line up by index.
region_encoded['CustomerID'] = customers['CustomerID']

In [17]:
# Join the one-hot region columns onto the per-customer aggregates.
# Region columns left over from an earlier run of this cell are dropped first:
# without that, re-running the cell would merge them a second time and create
# duplicated `_x`/`_y` columns that double-weight region in the similarity step.
stale_region_columns = region_encoded.columns.difference(['CustomerID'])
customer_features = (
    customer_features
    .drop(columns=stale_region_columns, errors='ignore')
    .merge(region_encoded, on='CustomerID')
)

In [19]:
# Standardise the spend/quantity columns so they are on a comparable scale
# before being combined with the 0/1 region indicators.
numerical_features = ['TotalSpending', 'TotalQuantity', 'AvgOrderValue']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values

In [21]:
# Separate the identifier column from the numeric feature columns.
customer_ids = customer_features['CustomerID']
feature_matrix = customer_features.drop(columns='CustomerID')

# Pairwise cosine similarity between customers, labelled by CustomerID
# on both the rows and the columns.
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [23]:
def get_top_3_similar(customer_id, similarity_df):
    """Return the three customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Label of the customer to look up; must be a column of
        ``similarity_df``.
    similarity_df : pandas.DataFrame
        Similarity table whose columns and index are customer IDs.

    Returns
    -------
    list of tuple
        Up to three ``(other_customer_id, score)`` pairs, highest score
        first. The looked-up customer itself is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    scores = similarity_df[customer_id]
    # Remove the self-match by label instead of assuming it sorts first:
    # another customer can tie at 1.0 (identical features) or even edge ahead
    # through floating-point rounding, in which case skipping position 0
    # would return the customer as its own lookalike.
    other_scores = scores.drop(labels=customer_id, errors='ignore')
    top_matches = other_scores.sort_values(ascending=False).iloc[:3]
    return list(zip(top_matches.index, top_matches.values))

# Collect one [customer, lookalike, score] row per recommended match.
lookalike_results = []
for customer_id in customer_ids:
    lookalike_results.extend(
        [customer_id, match_id, match_score]
        for match_id, match_score in get_top_3_similar(customer_id, similarity_df)
    )

In [25]:
# Tabulate the recommendations and export them without the row index.
result_columns = ['CustomerID', 'SimilarCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(lookalike_results, columns=result_columns)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike recommendations saved to 'Lookalike.csv'.")

Lookalike recommendations saved to 'Lookalike.csv'.


In [27]:
# Read the exported CSV back from disk into a separate DataFrame.
Lookalike = pd.read_csv("Lookalike.csv")

In [29]:
# Preview the first rows of the in-memory recommendations table.
print(lookalike_df.head())

  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0137         0.987783
1      C0001             C0107         0.964679
2      C0001             C0152         0.955025
3      C0002             C0088         0.991699
4      C0002             C0142         0.975260


In [54]:
# Preview the last rows of the in-memory recommendations table.
print(lookalike_df.tail())

    CustomerID SimilarCustomerID  SimilarityScore
592      C0199             C0103         0.968654
593      C0199             C0172         0.954687
594      C0200             C0138         0.977982
595      C0200             C0022         0.913937
596      C0200             C0143         0.879663


In [56]:
# Print the first 596 rows; pandas abbreviates the middle of long frames.
print(lookalike_df.head(596))

    CustomerID SimilarCustomerID  SimilarityScore
0        C0001             C0137         0.987783
1        C0001             C0107         0.964679
2        C0001             C0152         0.955025
3        C0002             C0088         0.991699
4        C0002             C0142         0.975260
..         ...               ...              ...
591      C0199             C0062         0.981794
592      C0199             C0103         0.968654
593      C0199             C0172         0.954687
594      C0200             C0138         0.977982
595      C0200             C0022         0.913937

[596 rows x 3 columns]


In [58]:
# Write the `Lookalike` frame to Lookalike.csv without the row index.
# NOTE(review): `Lookalike` was itself read from this same path, so this looks
# like a redundant round trip — confirm whether this cell is still needed.
Lookalike.to_csv("Lookalike.csv", index=False) 

In [62]:
# Display a link to the exported CSV in the notebook output.
from IPython.display import FileLink
FileLink("Lookalike.csv")