In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Input CSV locations (resolved relative to the working directory).
transactions_path = 'Transactions.csv'
products_path = 'Products.csv'
customers_path = 'Customers.csv'

# Load the three tables in one pass.
transactions_df, products_df, customers_df = (
    pd.read_csv(csv_path)
    for csv_path in (transactions_path, products_path, customers_path)
)

# Give the product price its own name before merging so it cannot collide with
# a same-named column on the transaction side (presumably Transactions.csv
# also carries a 'Price' column -- confirm against the data).
products_df = products_df.rename(columns={'Price': 'ProductPrice'})

# Left-join product and customer attributes onto every transaction row.
merged_df = (
    transactions_df
    .merge(products_df, on='ProductID', how='left')
    .merge(customers_df, on='CustomerID', how='left')
)

# Revenue per transaction line = quantity bought x product list price.
merged_df['Revenue'] = merged_df['Quantity'] * merged_df['ProductPrice']

# Customer x product matrix of summed revenue; combinations with no purchase
# are filled with 0.
customer_product_matrix = pd.pivot_table(
    merged_df,
    index='CustomerID',
    columns='ProductID',
    values='Revenue',
    aggfunc='sum',
    fill_value=0,
)

# Standardise each product column before comparing customers.
scaler = StandardScaler()
normalized_matrix = scaler.fit_transform(customer_product_matrix)

# Pairwise cosine similarity between the standardised customer rows.
similarity_matrix = cosine_similarity(normalized_matrix)

# Label both axes with the customer IDs so scores can be looked up by ID.
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index,
)

# Function to get top N similar customers for a given customer ID
def get_top_n_similar(customers_similarity, customer_id, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Args:
        customers_similarity: Square DataFrame of pairwise similarity scores
            whose index and columns are both customer IDs.
        customer_id: ID of the customer to find lookalikes for; must be a
            column of ``customers_similarity``.
        n: Maximum number of lookalikes to return. Values below 1 yield an
            empty list.

    Returns:
        A list of ``(customer_id, score)`` tuples ordered from most to least
        similar, with each score rounded to two decimals. The queried customer
        is never part of the result.

    Raises:
        KeyError: If ``customer_id`` is not a column of
            ``customers_similarity``.
    """
    scores = customers_similarity[customer_id]

    # Remove the customer's own entry by label. Relying on it being ranked
    # first breaks when the self-score is not the maximum (e.g. an all-zero
    # feature row) or when another customer ties with it.
    others = scores.drop(labels=customer_id, errors='ignore')

    # A stable sort keeps tied customers in their original index order, so the
    # output is reproducible across runs.
    ranked = others.sort_values(ascending=False, kind='mergesort')
    top = ranked.iloc[:max(n, 0)]

    return [(cust_id, round(score, 2)) for cust_id, score in top.items()]

# Lookalike recommendations for the first 20 customers.
# similarity_df only contains customers that appear in the transaction pivot,
# so a customer without any transaction has no column there. Skip those
# customers instead of aborting the whole export with a KeyError.
lookalike_recommendations = {
    customer_id: get_top_n_similar(similarity_df, customer_id)
    for customer_id in customers_df['CustomerID'][:20]
    if customer_id in similarity_df.columns
}

# Flatten to one row per (customer, recommended customer) pair for Lookalike.csv.
lookalike_data = [
    {'customer_id': cust_id, 'recommended_customer_id': rec_cust_id, 'score': score}
    for cust_id, recommendations in lookalike_recommendations.items()
    for rec_cust_id, score in recommendations
]

# Explicit columns keep the CSV header intact even if no rows were produced.
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['customer_id', 'recommended_customer_id', 'score'],
)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Top 3 Lookalikes with Similarity Scores for First 20 Customers:")
# 20 customers x 3 recommendations each = at most 60 rows.
print(lookalike_df.head(60))

Top 3 Lookalikes with Similarity Scores for First 20 Customers:
   customer_id recommended_customer_id  score
0        C0001                   C0194   0.40
1        C0001                   C0104   0.37
2        C0001                   C0020   0.37
3        C0002                   C0030   0.40
4        C0002                   C0091   0.38
5        C0002                   C0071   0.32
6        C0003                   C0181   0.48
7        C0003                   C0134   0.47
8        C0003                   C0144   0.42
9        C0004                   C0070   0.35
10       C0004                   C0175   0.32
11       C0004                   C0132   0.28
12       C0005                   C0096   0.49
13       C0005                   C0023   0.47
14       C0005                   C0055   0.38
15       C0006                   C0040   0.49
16       C0006                   C0178   0.40
17       C0006                   C0058   0.31
18       C0007                   C0079   0.62
19       C0007  