In [3]:

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three raw CSV exports (paths point at the Colab /content directory).
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Collapse the transaction log to one row per customer:
#   TotalValue    -> summed spend
#   TransactionID -> number of transactions (the column keeps its original
#                    name even though it now holds a count)
transaction_summary = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        TransactionID=('TransactionID', 'count'),
    )
    .reset_index()
)

# Attach each customer's Region. The default inner join keeps only customers
# that appear in both the summary and the customer table.
customer_profile = customers.loc[:, ['CustomerID', 'Region']]
customer_data = transaction_summary.merge(customer_profile, on='CustomerID')

# One-hot encode Region.
# NOTE(review): drop_first=True drops the first region level, so customers in
# that region get all-zero dummy columns and the region adds nothing to their
# cosine similarity -- confirm this asymmetry is intended.
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# Put the two numeric features on a common scale (zero mean, unit variance)
# so that spend does not dominate the transaction count.
numeric_cols = ['TotalValue', 'TransactionID']
scaler = StandardScaler()
customer_data[numeric_cols] = scaler.fit_transform(customer_data[numeric_cols])

# Pairwise cosine similarity over every feature column; CustomerID is only an
# identifier and is excluded. Row/column i corresponds to row i of customer_data.
feature_matrix = customer_data.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(feature_matrix)

# Function to get top 3 similar customers for a given customer ID
def get_top_3_similar(customers_df, similarity_matrix, customer_id):
    """Return the three customers most similar to ``customer_id``.

    Args:
        customers_df: DataFrame with a ``CustomerID`` column. Row *position*
            ``i`` must correspond to row/column ``i`` of ``similarity_matrix``;
            the DataFrame's index labels are not used.
        similarity_matrix: Square, row-indexable matrix of pairwise similarity
            scores (e.g. the array returned by ``cosine_similarity``).
        customer_id: Identifier of the customer to find lookalikes for.

    Returns:
        A list of up to three ``(CustomerID, score)`` tuples ordered from the
        highest to the lowest score. Rows whose ``CustomerID`` equals
        ``customer_id`` are never returned. Ties keep their original row order.

    Raises:
        IndexError: If ``customer_id`` does not occur in ``customers_df``.
    """
    customer_ids = customers_df['CustomerID'].to_numpy()
    is_target = (customers_df['CustomerID'] == customer_id).to_numpy()

    target_positions = np.flatnonzero(is_target)
    if target_positions.size == 0:
        raise IndexError(
            f"CustomerID {customer_id!r} not found in customers_df"
        )

    # Use the row *position* (not the index label) so the lookup stays aligned
    # with the similarity matrix even if the frame was filtered or re-indexed.
    sim_scores = np.asarray(similarity_matrix[target_positions[0]])

    # Every row that is not the target customer is a candidate.
    candidate_positions = np.flatnonzero(~is_target)

    # A stable sort on the negated scores ranks high-to-low while keeping
    # equal scores in their original row order.
    ranking = np.argsort(-sim_scores[candidate_positions], kind='stable')[:3]
    top_positions = candidate_positions[ranking]

    return [(customer_ids[pos], sim_scores[pos]) for pos in top_positions]

# Build lookalike recommendations for the first 20 customers (C0001 .. C0020).
# The IDs are generated rather than hand-typed so the range is easy to change.
target_customer_ids = [f'C{number:04d}' for number in range(1, 21)]

lookalike_recommendations = []
for customer_id in target_customer_ids:
    similar_customers = get_top_3_similar(customer_data, similarity_matrix, customer_id)
    lookalike_recommendations.append({
        'CustomerID': customer_id,
        # Store plain Python floats: the list is stringified when written to
        # CSV, and NumPy >= 2 renders NumPy scalars as "np.float64(...)".
        'Lookalikes_Score': [
            (lookalike_id, float(score)) for lookalike_id, score in similar_customers
        ]
    })

# One row per target customer; 'Lookalikes_Score' holds the list of
# (CustomerID, score) tuples for that customer.
lookalike_df = pd.DataFrame(lookalike_recommendations)

# Persist the recommendations to the working directory, without the row index.
lookalike_df.to_csv('Lookalike.csv', index=False)

# Preview the first rows (rendered as the cell output in a notebook).
lookalike_df.head()


Unnamed: 0,CustomerID,Lookalikes_Score
0,C0001,"[(C0137, 0.9999284514771006), (C0152, 0.999853..."
1,C0002,"[(C0142, 0.9929875591787781), (C0043, 0.990709..."
2,C0003,"[(C0133, 0.997474573690933), (C0052, 0.9950550..."
3,C0004,"[(C0113, 0.9904771529207443), (C0102, 0.986668..."
4,C0005,"[(C0159, 0.9999117144387663), (C0123, 0.999852..."
