In [76]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [78]:
# Load datasets
# All three input files live in one directory; keep that location in a single
# constant so the notebook can be pointed at another machine's data by editing
# one line instead of three path literals.
DATA_DIR = "C:/Users/danis/Downloads"

customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [80]:
# Merge datasets
# Attach product attributes (including Category) to every transaction row.
# how='left' keeps transactions whose ProductID has no match in `products`;
# those rows end up with NaN in the product columns.
# NOTE(review): this rebinds `transactions`, so re-running the cell merges the
# product columns a second time - re-run the load cell first.
transactions = transactions.merge(products, on='ProductID', how='left')


def _join_categories(categories):
    """Collapse one customer's categories into a space-separated string.

    Parameters
    ----------
    categories : pandas.Series
        Category values of all transactions belonging to one customer.

    Returns
    -------
    str
        The distinct category labels, sorted and joined with single spaces.
        Missing values are skipped (a NaN would make ``str.join`` raise
        ``TypeError``), and spaces inside a label are replaced by ``_`` so that
        a multi-word label stays one token when the string is split on spaces.
    """
    labels = {str(label).replace(' ', '_') for label in categories.dropna()}
    # Sorted so the resulting string does not depend on set iteration order.
    return ' '.join(sorted(labels))


customer_transactions = transactions.groupby('CustomerID').agg({
    'Category': _join_categories,   # Distinct product categories bought by the customer
    'TotalValue': 'sum',            # Total spending by customer
    'TransactionID': 'count'        # Number of transactions
}).reset_index()

In [82]:
# Attach each customer's aggregated purchase history to their profile row.
# The left join keeps customers that have no transactions; their aggregate
# columns come back as NaN and are replaced by neutral defaults below.
customer_data = pd.merge(
    customers,
    customer_transactions,
    how='left',
    on='CustomerID',
).fillna(value={'Category': '', 'TotalValue': 0, 'TransactionID': 0})

In [84]:
# One-hot encode the two categorical inputs.
# Region: one indicator column per distinct value, named "region_<value>".
region_encoded = pd.get_dummies(data=customer_data['Region'], prefix="region")
# Category: the cell holds several labels separated by spaces, so split on ' '
# to get one indicator column per label.
category_encoded = customer_data['Category'].str.get_dummies(sep=' ')


In [86]:
# Build the feature matrix: the two numeric spending columns followed by the
# category and region indicator columns, aligned side by side on the row index.
features = pd.concat(
    objs=[customer_data[['TotalValue', 'TransactionID']], category_encoded, region_encoded],
    axis='columns',
)


In [88]:
# Standardize the two numeric columns; the one-hot indicator columns are left
# untouched.
scaler = StandardScaler()
scaler.fit(features[['TotalValue', 'TransactionID']])
features[['TotalValue', 'TransactionID']] = scaler.transform(features[['TotalValue', 'TransactionID']])

In [90]:
# Pairwise cosine similarity between every pair of customer feature rows.
similarity_matrix = cosine_similarity(X=features)
# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)

In [92]:
# Generate Lookalike Recommendations
N_CUSTOMERS = 20  # recommendations are produced for the first 20 customers only
TOP_N = 3         # number of lookalikes kept per customer

# Maps CustomerID -> list of (lookalike CustomerID, similarity score) tuples,
# ordered from most to least similar.
lookalike_results = {}
for customer_id in customer_data['CustomerID'].head(N_CUSTOMERS):
    # Remove the customer's own entry by label. Skipping "the first row after
    # sorting" is not reliable: another customer can tie with the self-score,
    # in which case the self entry is not guaranteed to sort first.
    candidate_scores = similarity_df[customer_id].drop(customer_id)
    # mergesort is stable, so tied scores keep their original relative order.
    top_matches = candidate_scores.sort_values(ascending=False, kind='mergesort').head(TOP_N)
    # float() converts the NumPy scalar to a plain Python float so that str()
    # of this list prints bare numbers rather than a NumPy scalar repr.
    lookalike_results[customer_id] = [
        (sim_cust, round(float(score), 4)) for sim_cust, score in top_matches.items()
    ]

In [94]:
# Export the recommendations: one row per customer, with the list of
# (lookalike, score) tuples stored as its string representation.
csv_file_path = 'Lookalike.csv'
lookalike_df = pd.DataFrame(
    [(cust_id, str(matches)) for cust_id, matches in lookalike_results.items()],
    columns=['CustomerID', 'Lookalikes'],
)
lookalike_df.to_csv(csv_file_path, index=False)


In [96]:
# Confirm the export and preview the first rows of the exported table.
print("Lookalike.csv has been generated successfully.", lookalike_df.head(), sep="\n")

Lookalike.csv has been generated successfully.
  CustomerID                                         Lookalikes
0      C0001  [('C0152', 1.0), ('C0174', 0.9938), ('C0085', ...
1      C0002  [('C0159', 0.9799), ('C0134', 0.9595), ('C0043...
2      C0003  [('C0031', 0.9871), ('C0129', 0.981), ('C0158'...
3      C0004  [('C0012', 0.9889), ('C0102', 0.9403), ('C0113...
4      C0005  [('C0007', 0.9922), ('C0140', 0.987), ('C0177'...


In [98]:
# Evaluate Model Accuracy using Mean Average Similarity (MAS)
def mean_average_similarity(similarity_df, lookalike_results):
    """Average the similarity scores of all recommended lookalikes.

    Parameters
    ----------
    similarity_df
        Not used by this function; kept in the signature for callers.
    lookalike_results : dict
        Maps a customer ID to an iterable of ``(lookalike_id, score)`` pairs.

    Returns
    -------
    The mean of every score across all customers, rounded to 4 decimal
    places, or ``0`` when there are no scores at all.
    """
    running_sum = 0
    n_scores = 0
    for matches in lookalike_results.values():
        for _, similarity in matches:
            running_sum += similarity
            n_scores += 1

    # Guard against dividing by zero when nothing was recommended.
    if n_scores == 0:
        return 0
    return round(running_sum / n_scores, 4)

# Report the average similarity score over every recommended lookalike.
accuracy_score = mean_average_similarity(
    similarity_df=similarity_df,
    lookalike_results=lookalike_results,
)
print(f"Model Accuracy (Mean Average Similarity): {accuracy_score}")

Model Accuracy (Mean Average Similarity): 0.9301
