In [8]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [9]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [10]:
# Merge the data
customer_transactions = transactions.merge(customers[['CustomerID', 'Region']], on='CustomerID', how='left')
customer_transactions = customer_transactions.merge(products[['ProductID', 'ProductName', 'Category']], on='ProductID', how='left')

In [11]:
# combine data set
customer_transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Region,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Asia,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,South America,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics


In [12]:
# Aggregate the transaction data for each customer
customer_summary = customer_transactions.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    total_transactions=('TransactionID', 'nunique'),
    most_common_category=('Category', lambda x: x.mode()[0])
).reset_index()


In [28]:
# Merge the aggregated features with the customer profile data
customer_profile = customers[['CustomerID', 'Region']]
customer_data = customer_profile.merge(customer_summary, on='CustomerID')

customer_data.head(20)

Unnamed: 0,CustomerID,Region,total_spent,total_transactions,most_common_category
0,C0001,South America,3354.52,5,Electronics
1,C0002,Asia,1862.74,4,Clothing
2,C0003,South America,2725.38,4,Home Decor
3,C0004,South America,5354.88,8,Books
4,C0005,Asia,2034.24,3,Electronics
5,C0006,South America,4227.57,4,Books
6,C0007,Asia,2579.82,3,Electronics
7,C0008,North America,4271.61,10,Home Decor
8,C0009,Europe,896.5,3,Clothing
9,C0010,Europe,1717.55,4,Clothing


Similarity Calculation

In [14]:
# Standardize the numeric features
scaler = StandardScaler()
customer_data[['total_spent', 'total_transactions']] = scaler.fit_transform(
    customer_data[['total_spent', 'total_transactions']]
)

In [15]:
# Encoding categorical 'Region' as numeric
customer_data['Region'] = customer_data['Region'].astype('category').cat.codes

# Feature columns: Region, total_spent, total_transactions
features = customer_data[['Region', 'total_spent', 'total_transactions']]

In [17]:
# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(features)

# Preview the similarity matrix
similarity_matrix[:3, :3]  # Viewing the first  customers' similarity scores with all others

array([[1.        , 0.01994403, 0.98245697],
       [0.01994403, 1.        , 0.18868476],
       [0.98245697, 0.18868476, 1.        ]])

In [24]:
# Create a list of top 3 lookalikes for each customer
lookalike_dict = {}

for i, row in customer_data.iterrows():
    cust_id = row['CustomerID']
    
    # Get the similarity scores for this customer with all others
    similarity_scores = similarity_matrix[i]
    
    # Get the indices of the top 3 most similar customers (excluding the customer itself)
    top_3_indices = similarity_scores.argsort()[-4:-1]  # Exclude the customer itself
    
    # Map each customer to their top 3 lookalikes and similarity scores
    lookalikes = []
    for idx in top_3_indices:
        lookalikes.append(customer_data.iloc[idx]['CustomerID'])
        lookalikes.append(similarity_scores[idx])
    
    lookalike_dict[cust_id] = lookalikes

# Convert to DataFrame and save as CSV
lookalike_df = pd.DataFrame.from_dict(lookalike_dict, orient='index')
lookalike_df.columns = ['Lookalike_1', 'Score_1', 'Lookalike_2', 'Score_2', 'Lookalike_3', 'Score_3']

# Save the result to 'Lookalike.csv'
lookalike_df.to_csv('Lookalike.csv')


In [29]:
lookalike = pd.read_csv('Lookalike.csv')
lookalike.head(20)

Unnamed: 0,Customer_ID,Lookalike_1,Score_1,Lookalike_2,Score_2,Lookalike_3,Score_3
0,C0001,C0107,0.998794,C0152,0.999984,C0137,0.999992
1,C0002,C0177,0.971508,C0043,0.990709,C0142,0.992988
2,C0003,C0125,0.997232,C0052,0.999128,C0133,0.999601
3,C0004,C0102,0.993629,C0113,0.996468,C0122,0.997043
4,C0005,C0178,0.999757,C0123,0.999852,C0159,0.999912
5,C0006,C0079,0.994837,C0158,0.995839,C0117,0.997169
6,C0007,C0140,0.997154,C0193,0.999968,C0092,0.999998
7,C0008,C0017,0.965161,C0124,0.97955,C0109,0.981578
8,C0009,C0033,0.978022,C0198,0.982462,C0062,0.985893
9,C0010,C0166,0.973688,C0121,0.984562,C0199,0.996885


Evaluation:
*The accuracy of the model is based on the quality of the features used for similarity calculation and how well the customers' profiles match with similar customers.
*The recommendation quality depends on how well the lookalikes are aligned in terms of product preferences and transaction behavior.