In [5]:
import pandas as pd

# Load the three raw input tables (read in the order customers, products,
# transactions) from CSV files in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [7]:
# Convert the date columns with pd.to_datetime; because errors='coerce' is used,
# values that cannot be parsed become NaT instead of raising.
signup_dates = pd.to_datetime(customers['SignupDate'], errors='coerce')
transaction_dates = pd.to_datetime(transactions['TransactionDate'], errors='coerce')
customers['SignupDate'] = signup_dates
transactions['TransactionDate'] = transaction_dates

# Merging datasets for unified analysis: every transaction row is kept (left joins)
# and is extended first with its customer's columns, then with its product's columns.
transactions_with_customers = transactions.merge(customers, how='left', on='CustomerID')
merged_data = transactions_with_customers.merge(products, how='left', on='ProductID')



In [15]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Transaction-level dummy encodings of Region and Category. Neither frame feeds the
# feature matrix built below; they are kept because later cells may reference them.
encoded_region = pd.get_dummies(merged_data['Region'], prefix='Region')
encoded_category = pd.get_dummies(merged_data['Category'], prefix='Category')


# Collapse the transaction-level data to one row per customer.
customer_aggregated = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',            # sum of TotalValue over the customer's rows
    'Quantity': 'sum',              # sum of Quantity over the customer's rows
    'ProductID': 'nunique',         # number of distinct ProductID values
    'Category': lambda x: list(x),  # every Category value, one entry per row
    'Region': 'first'               # first Region value found in the group
}).reset_index()

# One-hot encode each customer's Region. The dummy frame is kept so its columns can be
# selected by name below instead of by position.
region_dummies = pd.get_dummies(customer_aggregated['Region'], prefix='Region')
customer_aggregated = customer_aggregated.join(region_dummies)

# Normalize continuous features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_aggregated[['TotalValue', 'Quantity', 'ProductID']])

# Combine normalized and encoded features.
# Selecting the dummy columns by name avoids a positional slice sized from a different
# frame: that slice silently picks the wrong columns if the column order changes, and
# picks *every* column (including strings and lists) when there are zero region levels.
region_features = customer_aggregated[region_dummies.columns].to_numpy(dtype=float)
customer_features = np.hstack([scaled_features, region_features])

# Similarity Calculation: pairwise cosine similarity between customer feature rows,
# labelled on both axes with CustomerID.
similarity_matrix = cosine_similarity(customer_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_aggregated['CustomerID'], columns=customer_aggregated['CustomerID'])

similarity_df


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,0.059104,0.822756,0.300925,0.064143,0.819048,0.048364,-0.017083,0.092856,0.035907,...,0.971943,0.721490,0.024950,0.007894,0.670809,-0.050740,0.047466,0.077026,0.068841,-0.078382
C0002,0.059104,1.000000,0.257595,-0.586339,0.934238,-0.076031,0.903554,-0.455603,0.631730,0.504359,...,0.210999,0.530843,0.905172,-0.334104,-0.403837,-0.125636,0.559048,0.632386,0.535057,0.061984
C0003,0.822756,0.257595,1.000000,0.095127,0.237625,0.716251,0.219636,-0.281808,0.212297,0.317732,...,0.811807,0.736122,0.164984,-0.277036,0.494672,0.010597,0.298886,0.257570,0.220077,-0.127144
C0004,0.300925,-0.586339,0.095127,1.000000,-0.768015,0.193780,-0.723583,0.777862,-0.815501,-0.471764,...,0.116952,-0.418751,-0.445778,0.532282,0.901288,-0.159815,-0.713150,-0.850998,-0.630038,0.477107
C0005,0.064143,0.934238,0.237625,-0.768015,1.000000,0.079761,0.986601,-0.686890,0.744802,0.472037,...,0.217362,0.610631,0.882529,-0.480359,-0.551894,0.094400,0.660573,0.774435,0.588654,-0.039256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.050740,-0.125636,0.010597,-0.159815,0.094400,0.399858,0.193868,-0.424452,0.209901,0.261763,...,-0.131312,-0.064456,0.114217,-0.323264,-0.138044,1.000000,0.439727,0.297774,0.331137,0.225071
C0197,0.047466,0.559048,0.298886,-0.713150,0.660573,0.053147,0.611262,-0.661276,0.927922,0.915687,...,0.189456,0.572862,0.391749,-0.491466,-0.499943,0.439727,1.000000,0.949798,0.964210,-0.424513
C0198,0.077026,0.632386,0.257570,-0.850998,0.774435,0.062960,0.712596,-0.738200,0.986263,0.788889,...,0.254959,0.689813,0.443766,-0.511163,-0.611216,0.297774,0.949798,1.000000,0.913474,-0.522917
C0199,0.068841,0.535057,0.220077,-0.630038,0.588654,-0.048258,0.516295,-0.486923,0.935158,0.940285,...,0.226743,0.556918,0.325804,-0.338126,-0.443847,0.331137,0.964210,0.913474,1.000000,-0.462934


In [16]:

def recommend_similar(customers_df, customer_id, similarity_matrix, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customers_df: DataFrame with a 'CustomerID' column. Row *position* i must
            correspond to row/column i of ``similarity_matrix``; the index labels
            of the frame are not used for the lookup.
        customer_id: Value to look up in ``customers_df['CustomerID']``. If it
            occurs more than once, the first occurrence is used.
        similarity_matrix: Square array-like of pairwise similarity scores.
        top_n: Maximum number of lookalikes to return.

    Returns:
        DataFrame with columns 'CustomerID' and 'SimilarityScore', sorted by
        descending score. The queried customer is never part of the result.
        Rows keep their index labels from ``customers_df``.

    Raises:
        IndexError: If ``customer_id`` is not present in ``customers_df``.
    """
    # Locate the customer by position, so a non-default index on customers_df
    # cannot point at the wrong row of the matrix.
    matches = np.flatnonzero((customers_df['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(
            f"customer_id {customer_id!r} not found in customers_df['CustomerID']"
        )
    customer_pos = int(matches[0])

    similarity_scores = np.asarray(similarity_matrix)[customer_pos]

    # Rank by descending score, then drop the customer explicitly. Skipping
    # "the first entry" is not safe: with tied scores (e.g. another customer
    # with an identical feature vector) the customer is not guaranteed to
    # sort first and would be returned as its own lookalike.
    ranked = np.argsort(-similarity_scores, kind='stable')
    similar_indices = ranked[ranked != customer_pos][:top_n]

    # assign() builds a new frame, so no column is written onto a slice.
    return (
        customers_df.iloc[similar_indices][['CustomerID']]
        .assign(SimilarityScore=similarity_scores[similar_indices])
    )

# Collect the lookalike recommendations for the first 20 customers as
# (customer_id, recommendations DataFrame) pairs.
top_lookalikes = [
    (customer_id, recommend_similar(customer_aggregated, customer_id, similarity_matrix))
    for customer_id in customer_aggregated['CustomerID'][:20]
]

# for display: flatten the pairs into one record per (customer, lookalike)
lookalike_results = [
    {
        "CustomerID": customer_id,
        "LookalikeID": lookalike_id,
        "SimilarityScore": score,
    }
    for customer_id, recs in top_lookalikes
    for lookalike_id, score in zip(recs['CustomerID'], recs['SimilarityScore'])
]

lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.head(60)  # Display results for 20 customers, 3 lookalikes each

Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0107,0.989387
1,C0001,C0137,0.987857
2,C0001,C0191,0.971943
3,C0002,C0142,0.988544
4,C0002,C0177,0.965028
5,C0002,C0027,0.948354
6,C0003,C0133,0.91032
7,C0003,C0190,0.907494
8,C0003,C0174,0.869426
9,C0004,C0113,0.988774
