In [45]:
from pathlib import Path

import pandas as pd

# Load datasets
# The input folder is resolved from the current user's home directory instead
# of a hard-coded "C:\Users\<name>\..." path, so the notebook runs unchanged
# on any machine that keeps the CSV files in its Downloads folder.
DATA_DIR = Path.home() / "Downloads"
customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")

# Preview the first customer rows to confirm the file was parsed as expected.
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [47]:
# Preview the first product rows (ProductID, ProductName, Category, Price)
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [52]:
# Mean list price over the whole product catalogue (a single scalar).
average_price = products.loc[:, 'Price'].mean()

# Attach that scalar to the customers table as an 'AvgPrice' column.
# NOTE(review): the same value is written to every row, so this column is
# constant across customers.
customers = customers.assign(AvgPrice=average_price)


In [54]:
# Parse the signup date once and derive calendar features from the parsed
# values: the original column is replaced by its datetime version and the
# year / month are appended as separate numeric columns.
signup_dates = pd.to_datetime(customers['SignupDate'])
customers = customers.assign(
    SignupDate=signup_dates,
    SignupYear=signup_dates.dt.year,
    SignupMonth=signup_dates.dt.month,
)

# Keep only the columns used downstream: the name and the raw date are dropped.
columns_to_drop = ['CustomerName', 'SignupDate']
customer_data = customers.drop(columns=columns_to_drop)
customer_data.head()

Unnamed: 0,CustomerID,Region,AvgPrice,SignupYear,SignupMonth
0,C0001,South America,267.5517,2022,7
1,C0002,Asia,267.5517,2022,2
2,C0003,South America,267.5517,2024,3
3,C0004,South America,267.5517,2022,10
4,C0005,Asia,267.5517,2022,8


In [57]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Columns that describe a customer for the similarity comparison.
features = ['Region', 'SignupYear', 'SignupMonth', 'AvgPrice']

# One-hot encode the categorical column(s); numeric columns pass through.
customer_data_encoded = pd.get_dummies(customer_data.loc[:, features])

# Standardise every encoded column before comparing customers.
# NOTE(review): 'AvgPrice' was filled with one scalar for all customers earlier
# in the notebook, so it has zero variance and is expected to scale to 0 for
# every row, i.e. it should not influence the similarity scores. Confirm
# whether a per-customer price feature was intended.
scaler = StandardScaler().fit(customer_data_encoded)
customer_data_scaled = scaler.transform(customer_data_encoded)

# Pairwise cosine similarity between customers, labelled on both axes by
# CustomerID so a customer's scores can be looked up by id.
similarity_matrix = cosine_similarity(customer_data_scaled)
customer_ids = customer_data['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,-0.001563,0.328850,0.937874,0.066406,0.270946,0.044607,-0.497221,-0.256208,0.090869,...,0.356478,0.970205,-0.279181,-0.534632,0.411694,0.038388,-0.304893,-0.008835,0.090869,-0.267505
C0002,-0.001563,1.000000,-0.257005,-0.155085,0.790140,-0.131612,0.908872,-0.106750,-0.313876,-0.209236,...,-0.325583,-0.108786,0.902640,-0.337012,-0.597378,0.064883,-0.006997,0.247840,-0.209236,0.800868
C0003,0.328850,-0.257005,1.000000,0.157542,-0.553434,0.979282,-0.471995,0.188194,-0.413319,-0.647807,...,0.992927,0.213696,-0.059809,0.028211,0.717575,-0.499776,-0.126706,-0.272620,-0.647807,-0.285525
C0004,0.937874,-0.155085,0.157542,1.000000,0.130409,0.043414,0.031170,-0.635989,-0.145307,0.281931,...,0.221427,0.993997,-0.425385,-0.534179,0.494349,0.025168,-0.421801,-0.166831,0.281931,-0.252208
C0005,0.066406,0.790140,-0.553434,0.130409,1.000000,-0.537703,0.973773,-0.482537,-0.173917,0.163069,...,-0.552020,0.112000,0.645565,-0.468257,-0.411798,0.052190,-0.294403,-0.040639,0.163069,0.852972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,0.038388,0.064883,-0.499776,0.025168,0.052190,-0.439250,0.059598,-0.390009,0.832416,0.803528,...,-0.525656,0.029678,-0.187798,-0.470082,-0.531182,1.000000,0.797300,0.902295,0.803528,-0.231057
C0197,-0.304893,-0.006997,-0.126706,-0.421801,-0.294403,-0.020098,-0.202929,0.000683,0.775787,0.403490,...,-0.186776,-0.390670,-0.027472,-0.201175,-0.448558,0.797300,1.000000,0.891165,0.403490,-0.259181
C0198,-0.008835,0.247840,-0.272620,-0.166831,-0.040639,-0.142243,0.064347,-0.116102,0.634854,0.468389,...,-0.343820,-0.119280,0.047039,-0.355112,-0.624612,0.902295,0.891165,1.000000,0.468389,-0.192624
C0199,0.090869,-0.209236,-0.647807,0.281931,0.163069,-0.703785,0.033292,-0.638966,0.829355,1.000000,...,-0.602535,0.225519,-0.449818,-0.473052,-0.226157,0.803528,0.403490,0.468389,1.000000,-0.207557


In [58]:
def recommend_similar_customers(customer_id, num_recommendations=3, similarity=None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to find lookalikes for; must be a column of
        the similarity table.
    num_recommendations : int, default 3
        Maximum number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Square similarity table whose rows and columns are labelled by
        customer id. Defaults to the notebook-level ``similarity_df``.

    Returns
    -------
    tuple[list, list[float]]
        The lookalike ids and their similarity scores, both ordered from
        most to least similar. Scores are built-in ``float`` values.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity table.
    """
    if similarity is None:
        similarity = similarity_df

    # Remove the customer's own entry explicitly. Skipping "the first row
    # after sorting" is not safe: another customer with an identical feature
    # vector also scores 1.0 and may sort ahead of the customer itself.
    candidate_scores = similarity[customer_id].drop(labels=customer_id, errors="ignore")

    # A stable sort keeps tied customers in their original order, so the
    # result is deterministic across runs.
    ranked = candidate_scores.sort_values(ascending=False, kind="stable")
    top_matches = ranked.head(max(num_recommendations, 0))

    # tolist() converts NumPy scalars to plain Python floats.
    return top_matches.index.tolist(), top_matches.tolist()

# Build the lookalike table for the first 20 customers: each customer id maps
# to a list of (similar_customer_id, similarity_score) pairs.
recommendations = {
    customer_id: list(zip(*recommend_similar_customers(customer_id)))
    for customer_id in customer_data['CustomerID'].iloc[:20]
}


In [61]:
# Convert recommendations to the required format for Lookalike.csv:
# one row per customer holding its list of (lookalike_id, score) pairs.
lookalike_records = [
    {
        'cust_id': cust_id,
        # Cast scores to built-in floats so the CSV contains plain numbers
        # rather than NumPy scalar reprs such as "np.float64(0.97)".
        'lookalikes': [(other_id, float(score)) for other_id, score in lookalikes],
    }
    for cust_id, lookalikes in recommendations.items()
]

# Create DataFrame (columns are named explicitly so the header is written
# even when there are no recommendations).
lookalike_df = pd.DataFrame(lookalike_records, columns=['cust_id', 'lookalikes'])

# Save to CSV. The path is built with pathlib so the separator is correct on
# every operating system, and the target folder is created if it is missing.
output_path = Path('Downloads') / 'Lookalike.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
lookalike_df.to_csv(output_path, index=False)
