In [24]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [40]:
# prepare data for similarity calculations
customer_agg = data.groupby('CustomerID').agg({
    'TotalValue' : 'sum',
    'Quantity' : 'sum',
    'Price_x' : 'mean',
    'Price_y' : 'mean'
}).reset_index()

In [42]:
# add customer profile features
customer_features = pd.merge(customer_agg, customers, on='CustomerID')

In [44]:
# encode categorical features(region )
customer_features = pd.get_dummies(customer_features, columns=['Region'])

In [66]:
print(customer_features.dtypes)

CustomerID               object
TotalValue              float64
Quantity                  int64
Price_x                 float64
Price_y                 float64
CustomerName             object
SignupDate               object
Region_Asia                bool
Region_Europe              bool
Region_North America       bool
Region_South America       bool
dtype: object


In [68]:
numerical_features = customer_features.select_dtypes(include=['float64', 'int64'])
print(numerical_features.head())

   TotalValue  Quantity     Price_x     Price_y
0     3354.52        12  278.334000  278.334000
1     1862.74        10  208.920000  208.920000
2     2725.38        14  195.707500  195.707500
3     5354.88        23  240.636250  240.636250
4     2034.24         7  291.603333  291.603333


In [72]:
#normalize numerical features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(numerical_features)

#if you need to combine with original columns, convert back to a Dataframe
normalized_df = pd.DataFrame(normalized_features, columns=numerical_features.columns)
print(normalized_df.head())

   TotalValue  Quantity   Price_x   Price_y
0   -0.061701 -0.122033  0.094670  0.094670
1   -0.877744 -0.448000 -0.904016 -0.904016
2   -0.405857  0.203934 -1.094109 -1.094109
3    1.032547  1.670787 -0.447702 -0.447702
4   -0.783929 -0.936951  0.285581  0.285581


In [74]:
# calculate similarity
similarity_matrix = cosine_similarity(normalized_features)

In [76]:
# Find top 3 similar customers
lookalike = {}
for idx, customer_id in enumerate(customer_features['CustomerID']):
    scores = list(enumerate(similarity_matrix[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:4]  # Exclude self
    lookalike[customer_id] = [(customer_features['CustomerID'][i], round(score, 2)) for i, score in scores]

In [78]:
# save to lookalike.csv
import csv
with open('FirstName_LastName_Lookalike.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["CustomerID", "Lookalikes"])
    for key, value in lookalike.items():
        writer.writerow([key, value])

print("Lookalike recommendations saved successfully!")

Lookalike recommendations saved successfully!
