In [43]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np

In [66]:
# Load the combined data built from the three provided datasets.
# NOTE(review): presumably transactions, customers and products joined into one
# file (judging by the columns) — confirm how final_data.csv was produced.
# NOTE(review): this cell shows execution count 66 while its neighbours run
# 43-65, so it was re-run out of order; verify with Restart Kernel -> Run All.

df = pd.read_csv("final_data.csv")

In [45]:
# Remove the unnamed first CSV column (presumably a saved index) that pandas
# loads as "Unnamed: 0".
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [46]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   CustomerName     1000 non-null   object 
 7   Region           1000 non-null   object 
 8   SignupDate       1000 non-null   object 
 9   ProductName      1000 non-null   object 
 10  Category         1000 non-null   object 
 11  Price            1000 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 93.9+ KB


In [47]:
# Parse both date columns into pandas datetime values so that the .dt
# accessors can be used on them later.
for date_col in ("TransactionDate", "SignupDate"):
    df[date_col] = pd.to_datetime(df[date_col])

In [48]:
# Peek at three randomly chosen rows. No random_state is passed, so the rows
# shown differ on every run.
df.sample(3)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category,Price
199,T00814,C0093,P038,2024-09-08 16:26:03,1,147.22,Nancy Walker,Asia,2024-07-13,TechPro Vase,Home Decor,147.22
969,T00578,C0143,P026,2024-03-30 23:37:50,3,193.14,Brian Parker,Asia,2024-05-27,SoundWave Bluetooth Speaker,Electronics,64.38
148,T00873,C0086,P054,2024-08-02 11:13:00,4,229.2,Stephanie Peterson,Europe,2022-09-18,SoundWave Cookbook,Books,57.3


In [49]:
# Total amount spent per customer: the sum of TotalValue over all of that
# customer's rows, exposed as the "TotalSpending" column.
customer_spending = (
    df.groupby("CustomerID")["TotalValue"]
    .sum()
    .reset_index(name="TotalSpending")
)

In [50]:
# Number of transactions per customer: the count of non-null TransactionID
# values in each CustomerID group, exposed as "TransactionCount".
total_transactions = (
    df.groupby("CustomerID")["TransactionID"]
    .count()
    .reset_index(name="TransactionCount")
)

In [51]:
# Average number of units per transaction for each customer.
avg_quantity = (
    df.groupby("CustomerID")["Quantity"]
    .mean()
    .reset_index()
    .rename(columns={"Quantity": "AvgQuantity"})
)

# Customer-level attributes.
# No cell in this notebook defines `customers`, so the original code only ran
# when that name was left over in the kernel. It is rebuilt here from the
# combined frame, which already carries the customer columns, so the notebook
# survives Restart Kernel -> Run All. Customers without any transaction are
# absent from `df`; the merges below start from per-transaction aggregates, so
# such customers would not have reached customer_features anyway.
customers = (
    df[["CustomerID", "CustomerName", "Region", "SignupDate"]]
    .drop_duplicates(subset="CustomerID")
    .reset_index(drop=True)
)

# Parse once and reuse; pd.to_datetime is a no-op on already-parsed dates.
signup_dates = pd.to_datetime(customers["SignupDate"])
customers["SignupYear"] = signup_dates.dt.year
customers["SignupMonth"] = signup_dates.dt.month

signup_info = customers[["CustomerID", "SignupYear", "SignupMonth"]]

# Number of distinct products each customer has bought.
num_product_types = (
    df.groupby("CustomerID")["ProductID"]
    .nunique()
    .reset_index()
    .rename(columns={"ProductID": "NumProductTypes"})
)

In [52]:
# Assemble the per-customer feature table by joining every aggregate on
# CustomerID (default inner joins, applied in the same order as before).
customer_features = (
    customer_spending
    .merge(total_transactions, on="CustomerID")
    .merge(avg_quantity, on="CustomerID")
    .merge(signup_info, on="CustomerID")
    .merge(num_product_types, on="CustomerID")
)


In [53]:
# Attach each customer's Region to the feature table.
# The region is taken from the combined frame `df` instead of `customers`,
# which no cell in this notebook defines (it only existed as leftover kernel
# state), so this cell works after Restart Kernel -> Run All.
customer_region = df[["CustomerID", "Region"]].drop_duplicates(subset="CustomerID")

# Drop any Region column left by a previous run of this cell first; otherwise
# re-running it would produce Region_x / Region_y instead of a single Region.
customer_features = pd.merge(
    customer_features.drop(columns=["Region"], errors="ignore"),
    customer_region,
    on="CustomerID",
)

In [54]:
# Display the assembled per-customer feature table.
customer_features

Unnamed: 0,CustomerID,TotalSpending,TransactionCount,AvgQuantity,SignupYear,SignupMonth,NumProductTypes,Region
0,C0001,3354.52,5,2.400000,2022,7,5,South America
1,C0002,1862.74,4,2.500000,2022,2,4,Asia
2,C0003,2725.38,4,3.500000,2024,3,4,South America
3,C0004,5354.88,8,2.875000,2022,10,8,South America
4,C0005,2034.24,3,2.333333,2022,8,3,Asia
...,...,...,...,...,...,...,...,...
194,C0196,4982.88,4,3.000000,2022,6,3,Europe
195,C0197,1928.65,3,3.000000,2023,3,3,Europe
196,C0198,931.83,2,1.500000,2022,2,2,Europe
197,C0199,1979.28,4,2.250000,2022,12,4,Europe


In [55]:
# Column of customer_features treated as categorical.
categorical_features = ["Region"]

# Columns of customer_features treated as numeric.
numerical_features = [
    "TotalSpending",
    "AvgQuantity",
    "SignupYear",
    "SignupMonth",
    "TransactionCount",
    "NumProductTypes",
]

In [56]:
# Standard-scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown="ignore" makes the encoder tolerate categories it did not see
# during fit instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [57]:
# Fit the preprocessor on the feature table and transform it in one step, then
# wrap the result in a frame indexed by CustomerID so that rows can be traced
# back to customers.
transformed_features = preprocessor.fit_transform(customer_features)
customer_vectors = pd.DataFrame(
    data=transformed_features,
    index=customer_features["CustomerID"],
)

In [58]:
# Display the transformed feature vectors, indexed by CustomerID.
customer_vectors

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C0001,-0.061701,-0.233464,-1.273629,0.228409,-0.011458,0.050047,0.0,0.0,0.0,1.0
C0002,-0.877744,-0.054969,-1.273629,-1.209990,-0.467494,-0.424204,1.0,0.0,0.0,0.0
C0003,-0.405857,1.729980,1.095083,-0.922310,-0.467494,-0.424204,0.0,0.0,0.0,1.0
C0004,1.032547,0.614387,-1.273629,1.091449,1.356650,1.472798,0.0,0.0,0.0,1.0
C0005,-0.783929,-0.352460,-1.273629,0.516089,-0.923530,-0.898455,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
C0196,0.829053,0.837506,-1.273629,-0.059271,-0.467494,-0.898455,0.0,1.0,0.0,0.0
C0197,-0.841689,0.837506,-0.089273,-0.922310,-0.923530,-0.898455,0.0,1.0,0.0,0.0
C0198,-1.386975,-1.839918,-1.273629,-1.209990,-1.379566,-1.372705,0.0,1.0,0.0,0.0
C0199,-0.813993,-0.501206,-1.273629,1.666808,-0.467494,-0.424204,0.0,1.0,0.0,0.0


In [59]:
# Pairwise cosine similarity between all customer feature vectors.
similarity_matrix = cosine_similarity(customer_vectors)

# Map each CustomerID to its row position in customer_features, which is the
# row order that customer_vectors was built from.
customer_id_map = dict(
    zip(customer_features["CustomerID"], range(len(customer_features)))
)

In [60]:

# For each of the first 20 customers, collect the three most similar *other*
# customers as (CustomerID, similarity score) pairs.
lookalike_results = {}
target_customers = customer_features["CustomerID"].iloc[:20]
customer_ids = customer_features["CustomerID"]

for cust in target_customers:
    idx = customer_id_map[cust]
    scores = similarity_matrix[idx]
    # Rank by descending similarity. The stable sort keeps tie order deterministic.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the customer explicitly rather than assuming it sorts first:
    # another customer with an identical vector ties with it, and slicing
    # [1:4] could then return the customer as its own lookalike.
    similar_indices = ranked[ranked != idx][:3]
    # Store plain Python floats so the tuples print and serialise the same way
    # regardless of the NumPy version's scalar repr.
    similar_customers = [
        (customer_ids.iloc[i], float(scores[i])) for i in similar_indices
    ]
    lookalike_results[cust] = similar_customers

In [61]:

# One row per target customer; each cell holds one entry of that customer's
# lookalike list.
lookalike_df = pd.DataFrame.from_dict(data=lookalike_results, orient="index")
lookalike_df


Unnamed: 0,0,1,2
C0001,"(C0174, 0.8422926438420305)","(C0152, 0.8249537717448658)","(C0192, 0.8150921328659948)"
C0002,"(C0134, 0.80050676295401)","(C0007, 0.7986741261450693)","(C0193, 0.7880561700709434)"
C0003,"(C0129, 0.8914535159332343)","(C0006, 0.8559208047255501)","(C0190, 0.8268704351231548)"
C0004,"(C0102, 0.9623529275369861)","(C0108, 0.911317902819873)","(C0113, 0.9092908602795724)"
C0005,"(C0159, 0.9662117983915872)","(C0007, 0.9239256125937739)","(C0186, 0.8276087947128055)"
C0006,"(C0003, 0.8559208047255501)","(C0052, 0.8140053744677647)","(C0129, 0.7975111871903039)"
C0007,"(C0159, 0.9601003552901002)","(C0005, 0.9239256125937739)","(C0040, 0.8054008519467992)"
C0008,"(C0024, 0.9160288723585857)","(C0194, 0.892218123381115)","(C0047, 0.8775586950186819)"
C0009,"(C0121, 0.8922158648939775)","(C0119, 0.8875768899499024)","(C0097, 0.8765826593367202)"
C0010,"(C0062, 0.9171851013678582)","(C0086, 0.8758326917927979)","(C0199, 0.8727919280777416)"


In [62]:
# Rebuild the lookalike table with descriptive column names for the three
# ranked matches.
lookalike_columns = ["Lookalike1", "Lookalike2", "Lookalike3"]
lookalike_df = pd.DataFrame.from_dict(
    {cust: list(matches) for cust, matches in lookalike_results.items()},
    orient="index",
    columns=lookalike_columns,
)

In [63]:
# Write the lookalike table to disk; the index is saved under the header
# "CustomerID".
lookalike_df.to_csv("Lookalike.csv", index_label="CustomerID")

In [64]:
# Display the labelled lookalike table.
lookalike_df

Unnamed: 0,Lookalike1,Lookalike2,Lookalike3
C0001,"(C0174, 0.8422926438420305)","(C0152, 0.8249537717448658)","(C0192, 0.8150921328659948)"
C0002,"(C0134, 0.80050676295401)","(C0007, 0.7986741261450693)","(C0193, 0.7880561700709434)"
C0003,"(C0129, 0.8914535159332343)","(C0006, 0.8559208047255501)","(C0190, 0.8268704351231548)"
C0004,"(C0102, 0.9623529275369861)","(C0108, 0.911317902819873)","(C0113, 0.9092908602795724)"
C0005,"(C0159, 0.9662117983915872)","(C0007, 0.9239256125937739)","(C0186, 0.8276087947128055)"
C0006,"(C0003, 0.8559208047255501)","(C0052, 0.8140053744677647)","(C0129, 0.7975111871903039)"
C0007,"(C0159, 0.9601003552901002)","(C0005, 0.9239256125937739)","(C0040, 0.8054008519467992)"
C0008,"(C0024, 0.9160288723585857)","(C0194, 0.892218123381115)","(C0047, 0.8775586950186819)"
C0009,"(C0121, 0.8922158648939775)","(C0119, 0.8875768899499024)","(C0097, 0.8765826593367202)"
C0010,"(C0062, 0.9171851013678582)","(C0086, 0.8758326917927979)","(C0199, 0.8727919280777416)"


In [65]:
def get_similar_customers(customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Reads the notebook-level ``customer_id_map``, ``similarity_matrix`` and
    ``customer_features`` objects.

    Parameters
    ----------
    customer_id : str
        ID to look up in ``customer_id_map``.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (CustomerID, float) tuples, ordered by descending similarity and
    never containing ``customer_id`` itself. If ``customer_id`` is unknown, a
    "not found" message string is returned instead (kept for backward
    compatibility with existing callers).

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if customer_id not in customer_id_map:
        return f"CustomerID {customer_id} not found."
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = customer_id_map[customer_id]
    scores = similarity_matrix[idx]
    # Rank by descending similarity. The stable sort keeps tie order deterministic.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the customer explicitly rather than assuming it sorts first:
    # another customer with an identical vector ties with it, and dropping
    # "position 0" could then return the customer as its own lookalike.
    similar_indices = ranked[ranked != idx][:top_n]
    # Plain Python floats print the same way regardless of the NumPy version.
    return [
        (customer_features["CustomerID"].iloc[i], float(scores[i]))
        for i in similar_indices
    ]

# Example lookup: print the lookalikes found for one customer.
customer_id_input = "C0001"
result = get_similar_customers(customer_id_input)
print(f"Top 3 similar customers for {customer_id_input}: {result}")

Top 3 similar customers for C0001: [('C0174', 0.8422926438420305), ('C0152', 0.8249537717448658), ('C0192', 0.8150921328659948)]
