In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Directory holding the three input CSVs. The default is the original
# author's local folder; set the LOOKALIKE_DATA_DIR environment variable to
# point the notebook at the data on another machine without editing code.
DATA_DIR = Path(
    os.environ.get(
        "LOOKALIKE_DATA_DIR",
        r"C:\Users\Ayushi singh\Downloads\Untitled Folder",
    )
)

# Load the raw customer, product, and transaction tables.
customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

In [3]:
# Enrich every transaction with its customer and product attributes.
# validate="many_to_one" makes pandas raise a MergeError if CustomerID or
# ProductID is duplicated in the lookup tables; without it, duplicate keys
# would silently multiply transaction rows and inflate the per-customer
# totals computed later.
# Both `transactions` and `products` carry a "Price" column, so the merged
# frame exposes them as "Price_x" (transaction) and "Price_y" (product).
data = (
    transactions
    .merge(customers, on="CustomerID", validate="many_to_one")
    .merge(products, on="ProductID", validate="many_to_one")
)

In [4]:
# Show the first rows of the merged table as a quick sanity check.
merged_preview = data.head()
print(merged_preview)

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [5]:
def _most_frequent(values):
    """Return the most frequent value; ties resolve to the first sorted mode."""
    return values.mode()[0]


# One row per customer: total spend, number of transactions, and the
# product category they bought from most often.
per_customer = data.groupby("CustomerID")
customer_profiles = per_customer.agg(
    TotalValue=("TotalValue", "sum"),
    TransactionID=("TransactionID", "count"),
    Category=("Category", _most_frequent),
).reset_index()

In [6]:
# Give the aggregated columns names that describe what they now hold.
profile_column_names = {
    "TotalValue": "TotalSpent",
    "TransactionID": "TransactionCount",
}
customer_profiles = customer_profiles.rename(columns=profile_column_names)

profile_preview = customer_profiles.head()
print(profile_preview)

  CustomerID  TotalSpent  TransactionCount     Category
0      C0001     3354.52                 5  Electronics
1      C0002     1862.74                 4     Clothing
2      C0003     2725.38                 4   Home Decor
3      C0004     5354.88                 8        Books
4      C0005     2034.24                 3  Electronics


In [7]:
# Rescale the two numeric features to [0, 1] so that neither dominates the
# similarity computation purely because of its units.
numeric_columns = ["TotalSpent", "TransactionCount"]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_profiles[numeric_columns])
customer_profiles[numeric_columns] = scaled_values

scaled_preview = customer_profiles.head()
print(scaled_preview)

  CustomerID  TotalSpent  TransactionCount     Category
0      C0001    0.308942               0.4  Electronics
1      C0002    0.168095               0.3     Clothing
2      C0003    0.249541               0.3   Home Decor
3      C0004    0.497806               0.7        Books
4      C0005    0.184287               0.2  Electronics


In [9]:
# One-hot encode each customer's most frequent category.
# dtype=float is set explicitly because the default dummy dtype depends on
# the pandas version (uint8 before 2.0, bool from 2.0 on); forcing float
# keeps the feature matrix homogeneous and numeric regardless of version.
category_encoded = pd.get_dummies(
    customer_profiles["Category"], prefix="Category", dtype=float
)

In [10]:
# Assemble the final feature matrix: scaled numeric features side by side
# with the one-hot category columns (rows stay aligned on the shared index).
numeric_features = customer_profiles[["TotalSpent", "TransactionCount"]]
feature_parts = [numeric_features, category_encoded]
customer_features = pd.concat(feature_parts, axis=1)

In [11]:

# Pairwise cosine similarity between every pair of customer feature rows.
similarity_matrix = cosine_similarity(customer_features)

# Label both axes with CustomerID so scores can be looked up by customer.
customer_ids = customer_profiles["CustomerID"]
similarity_df = pd.DataFrame(
    similarity_matrix, index=customer_ids, columns=customer_ids
)

similarity_preview = similarity_df.head()
print(similarity_preview)


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.145106  0.163869  0.293686  0.979134  0.192838   
C0002       0.145106  1.000000  0.116239  0.210669  0.083018  0.132129   
C0003       0.163869  0.116239  1.000000  0.236188  0.095276  0.156797   
C0004       0.293686  0.210669  0.236188  1.000000  0.169631  0.955776   
C0005       0.979134  0.083018  0.095276  0.169631  1.000000  0.114347   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.982985  0.306891  0.090538  0.141813  ...  0.196733  0.991129   
C0002       0.090016  0.226911  0.992088  0.999918  ...  0.141443  0.108389   
C0003       0.105770  0.909265  0.072127  0.113450  ...  0.158128  0.120218   
C0004       0.186531  0.447311  0.132226  0.206182  ...  0.966961  0.217065   
C0005  

In [12]:
# How many customers (taken in profile order) get recommendations, and how
# many lookalikes are kept for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

# Maps each target CustomerID to a list of [LookalikeCustomerID, Score]
# pairs, ordered from most to least similar.
top_lookalikes = {}

for customer in customer_profiles["CustomerID"].iloc[:N_TARGET_CUSTOMERS]:
    # Remove the customer's own entry by label rather than assuming it sorts
    # to position 0: if another customer ties at the same similarity, a
    # positional slice could keep the customer as their own lookalike and
    # drop a genuine match.
    other_scores = similarity_df[customer].drop(labels=customer)
    best_matches = other_scores.nlargest(TOP_K)
    # float() yields plain Python floats so the later str() conversion for
    # the CSV does not depend on NumPy's scalar repr.
    top_lookalikes[customer] = [
        [lookalike_id, float(score)]
        for lookalike_id, score in best_matches.items()
    ]

# Preview results
for key, value in top_lookalikes.items():
    print(f"Customer {key}: {value}")

Customer C0001: [['C0072', 0.9995801239756656], ['C0190', 0.9995396754148373], ['C0069', 0.9992414536172233]]
Customer C0002: [['C0029', 0.9999769937828287], ['C0010', 0.9999177738763096], ['C0111', 0.9961301024459819]]
Customer C0003: [['C0178', 0.9999989970925096], ['C0052', 0.999791880110156], ['C0166', 0.9996205032237228]]
Customer C0004: [['C0021', 0.9999907303388264], ['C0101', 0.9999163269540592], ['C0173', 0.9995595762227322]]
Customer C0005: [['C0112', 0.9999775023325984], ['C0197', 0.9999550413378965], ['C0186', 0.9996988311929792]]
Customer C0006: [['C0117', 0.999873668447618], ['C0168', 0.9979617911305342], ['C0187', 0.9963642506153775]]
Customer C0007: [['C0120', 0.9999532375535115], ['C0050', 0.9990958497332864], ['C0115', 0.9988291289227257]]
Customer C0008: [['C0113', 0.9911961983046282], ['C0124', 0.9824122021273818], ['C0065', 0.9794629051971273]]
Customer C0009: [['C0077', 0.9999851375384464], ['C0083', 0.9997264144050755], ['C0062', 0.9976924430093821]]
Customer C00

In [13]:
# Flatten the lookalike map into one record per customer; the list of
# [id, score] pairs is stringified so it fits in a single CSV cell.
lookalike_data = [
    {"cust_id": cust_id, "lookalikes": str(lookalikes)}
    for cust_id, lookalikes in top_lookalikes.items()
]

# Tabulate the records and write them to Lookalike.csv in the current
# working directory, without the positional index column.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv created successfully!")

Lookalike.csv created successfully!


In [14]:
import os

# Report where relative paths (such as Lookalike.csv) resolve to.
current_dir = os.getcwd()
print("Current working directory:", current_dir)


Current working directory: C:\Users\Ayushi singh
