In [1]:
# STEP-1:Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# STEP-2:Loading the datasets
customers=pd.read_csv("Customers.csv")
transactions=pd.read_csv("Transactions.csv")
products=pd.read_csv("Products.csv")

# STEP-3:Data Cleaning and Preprocessing
# Convert date columns to datetime format
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])


In [3]:
#STEP 4: Create a feature set for similarity measurement.
# Aggregate customer transaction behavior
customer_spending = transactions.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    total_transactions=("TransactionID", "count"),
    avg_transaction_value=("TotalValue", "mean")
).reset_index()

# Merge with customer profile data
customer_data = customers.merge(customer_spending, on="CustomerID", how="left").fillna(0)

# Encode categorical variables (Region)
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# Standardize numerical features
scaler = StandardScaler()
numeric_features = ['total_spent', 'total_transactions', 'avg_transaction_value']
customer_data[numeric_features] = scaler.fit_transform(customer_data[numeric_features])

# Display processed customer data
print(customer_data.head())


  CustomerID        CustomerName SignupDate  total_spent  total_transactions  \
0      C0001    Lawrence Carroll 2022-07-10    -0.051884            0.000000   
1      C0002      Elizabeth Lutz 2022-02-13    -0.862714           -0.451294   
2      C0003      Michael Rivera 2024-03-07    -0.393842           -0.451294   
3      C0004  Kathleen Rodriguez 2022-10-09     1.035375            1.353881   
4      C0005         Laura Weber 2022-08-15    -0.769499           -0.902587   

   avg_transaction_value  Region_Europe  Region_North America  \
0              -0.054781          False                 False   
1              -0.903985          False                 False   
2              -0.011575          False                 False   
3              -0.061170          False                 False   
4              -0.025086          False                 False   

   Region_South America  
0                  True  
1                 False  
2                  True  
3                  True 

In [4]:
#STEP 5:- Compute Similarity Using Cosine Similarity
# Compute similarity matrix
customer_features = customer_data.drop(columns=["CustomerID", "CustomerName", "SignupDate"])
similarity_matrix = cosine_similarity(customer_features)

# Convert to DataFrame
customer_similarity_df = pd.DataFrame(similarity_matrix, index=customer_data["CustomerID"], columns=customer_data["CustomerID"])

# Display similarity matrix
print(customer_similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.070763  0.873427  0.478965  0.034713  0.459430   
C0002       0.070763  1.000000  0.357643 -0.551623  0.694007 -0.603602   
C0003       0.873427  0.357643  1.000000 -0.007838  0.513888  0.450871   
C0004       0.478965 -0.551623 -0.007838  1.000000 -0.860032  0.190968   
C0005       0.034713  0.694007  0.513888 -0.860032  1.000000  0.018868   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001      -0.012195  0.012841  0.063393  0.059410  ...  0.946153  0.717895   
C0002       0.094835 -0.122202  0.914656  0.828265  ...  0.365579  0.740137   
C0003       0.400945 -0.372159  0.332784  0.281288  ...  0.868295  0.864273   
C0004      -0.709482  0.665121 -0.516871 -0.430085  ...  0.356734 -0.116387   
C0005  

In [6]:
#STEP 5:- Find Top 3 Lookalike Customers for Each Customer (C0001 - C0020)
# Get the first 20 customers
target_customers = customer_data["CustomerID"][:20]

lookalike_results = {}

for customer_id in target_customers:
    # Get top 3 most similar customers
    similar_customers = customer_similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    
    # Store results
    lookalike_results[customer_id] = list(zip(similar_customers.index, similar_customers.values))

# Convert to DataFrame
lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalike_Customers": str(lookalikes)}
    for cust_id, lookalikes in lookalike_results.items()
])

# Save as CSV
lookalike_df.to_csv("Dewanshi_Gupta_Lookalike.csv", index=False)


print(lookalike_df.head())

  CustomerID                                Lookalike_Customers
0      C0001  [('C0137', 0.9997668602684258), ('C0152', 0.99...
1      C0002  [('C0043', 0.9925855200738234), ('C0142', 0.98...
2      C0003  [('C0133', 0.9872238754286274), ('C0052', 0.97...
3      C0004  [('C0108', 0.983016487178466), ('C0113', 0.979...
4      C0005  [('C0178', 0.9993968050717345), ('C0159', 0.99...
