In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import random
from datetime import datetime

In [2]:
# Load the dataset from 'Merged_Dataset.csv' (path is relative to the current
# working directory).
df = pd.read_csv('Merged_Dataset.csv')

# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


In [3]:
# Keep a single price column: drop 'Price_x' and expose 'Price_y' as 'Price'.
# Written without inplace=True and with errors='ignore' so the cell can be
# re-run on an already-cleaned frame instead of raising KeyError on the
# second execution.
df = (
    df
    .drop(columns=['Price_x'], errors='ignore')
    .rename(columns={'Price_y': 'Price'})
)


In [4]:
# Display the current column labels of df.
df.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'CustomerName', 'Region', 'SignupDate',
       'ProductName', 'Category', 'Price'],
      dtype='object')

In [5]:
# Convert both object-dtype date columns to pandas datetimes.
for date_column in ('TransactionDate', 'SignupDate'):
    df[date_column] = pd.to_datetime(df[date_column])


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Columns that feed the customer similarity features.
selected_columns = [
    'CustomerID',
    'Category',
    'ProductName',
    'Quantity',
]
df_selected = df.loc[:, selected_columns]

In [8]:
# One-hot encode the categorical columns so that every category and every
# product name becomes its own indicator column.
categorical_columns = ['Category', 'ProductName']
df_encoded = pd.get_dummies(df_selected, columns=categorical_columns)

In [9]:
# Collapse to one row per customer. Summing adds up Quantity and turns each
# one-hot indicator into a count of that customer's rows carrying that
# category / product name.
df_grouped = df_encoded.groupby('CustomerID', as_index=False).sum()

In [10]:
# Fit a StandardScaler on the per-customer feature columns (everything except
# the CustomerID key) and transform them in the same step.
customer_features = df_grouped.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)


In [11]:
# Rebuild a labelled frame from the scaled array, then put CustomerID back as
# the first column.
feature_names = df_grouped.columns[1:]
scaled_df = pd.DataFrame(scaled_features, columns=feature_names)
scaled_df.insert(loc=0, column='CustomerID', value=df_grouped['CustomerID'])

In [12]:
# Pairwise cosine similarity between the customers' scaled feature rows.
customer_product_matrix = scaled_df.set_index(keys='CustomerID')
similarity_matrix = cosine_similarity(X=customer_product_matrix)

In [13]:
# Wrap the similarity array in a DataFrame labelled by CustomerID on both axes.
customer_ids = customer_product_matrix.index
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [14]:
def get_top_3_similar(customers, similarity_df, top_n=3):
    """Return the most similar *other* customers for each requested customer.

    Parameters
    ----------
    customers : iterable
        Customer IDs to look up. Each must be a column label (and index label)
        of ``similarity_df``; an unknown ID raises ``KeyError``.
    similarity_df : pandas.DataFrame
        Square similarity table labelled by customer ID on both axes.
    top_n : int, default 3
        Number of neighbours to return per customer.

    Returns
    -------
    dict
        Maps each customer ID to a list of ``(other_id, score)`` tuples in
        descending score order. ``score`` is rounded to two decimals and is a
        built-in ``float``.
    """
    lookalike_dict = {}
    for cust in customers:
        # Drop the customer's own entry so it is never its own lookalike.
        similar_customers = (
            similarity_df[cust]
            .drop(cust)
            .sort_values(ascending=False)
            .head(top_n)
        )
        # Cast to a built-in float: NumPy 2 prints its scalars as
        # ``np.float64(0.57)``, which would otherwise leak into any
        # str()/repr() of the result (e.g. when it is written out as text).
        lookalike_dict[cust] = [
            (cust_id, round(float(score), 2))
            for cust_id, score in similar_customers.items()
        ]
    return lookalike_dict

In [15]:
# Take the 20 smallest customer IDs (in sorted order) and look up their
# nearest neighbours in the similarity table.
all_customer_ids = sorted(df_grouped['CustomerID'].unique())
first_20_customers = all_customer_ids[:20]
top_3_lookalikes = get_top_3_similar(first_20_customers, similarity_df)

In [16]:
# Flatten each customer's recommendations into one text cell of the form
# [{'C0140': 0.57}, ...]. Scores are cast to built-in floats so the text stays
# plain numbers even when they arrive as NumPy scalars (NumPy 2 would
# otherwise render them as np.float64(0.57) inside the CSV).
lookalike_records = [
    [cust_id, str([{rec_id: float(score)} for rec_id, score in recs])]
    for cust_id, recs in top_3_lookalikes.items()
]

lookalike_df = pd.DataFrame(lookalike_records, columns=['CustomerID', 'Recommendations'])
lookalike_df.to_csv('Lookalike12.csv', index=False)

print("Lookalike12.csv file has been created successfully with top 3 recommendations.")

# Display sample results
print(lookalike_df.head(10))


Lookalike12.csv file has been created successfully with top 3 recommendations.
  CustomerID                                    Recommendations
0      C0001  [{'C0140': 0.57}, {'C0050': 0.48}, {'C0097': 0...
1      C0002  [{'C0030': 0.55}, {'C0008': 0.38}, {'C0109': 0...
2      C0003  [{'C0144': 0.58}, {'C0164': 0.52}, {'C0134': 0...
3      C0004  [{'C0065': 0.5}, {'C0182': 0.35}, {'C0133': 0.3}]
4      C0005  [{'C0096': 0.46}, {'C0119': 0.45}, {'C0107': 0...
5      C0006  [{'C0171': 0.44}, {'C0197': 0.3}, {'C0139': 0.3}]
6      C0007  [{'C0020': 0.55}, {'C0031': 0.52}, {'C0181': 0...
7      C0008  [{'C0164': 0.4}, {'C0002': 0.38}, {'C0091': 0....
8      C0009  [{'C0062': 0.56}, {'C0083': 0.42}, {'C0056': 0...
9      C0010  [{'C0092': 0.55}, {'C0077': 0.46}, {'C0083': 0...


In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Reload the dataset from disk so this cell does not depend on earlier cells.
df = pd.read_csv('Merged_Dataset.csv')

# Columns that feed the customer similarity features (this variant also
# includes Region).
selected_columns = [
    'CustomerID',
    'Category',
    'ProductName',
    'Quantity',
    'Region',
]
df_selected = df.loc[:, selected_columns]

# One-hot encode the categorical columns so every distinct value becomes its
# own indicator column.
categorical_columns = ['Category', 'ProductName', 'Region']
df_encoded = pd.get_dummies(df_selected, columns=categorical_columns)

# Collapse to one row per customer. Summing adds up Quantity and turns each
# one-hot indicator (category, product name, region) into a count of that
# customer's rows carrying that value -- so a Region_* column holds a row
# count, not a 0/1 flag.
df_grouped = df_encoded.groupby('CustomerID', as_index=False).sum()

# Fit a StandardScaler on the feature columns (everything except CustomerID)
# and transform them in the same step.
customer_features = df_grouped.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)

# Rebuild a labelled frame from the scaled array, then put CustomerID back as
# the first column.
feature_names = df_grouped.columns[1:]
scaled_df = pd.DataFrame(scaled_features, columns=feature_names)
scaled_df.insert(loc=0, column='CustomerID', value=df_grouped['CustomerID'])

# Pairwise cosine similarity between the customers' scaled feature rows.
customer_product_matrix = scaled_df.set_index(keys='CustomerID')
similarity_matrix = cosine_similarity(customer_product_matrix)

# Wrap the similarity array in a DataFrame labelled by CustomerID on both axes.
customer_ids = customer_product_matrix.index
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Function to get top 3 similar customers for given CustomerIDs
def get_top_3_similar(customers, similarity_df, top_n=3):
    """Look up the closest other customers for every ID in ``customers``.

    Parameters
    ----------
    customers : iterable
        Customer IDs to look up. Each must be a column label (and index label)
        of ``similarity_df``; an unknown ID raises ``KeyError``.
    similarity_df : pandas.DataFrame
        Square similarity table labelled by customer ID on both axes.
    top_n : int, default 3
        How many neighbours to keep per customer.

    Returns
    -------
    dict
        ``{customer_id: [(other_id, score), ...]}`` with neighbours in
        descending score order; each ``score`` is rounded to two decimals and
        returned as a built-in ``float``.
    """
    lookalike_dict = {}
    for cust in customers:
        # Remove the customer's own entry before ranking so it cannot be
        # returned as its own neighbour.
        similar_customers = (
            similarity_df[cust]
            .drop(cust)
            .sort_values(ascending=False)
            .head(top_n)
        )
        # Convert NumPy scalars to built-in floats; under NumPy 2 their repr is
        # ``np.float64(...)``, which would show up verbatim once the result is
        # turned into text with str()/repr().
        lookalike_dict[cust] = [
            (cust_id, round(float(score), 2))
            for cust_id, score in similar_customers.items()
        ]
    return lookalike_dict

# Get top 3 similar customers for the first 20 customers (by sorted ID)
first_20_customers = sorted(df_grouped['CustomerID'].unique())[:20]
top_3_lookalikes = get_top_3_similar(first_20_customers, similarity_df)

# Save recommendations to Lookalike.csv. Each customer's recommendations are
# flattened into one text cell of the form [{'C0140': 0.53}, ...]; scores are
# cast to built-in floats so the text stays plain numbers even when they
# arrive as NumPy scalars (NumPy 2 would otherwise render them as
# np.float64(0.53) inside the CSV).
lookalike_records = [
    [cust_id, str([{rec_id: float(score)} for rec_id, score in recs])]
    for cust_id, recs in top_3_lookalikes.items()
]

lookalike_df = pd.DataFrame(lookalike_records, columns=['CustomerID', 'Recommendations'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv file has been created successfully with top 3 recommendations.")

# Display sample results
print(lookalike_df.head(10))


Lookalike.csv file has been created successfully with top 3 recommendations.
  CustomerID                                    Recommendations
0      C0001  [{'C0140': 0.53}, {'C0050': 0.44}, {'C0190': 0...
1      C0002  [{'C0030': 0.53}, {'C0008': 0.35}, {'C0109': 0...
2      C0003  [{'C0144': 0.55}, {'C0164': 0.47}, {'C0134': 0...
3      C0004  [{'C0065': 0.44}, {'C0133': 0.33}, {'C0082': 0...
4      C0005  [{'C0096': 0.43}, {'C0119': 0.42}, {'C0107': 0...
5      C0006  [{'C0171': 0.46}, {'C0197': 0.28}, {'C0178': 0...
6      C0007  [{'C0020': 0.54}, {'C0031': 0.48}, {'C0117': 0...
7      C0008  [{'C0164': 0.36}, {'C0091': 0.35}, {'C0002': 0...
8      C0009  [{'C0062': 0.58}, {'C0083': 0.4}, {'C0056': 0....
9      C0010  [{'C0092': 0.52}, {'C0077': 0.43}, {'C0083': 0...
