In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the visible cells read from /content/drive (the CSVs
# are loaded from /content), so this mount may be unnecessary — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Prompt for a file upload through the Colab widget (the recorded run
# uploaded Customers.csv, which Colab saved as "Customers (1).csv").
# `uploaded` is rebound by every upload cell, so it only holds the latest result.
from google.colab import files
uploaded = files.upload()

Saving Customers.csv to Customers (1).csv


In [5]:
# Prompt for a file upload through the Colab widget (the recorded run
# uploaded Products.csv). Rebinds `uploaded`, discarding the previous result.
from google.colab import files
uploaded = files.upload()

Saving Products.csv to Products.csv


In [6]:
# Prompt for a file upload through the Colab widget (the recorded run
# uploaded Transactions.csv, which Colab saved as "Transactions (1).csv").
# Rebinds `uploaded`, discarding the previous result.
from google.colab import files
uploaded = files.upload()

Saving Transactions.csv to Transactions (1).csv


In [7]:
# Read the three source tables (customers, products, transactions) from /content.
# NOTE(review): the recorded upload output shows two files were stored as
# "Customers (1).csv" and "Transactions (1).csv", so these paths read whichever
# copies already existed under the plain names — confirm they are the intended ones.
customers_df, products_df, transactions_df = map(
    pd.read_csv,
    (
        '/content/Customers.csv',
        '/content/Products.csv',
        '/content/Transactions.csv',
    ),
)


In [8]:
# Quick data-quality pass over the three raw tables, in load order.
frames = (customers_df, products_df, transactions_df)

# Per-column count of missing values for each table.
for frame in frames:
    print(frame.isnull().sum())

# Summary statistics for each table (printed after all null counts).
for frame in frames:
    print(frame.describe())


CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64
       CustomerID      CustomerName         Region  SignupDate
count         200               200            200         200
unique        200               200              4         179
top         C0001  Lawrence Carroll  South America  2024-11-11
freq            1                 1             59           3
            Price
count  100.000000
mean   267.551700
std    143.219383
min     16.080000
25%    147.767500
50%    292.875000
75%    397.090000
max    497.760000
          Quantity   TotalValue       Price
count  1000.000000  1000.000000  1000.00000
mean      2.537000   689.995560   272.55407
std       1.117981   493.144478   140.73639
min       1.000000   

In [9]:
# Enrich every transaction with its customer attributes, then its product
# attributes. Left joins keep all transaction rows even when a key has no match.
# Both transactions_df and products_df carry a 'Price' column, so the second
# join produces 'Price_x' / 'Price_y' under pandas' default suffixes.
transactions_customers = transactions_df.merge(customers_df, how='left', on='CustomerID')
transactions_data = transactions_customers.merge(products_df, how='left', on='ProductID')


In [10]:
# Build one row of behavioural features per customer from the merged
# transaction table. Only customers that appear in transactions_data get a row,
# because every feature below comes from a groupby over that table.


def _most_frequent_category(categories):
    """Return the most frequent non-null value of `categories`, or NaN if none.

    `Series.mode()` ignores missing values and returns an empty Series when
    every entry is missing (e.g. a ProductID with no match in the product
    table), so indexing it with [0] would raise. Returning NaN lets the
    'Unknown' default applied at the end of this cell take effect instead.
    When several values tie, the first of the sorted modes is returned.
    """
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Total expenditure for each customer
customer_spending = transactions_data.groupby('CustomerID')['TotalValue'].sum()

# Number of distinct transactions for each customer
transaction_frequency = transactions_data.groupby('CustomerID')['TransactionID'].nunique()

# Date of the most recent transaction for each customer (a raw timestamp, not
# a days-since value). The date column is parsed in place on transactions_data.
transactions_data['TransactionDate'] = pd.to_datetime(transactions_data['TransactionDate'])
last_transaction_date = transactions_data.groupby('CustomerID')['TransactionDate'].max()

# Most frequent product category for each customer (NaN when none is known)
product_categories = transactions_data.groupby('CustomerID')['Category'].apply(_most_frequent_category)

# Merge all features into a single dataframe; the Series share the CustomerID
# index, which reset_index() turns back into a regular column.
customer_features = pd.DataFrame({
    'TotalExpenditure': customer_spending,
    'TransactionFrequency': transaction_frequency,
    'LastTransactionDate': last_transaction_date,
    'ProductCategory': product_categories
}).reset_index()

# Fill missing values (if any): 0 for spending and for frequency, 'Unknown'
# for the category. LastTransactionDate is intentionally left untouched.
customer_features = customer_features.fillna({'TotalExpenditure': 0, 'TransactionFrequency': 0, 'ProductCategory': 'Unknown'})


In [11]:
# Standardise the two numeric features (zero mean, unit variance).
# The scaled values overwrite the original columns of customer_features, so
# the raw spending/frequency figures are no longer available after this cell.
scaler = StandardScaler()
numeric_cols = ['TotalExpenditure', 'TransactionFrequency']
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])


In [12]:
# Get the feature vectors
feature_matrix = customer_features[['TotalExpenditure', 'TransactionFrequency']].values

# Compute cosine similarity
cosine_sim = cosine_similarity(feature_matrix)

# Create a DataFrame to store similarity scores
cosine_sim_df = pd.DataFrame(cosine_sim, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])


In [13]:
# Build the lookalike table: for each of the first N_TARGET_CUSTOMERS rows of
# customer_features, keep the N_LOOKALIKES most similar *other* customers.
N_TARGET_CUSTOMERS = 20
N_LOOKALIKES = 3

lookalike_map = {}

for cust_id in customer_features['CustomerID'].head(N_TARGET_CUSTOMERS):
    similar_customers = (
        cosine_sim_df[cust_id]
        # Remove the customer's own score by label. Skipping position 0 of the
        # sorted scores is not safe: another customer can tie with (or, through
        # floating-point rounding, outrank) the self-similarity, in which case
        # the customer would be listed as its own lookalike.
        .drop(labels=cust_id)
        # Stable sort keeps tied scores in their original order, so repeated
        # runs produce the same recommendations.
        .sort_values(ascending=False, kind='stable')
        .head(N_LOOKALIKES)
    )
    # Store (lookalike id, similarity score) pairs, best match first.
    lookalike_map[cust_id] = list(zip(similar_customers.index, similar_customers.values))

# Flatten the map into one row per (customer, lookalike) pair.
lookalike_df = pd.DataFrame(
    [
        (cust_id, lookalike_id, score)
        for cust_id, matches in lookalike_map.items()
        for lookalike_id, score in matches
    ],
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

# Persist the recommendations (relative to the current working directory).
lookalike_df.to_csv('Lookalike.csv', index=False)


In [15]:
# Preview the first rows of the lookalike table (head() defaults to 5 rows).
lookalike_df.head()


Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0137,0.999567
1,C0001,C0152,0.997683
2,C0001,C0056,0.993947
3,C0002,C0029,0.999816
4,C0002,C0199,0.99949


In [16]:
# Save the lookalike recommendations DataFrame to a CSV file (index omitted).
# NOTE(review): an earlier cell already writes 'Lookalike.csv' relative to the
# working directory; if that directory is /content this write is a duplicate — confirm.
lookalike_df.to_csv('/content/Lookalike.csv', index=False)

# Confirm the file is saved
print("Lookalike.csv file saved!")


Lookalike.csv file saved!


In [17]:
# Same import as the upload cells; repeated here so the cell runs on its own.
from google.colab import files

# Download the Lookalike.csv file
files.download('/content/Lookalike.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>