In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Load each dataset from its CSV file on the mounted drive.
customers_path = '/content/drive/MyDrive/Datasets/Customers.csv'
customers = pd.read_csv(customers_path)

products_path = '/content/drive/MyDrive/Datasets/Products.csv'
products = pd.read_csv(products_path)

transactions_path = '/content/drive/MyDrive/Datasets/Transactions.csv'
transactions = pd.read_csv(transactions_path)

# Preview the first rows of every table, each under its own heading.
for heading, frame in (
    ("Customers Data:", customers),
    ("\nProducts Data:", products),
    ("\nTransactions Data:", transactions),
):
    print(heading)
    print(frame.head())

Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127 

In [17]:
# Parse the TransactionDate strings once, then store both the parsed
# timestamps and the calendar year derived from them.
parsed_dates = pd.to_datetime(transactions["TransactionDate"])
transactions["TransactionDate"] = parsed_dates
transactions["Year"] = parsed_dates.dt.year

In [18]:
# Check data info
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() only appends a stray
# "None" line after each summary.
customers.info()
products.info()
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           -------

In [19]:
# Join transactions with product details (on ProductID), then attach the
# customer attributes (on CustomerID). Both joins use pandas' default
# inner join.
transactions_with_products = pd.merge(transactions, products, on="ProductID")
merged_data = pd.merge(transactions_with_products, customers, on="CustomerID")

In [20]:
# Build one feature row per CustomerID from the merged transaction table.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        # Sum and mean of TotalValue over the customer's rows.
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        # Number of transaction rows recorded for the customer.
        num_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
        avg_transaction_value=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        # Most frequent Category (first mode on ties); None when there is no mode.
        preferred_category=pd.NamedAgg(
            column="Category",
            aggfunc=lambda cats: None if cats.mode().empty else cats.mode()[0],
        ),
        # Region taken from the customer's first row in the group.
        region=pd.NamedAgg(column="Region", aggfunc="first"),
    )
    .reset_index()
)

In [21]:
# One-hot encode the two categorical columns; drop_first=True omits the
# first level of each column from the generated indicator columns.
customer_features_encoded = pd.get_dummies(
    data=customer_features,
    columns=["preferred_category", "region"],
    drop_first=True,
)

In [25]:
# Fit a StandardScaler on the three numeric feature columns and write the
# scaled values back into the encoded feature table.
numerical_features = ["total_spent", "num_transactions", "avg_transaction_value"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features_encoded[numerical_features])
customer_features_encoded[numerical_features] = scaled_values

In [28]:
# Pairwise cosine similarity between customers, computed on every encoded
# feature column except the CustomerID identifier.
feature_matrix = customer_features_encoded.drop(columns="CustomerID")
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with the customer IDs so rows can be looked up by ID.
customer_ids = customer_features["CustomerID"]
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

In [26]:
# Find top 3 lookalikes for each customer
def get_top_lookalikes(similarity_row, top_n=3, exclude_id=None):
    """Return the ``top_n`` most similar entries of one similarity row.

    Parameters
    ----------
    similarity_row : pandas.Series
        Similarity scores indexed by customer ID. When the row is taken from
        a DataFrame with ``.loc[cust_id]``, its ``name`` is that customer ID.
    top_n : int, default 3
        Number of lookalikes to return.
    exclude_id : hashable, optional
        Label of the row's own customer, to be left out of the result.
        Defaults to ``similarity_row.name``.

    Returns
    -------
    list of (label, score) tuples, ordered by descending score.
    """
    own_id = similarity_row.name if exclude_id is None else exclude_id

    if own_id is not None and own_id in similarity_row.index:
        # Remove the customer's own entry by label instead of assuming it
        # sorts first: another customer can tie with (or, through rounding,
        # exceed) the self-similarity, and skipping position 0 would then
        # drop a real lookalike and return the customer itself.
        candidates = similarity_row.drop(labels=own_id)
        # mergesort is stable, so tied scores keep their original order.
        top = candidates.sort_values(ascending=False, kind="mergesort").iloc[:top_n]
    else:
        # No own label can be identified: keep the previous behaviour of
        # skipping the highest-scoring entry.
        ranked = similarity_row.sort_values(ascending=False, kind="mergesort")
        top = ranked.iloc[1:top_n + 1]

    return list(top.items())

In [29]:
# Map every customer ID to its list of (lookalike ID, score) pairs.
lookalike_map = {}
for cust_id in similarity_df.index:
    customer_row = similarity_df.loc[cust_id]
    lookalike_map[cust_id] = get_top_lookalikes(customer_row)

In [30]:
# Keep only the lookalikes of the first 20 customers in the similarity index.
first_20_ids = similarity_df.index[:20]
lookalike_map_20 = {cust_id: lookalike_map[cust_id] for cust_id in first_20_ids}


In [31]:
# Flatten the mapping into one record per (customer, lookalike) pair and
# write the result to Lookalike.csv without the DataFrame index.
lookalike_output = [
    {"cust_id": cust_id, "similar_cust": similar_cust, "score": score}
    for cust_id, lookalikes in lookalike_map_20.items()
    for similar_cust, score in lookalikes
]

lookalike_df = pd.DataFrame(lookalike_output)
lookalike_df.to_csv("Lookalike.csv", index=False)