In [17]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns


In [19]:
# Step 2: Load Datasets
try:
    customers = pd.read_csv('Customers.csv')
    transactions = pd.read_csv('Transactions.csv')
    products = pd.read_csv('Products.csv')
except FileNotFoundError as e:
    print(f"Error: {e}")
    raise SystemExit("Required file not found. Please ensure all files are in the correct directory.")


In [23]:

# Step 3: Validate Columns in Datasets
required_columns_customers = {'CustomerID', 'Region'}
required_columns_transactions = {'CustomerID', 'ProductID', 'Quantity', 'TotalValue'}
required_columns_products = {'ProductID'}

def check_missing_columns(df, required_columns, file_name):
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        print(f"Error: Missing columns in {file_name}: {missing_columns}")
        print(f"Available columns in {file_name}: {df.columns.tolist()}")
        raise KeyError(f"{file_name} is missing required columns: {missing_columns}")

try:
    check_missing_columns(customers, required_columns_customers, 'Customers.csv')
    check_missing_columns(transactions, required_columns_transactions, 'Transactions.csv')
    check_missing_columns(products, required_columns_products, 'Products.csv')
except KeyError as e:
    print(f"Dataset Validation Failed: {e}")
    raise SystemExit("Please fix the datasets and re-run the script.")

In [25]:
# Step 4: Merge Datasets
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

# Step 5: Feature Engineering
# Aggregate transaction data per customer
customer_features = merged_data.groupby('CustomerID').agg({
    'Quantity': 'sum',
    'TotalValue': 'sum'
}).reset_index()

# Merge customer demographics
customer_features = customer_features.merge(customers[['CustomerID', 'Region']], on='CustomerID')

In [27]:
# Step 6: Data Preprocessing
# Handle missing values
customer_features = customer_features.dropna()

# Encode categorical data (Region) if needed
customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns=['CustomerID']))

# Step 7: Clustering
# Choose the number of clusters (e.g., 4)
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
customer_features['Cluster'] = kmeans.fit_predict(scaled_features)



In [29]:
# Step 8: Evaluate Clustering
# Calculate Davies-Bouldin Index and Silhouette Score
db_index = davies_bouldin_score(scaled_features, customer_features['Cluster'])
silhouette_avg = silhouette_score(scaled_features, customer_features['Cluster'])
print(f'Davies-Bouldin Index: {db_index}')
print(f'Silhouette Score: {silhouette_avg}')

Davies-Bouldin Index: 0.9590445461735108
Silhouette Score: 0.4454950051835499


In [13]:
# Step 2: Load Datasets
try:
    customers = pd.read_csv('Customers.csv')
    transactions = pd.read_csv('Transactions.csv')
    products = pd.read_csv('Products.csv')
except FileNotFoundError as e:
    print(f"Error: {e}")
    raise SystemExit("Required file not found. Please ensure all files are in the correct directory.")

In [15]:
# Step 3: Validate Columns in Datasets
required_columns_customers = {'CustomerID', 'Age', 'Region'}
required_columns_transactions = {'CustomerID', 'ProductID', 'Quantity', 'TotalValue'}
required_columns_products = {'ProductID'}

def check_missing_columns(df, required_columns, file_name):
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        print(f"Error: Missing columns in {file_name}: {missing_columns}")
        print(f"Available columns in {file_name}: {df.columns.tolist()}")
        raise KeyError(f"{file_name} is missing required columns: {missing_columns}")

try:
    check_missing_columns(customers, required_columns_customers, 'Customers.csv')
    check_missing_columns(transactions, required_columns_transactions, 'Transactions.csv')
    check_missing_columns(products, required_columns_products, 'Products.csv')
except KeyError as e:
    raise SystemExit(f"Dataset Validation Failed: {e}")

Error: Missing columns in Customers.csv: {'Age'}
Available columns in Customers.csv: ['CustomerID', 'CustomerName', 'Region', 'SignupDate']


AttributeError: 'tuple' object has no attribute 'tb_frame'