In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [26]:
# 1. Load the Behavioral Data (Only the second file)
# Reads the workbook into `behavioral_data`. Any failure is reported and an
# empty DataFrame is bound instead, so later `.empty` checks still work.
behavioral_data_path = r"D:\Docs\EV Project\B-smev_data(EV 2W FY 21-22).xlsx"

print("Loading Behavioral data...")
try:
    behavioral_data = pd.read_excel(behavioral_data_path)
except Exception as e:
    # Fall back to an empty frame so the following cells can detect the failure.
    behavioral_data = pd.DataFrame()
    print(f"Error loading behavioral data: {e}")
    print("Behavioral data is empty, skipping clustering.")
else:
    print(f"Behavioral data loaded successfully with {behavioral_data.shape[0]} rows and {behavioral_data.shape[1]} columns.")


Loading Behavioral data...


In [None]:

# 2. Data Preprocessing
def preprocess_data(df):
    """Standardize the numeric columns of ``df`` for clustering.

    Non-numeric columns are discarded first, then rows with a missing value
    in any remaining numeric column are dropped, and finally every column is
    scaled with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; every step works on a new frame.

    Returns
    -------
    pandas.DataFrame
        The scaled numeric columns under their original names. The frame is
        rebuilt from the scaler's output, so (with the scaler's default array
        output) the original row labels are replaced by a default index.
        If ``df`` is empty it is returned unchanged; if no numeric data
        survives the cleaning, that empty frame is returned. A warning is
        printed in both cases.
    """
    if df.empty:
        print("Warning: Empty dataframe received for preprocessing.")
        return df

    # Select numeric columns *before* dropping NaNs: a gap in a text column
    # (which is discarded anyway) must not cost us a complete numeric row.
    numeric_df = df.select_dtypes(include=[np.number]).dropna()

    # `.empty` is true when either no numeric column or no complete row is left.
    if numeric_df.empty:
        print("Warning: No numeric data left after dropping missing values.")
        return numeric_df

    scaled_values = StandardScaler().fit_transform(numeric_df)
    return pd.DataFrame(scaled_values, columns=numeric_df.columns)

# Replace the loaded table with its standardized numeric version; when the
# load produced nothing there is nothing to scale, so just say so.
if behavioral_data.empty:
    print("Behavioral data is empty, skipping preprocessing.")
else:
    behavioral_data = preprocess_data(behavioral_data)


In [None]:

# 3. Correlation Heatmap
def visualize_data(df, title):
    """Show an annotated correlation heatmap for the columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose pairwise column correlations (``df.corr()``) are plotted.
    title : str
        Label appended to the figure title and used in the warning message.

    Returns
    -------
    None
        The figure is displayed as a side effect. For an empty ``df`` only a
        warning is printed and nothing is drawn.
    """
    if not df.empty:
        plt.figure(figsize=(10, 6))
        correlations = df.corr()
        # seaborn hands back the Axes it drew on; title that one directly.
        heatmap_ax = sns.heatmap(correlations, annot=True, cmap='coolwarm')
        heatmap_ax.set_title(f"Correlation Heatmap: {title}")
        plt.show()
    else:
        print(f"Warning: No data to visualize for {title}.")

# Draw the heatmap only when data is present. This outer guard also suppresses
# the warning that visualize_data itself would print for an empty frame.
if not behavioral_data.empty:
    visualize_data(behavioral_data, "Behavioral Data")

# 4. Clustering for Customer Segmentation
def perform_clustering(df, n_clusters, title):
    """Cluster the rows of ``df`` with k-means and plot them in 2-D PCA space.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table passed as-is to ``KMeans`` and ``PCA``.
    n_clusters : int
        Number of clusters requested from ``KMeans``.
    title : str
        Label used in the plot title and in the warning message.

    Returns
    -------
    array-like
        The labels returned by ``KMeans.fit_predict`` (one per row of ``df``).
        For an empty ``df`` a warning is printed, nothing is plotted and an
        empty list is returned.
    """
    if df.empty:
        print(f"Warning: {title} data is empty. Skipping clustering.")
        return []

    # n_init is pinned explicitly: scikit-learn's default differs between
    # releases (and warns in some), which would make the segmentation depend
    # on the installed version. 10 restarts matches the long-standing default.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(df)

    # Reduce dimensions using PCA for visualization. The seed only matters
    # when PCA picks its randomized solver, but keeps the plot reproducible.
    pca = PCA(n_components=2, random_state=42)
    reduced_data = pca.fit_transform(df)

    # Draw on a dedicated figure instead of whatever figure happens to be current.
    _, ax = plt.subplots()
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=clusters, cmap='viridis')
    ax.set_title(f"Customer Segmentation: {title}")
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    plt.show()

    return clusters

n_clusters = 4
if not behavioral_data.empty:
    perform_clustering(behavioral_data, n_clusters, "Behavioral Data")
else: