# DMDW CASE STUDY

# Online User Behaviour Analysis Graphical Model 

### STEP 1. IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, plot_tree
import sqlite3
from hmmlearn import hmm
import networkx as nx
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, BDeu

import warnings
warnings.filterwarnings('ignore')
print(" Libraries Imported Successfully!")


  from .autonotebook import tqdm as notebook_tqdm


 Libraries Imported Successfully!


### STEP 2 : LOAD DATASET

In [3]:
df = pd.read_csv('ecommerce_customer_data_custom_ratios.csv')
print('Rows,Cols:', df.shape)
print("Dataset Loaded Successfully ")
df.head()

Rows,Cols: (250000, 13)
Dataset Loaded Successfully 


Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Payment Method,Customer Age,Returns,Customer Name,Age,Gender,Churn
0,46251,2020-09-08 09:38:32,Electronics,12,3,740,Credit Card,37,0.0,Christine Hernandez,37,Male,0
1,46251,2022-03-05 12:56:35,Home,468,4,2739,PayPal,37,0.0,Christine Hernandez,37,Male,0
2,46251,2022-05-23 18:18:01,Home,288,2,3196,PayPal,37,0.0,Christine Hernandez,37,Male,0
3,46251,2020-11-12 13:13:29,Clothing,196,1,3509,PayPal,37,0.0,Christine Hernandez,37,Male,0
4,13593,2020-11-27 17:55:11,Home,449,1,3452,Credit Card,49,0.0,James Grant,49,Female,1


###     STEP 3: DATA UNDERSTANDING


In [4]:
print("\n--- Data Understanding ---")
print("Shape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())
print("\nData Types:\n", df.dtypes)



--- Data Understanding ---
Shape: (250000, 13)

Missing Values:
 Customer ID                  0
Purchase Date                0
Product Category             0
Product Price                0
Quantity                     0
Total Purchase Amount        0
Payment Method               0
Customer Age                 0
Returns                  47596
Customer Name                0
Age                          0
Gender                       0
Churn                        0
dtype: int64

Data Types:
 Customer ID                int64
Purchase Date             object
Product Category          object
Product Price              int64
Quantity                   int64
Total Purchase Amount      int64
Payment Method            object
Customer Age               int64
Returns                  float64
Customer Name             object
Age                        int64
Gender                    object
Churn                      int64
dtype: object


### STEP 4: DATA CLEANING & PRE-PROCESSING

In [5]:

print("\n--- Data Cleaning & Pre-processing ---")
df.drop_duplicates(inplace=True)

df['Returns'] = df['Returns'].fillna(0)
print(" Missing 'Returns' values filled with 0.")

df['Purchase Date'] = pd.to_datetime(df['Purchase Date'], errors='coerce')

df_full = df.copy()

cols_to_drop = ['Customer ID', 'Purchase Date', 'Customer Name', 'Age']
df = df.drop(columns=cols_to_drop)
print(f"Dropped columns for ML: {cols_to_drop}")

label_cols = df.select_dtypes(include=['object']).columns
le_dict = {}
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

print(" Data Cleaned and Encoded for ML")
print(df.head())


--- Data Cleaning & Pre-processing ---
 Missing 'Returns' values filled with 0.
Dropped columns for ML: ['Customer ID', 'Purchase Date', 'Customer Name', 'Age']
 Data Cleaned and Encoded for ML
   Product Category  Product Price  Quantity  Total Purchase Amount  \
0                 2             12         3                    740   
1                 3            468         4                   2739   
2                 3            288         2                   3196   
3                 1            196         1                   3509   
4                 3            449         1                   3452   

   Payment Method  Customer Age  Returns  Gender  Churn  
0               1            37      0.0       1      0  
1               3            37      0.0       1      0  
2               3            37      0.0       1      0  
3               3            37      0.0       1      0  
4               1            49      0.0       0      1  


### STEP 5: ENHANCED ETL & DATA WAREHOUSE CREATION (SQLite)

In [6]:
print("\n--- Creating Enhanced Data Warehouse with Time Dimension ---")
conn = None 
try:
    conn = sqlite3.connect('user_behavior.db')
    cursor = conn.cursor()

    # Create Time Dimension (CRITICAL for DW)
    df_full['Year'] = df_full['Purchase Date'].dt.year
    df_full['Month'] = df_full['Purchase Date'].dt.month
    df_full['Quarter'] = df_full['Purchase Date'].dt.quarter
    df_full['DayOfWeek'] = df_full['Purchase Date'].dt.day_name()
    
    dim_date = df_full[['Purchase Date', 'Year', 'Month', 'Quarter', 'DayOfWeek']].drop_duplicates()
    dim_date['DateID'] = range(1, len(dim_date) + 1)
    dim_date.to_sql('dim_date', conn, if_exists='replace', index=False)
    print(" Created 'dim_date' (Time Dimension)")

    # DimCustomer
    dim_customer_cols = ['Customer ID', 'Customer Name', 'Customer Age', 'Gender']
    dim_customer = df_full[dim_customer_cols].drop_duplicates(subset=['Customer ID']).reset_index(drop=True)
    dim_customer.to_sql('dim_customer', conn, if_exists='replace', index=False)
    print(" Created 'dim_customer'")

    # DimProduct
    dim_product = df_full[['Product Category']].drop_duplicates().reset_index(drop=True)
    dim_product['ProductID'] = dim_product.index + 1 
    dim_product = dim_product[['ProductID', 'Product Category']]
    dim_product.to_sql('dim_product', conn, if_exists='replace', index=False)
    print("Created 'dim_product'")
    
    # Create Enhanced Fact Table with DateID
    fact_behavior = pd.merge(df_full, dim_product, on='Product Category')
    fact_behavior = pd.merge(fact_behavior, dim_date[['Purchase Date', 'DateID']], on='Purchase Date')
    
    fact_columns = [
        'Customer ID', 'ProductID', 'DateID', 
        'Product Price', 'Quantity', 'Total Purchase Amount', 'Payment Method',
        'Returns', 'Churn'
    ]
    fact_behavior = fact_behavior[fact_columns]
    fact_behavior.to_sql('fact_behavior', conn, if_exists='replace', index=False)
    print(" Created 'fact_behavior' with Time Dimension")

    conn.commit()
    print(" Enhanced Data Warehouse 'user_behavior.db' created successfully.")

except KeyError as e:
    print(f" Error creating DW: A column {e} was not found.")
except Exception as e:
    print(f" Error creating DW: {e}")
finally:
    if conn:
        conn.close()


--- Creating Enhanced Data Warehouse with Time Dimension ---
 Created 'dim_date' (Time Dimension)
 Created 'dim_customer'
Created 'dim_product'
 Created 'fact_behavior' with Time Dimension
 Enhanced Data Warehouse 'user_behavior.db' created successfully.


### STEP 6: COMPREHENSIVE OLAP QUERIES

In [7]:
print("\n--- Running Multiple OLAP Operations ---")
conn = None
try:
    conn = sqlite3.connect('user_behavior.db')
    
    # OLAP 1: Basic Aggregation with Grouping
    print("\n OLAP Query 1: Revenue by Category and Gender")
    olap_query1 = """
    SELECT
        p."Product Category",
        c.Gender,
        SUM(f."Total Purchase Amount") AS TotalSpend,
        COUNT(*) AS TotalOrders,
        AVG(f."Total Purchase Amount") AS AvgOrderValue
    FROM fact_behavior f
    JOIN dim_customer c ON f."Customer ID" = c."Customer ID"
    JOIN dim_product p ON f.ProductID = p.ProductID
    GROUP BY p."Product Category", c.Gender
    ORDER BY TotalSpend DESC
    LIMIT 10;
    """
    olap_result1 = pd.read_sql_query(olap_query1, conn)
    print(olap_result1)

    # OLAP 2: Time-based Analysis (Drill-Down)
    print("\n OLAP Query 2: Quarterly Revenue Trend (DRILL-DOWN)")
    olap_query2 = """
    SELECT
        d.Year,
        d.Quarter,
        SUM(f."Total Purchase Amount") AS QuarterlyRevenue,
        COUNT(*) AS TotalTransactions
    FROM fact_behavior f
    JOIN dim_date d ON f.DateID = d.DateID
    GROUP BY d.Year, d.Quarter
    ORDER BY d.Year, d.Quarter;
    """
    olap_result2 = pd.read_sql_query(olap_query2, conn)
    print(olap_result2)

    # OLAP 3: Slice Operation
    print("\n OLAP Query 3: Churn Analysis by Product Category (SLICE)")
    olap_query3 = """
    SELECT
        p."Product Category",
        SUM(CASE WHEN f.Churn = 1 THEN 1 ELSE 0 END) AS ChurnedCustomers,
        COUNT(*) AS TotalCustomers,
        ROUND(100.0 * SUM(CASE WHEN f.Churn = 1 THEN 1 ELSE 0 END) / COUNT(*), 2) AS ChurnRate
    FROM fact_behavior f
    JOIN dim_product p ON f.ProductID = p.ProductID
    GROUP BY p."Product Category"
    ORDER BY ChurnRate DESC;
    """
    olap_result3 = pd.read_sql_query(olap_query3, conn)
    print(olap_result3)

    # OLAP 4: Pivot-style Query
    print("\n OLAP Query 4: Payment Method Distribution by Quarter (PIVOT)")
    olap_query4 = """
    SELECT
        f."Payment Method",
        SUM(CASE WHEN d.Quarter = 1 THEN f."Total Purchase Amount" ELSE 0 END) AS Q1_Revenue,
        SUM(CASE WHEN d.Quarter = 2 THEN f."Total Purchase Amount" ELSE 0 END) AS Q2_Revenue,
        SUM(CASE WHEN d.Quarter = 3 THEN f."Total Purchase Amount" ELSE 0 END) AS Q3_Revenue,
        SUM(CASE WHEN d.Quarter = 4 THEN f."Total Purchase Amount" ELSE 0 END) AS Q4_Revenue
    FROM fact_behavior f
    JOIN dim_date d ON f.DateID = d.DateID
    GROUP BY f."Payment Method";
    """
    olap_result4 = pd.read_sql_query(olap_query4, conn)
    print(olap_result4)

except Exception as e:
    print(f" Error running OLAP queries: {e}")
finally:
    if conn:
        conn.close()



--- Running Multiple OLAP Operations ---

 OLAP Query 1: Revenue by Category and Gender
  Product Category  Gender  TotalSpend  TotalOrders  AvgOrderValue
0         Clothing  Female   103723243        37946    2733.443393
1            Books  Female   102518477        37473    2735.795826
2            Books    Male   102421124        37439    2735.680013
3         Clothing    Male   100809162        37106    2716.788713
4      Electronics    Male    68372213        25057    2728.667159
5      Electronics  Female    68227254        25128    2715.188395
6             Home  Female    67993447        25013    2718.324351
7             Home    Male    67277763        24838    2708.662654

 OLAP Query 2: Quarterly Revenue Trend (DRILL-DOWN)
    Year  Quarter  QuarterlyRevenue  TotalTransactions
0   2020        1          45541363              16764
1   2020        2          45224012              16772
2   2020        3          46928563              17211
3   2020        4          47582236

### STEP 7: HMM SESSIONIZATION & MODELING

In [8]:
print("\n--- Starting HMM Sessionization ---")
try:
    le_hmm = LabelEncoder()
    df_full['Category_Code'] = le_hmm.fit_transform(df_full['Product Category'])

    df_full.sort_values(by=['Customer ID', 'Purchase Date'], inplace=True)

    sequences = df_full.groupby('Customer ID')['Category_Code'].apply(list)
    sequences = sequences[sequences.apply(len) > 0]
    lengths = [len(s) for s in sequences]
    
    X_hmm = np.concatenate(sequences.values).reshape(-1, 1)

    print(f" Prepared {len(lengths)} sequences for HMM.")

    n_hidden_states = 4
    hmm_model = hmm.MultinomialHMM(n_components=n_hidden_states, random_state=42, n_iter=100)
    print("--- Training HMM ---")
    hmm_model.fit(X_hmm, lengths)
    print(" HMM Model Trained.")

    plt.figure(figsize=(7, 5))
    sns.heatmap(hmm_model.transmat_, annot=True, cmap='viridis', fmt='.3f')
    plt.title(f'HMM Hidden State Transition Matrix (k={n_hidden_states})')
    plt.xlabel('To State')
    plt.ylabel('From State')
    plt.savefig("plot_hmm_transition_matrix.png")
    print(" Saved 'plot_hmm_transition_matrix.png'")
    plt.clf()

except Exception as e:
    print(f" Error during HMM Step: {e}")
    print("--- Skipping HMM and continuing script ---")




--- Starting HMM Sessionization ---


https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


 Prepared 49673 sequences for HMM.
--- Training HMM ---
 HMM Model Trained.
 Saved 'plot_hmm_transition_matrix.png'


<Figure size 700x500 with 0 Axes>

### STEP 8: BAYESIAN NETWORK (Structure Learning) 

In [9]:

print("\n--- Learning Bayesian Network Structure ---")
try:
    bn_cols = ['Total Purchase Amount', 'Quantity', 'Returns', 'Payment Method', 'Customer Age', 'Churn']
    df_bn = df[bn_cols].copy() 

    # Discretize continuous variables for pgmpy
    print("--- Discretizing continuous variables ---")
    df_bn['Total Purchase Amount'] = pd.cut(df_bn['Total Purchase Amount'], 
                                             bins=5, labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'])
    df_bn['Quantity'] = pd.cut(df_bn['Quantity'], bins=3, labels=['Low', 'Medium', 'High'])
    df_bn['Customer Age'] = pd.cut(df_bn['Customer Age'], bins=4, labels=['Young', 'Adult', 'MiddleAged', 'Senior'])
    df_bn['Returns'] = pd.cut(df_bn['Returns'], bins=3, labels=['NoReturns', 'FewReturns', 'ManyReturns'])

    for col in df_bn.columns:
        df_bn[col] = df_bn[col].astype('category')

    print(f" Discretized data for Bayesian Network")

    print("--- Starting Hill Climb Search ---")
    hc = HillClimbSearch(df_bn)
    scoring_method = BDeu(data=df_bn, equivalent_sample_size=50)
    best_model_structure = hc.estimate(scoring_method=scoring_method, max_iter=1000)
    
    if len(best_model_structure.edges()) == 0:
        print(" No edges found. Creating manual structure.")
        bn_model = DiscreteBayesianNetwork([
            ('Payment Method', 'Churn'),
            ('Total Purchase Amount', 'Churn'),
            ('Customer Age', 'Churn'),
            ('Quantity', 'Returns'),
            ('Returns', 'Churn')
        ])
    else:
        bn_model = DiscreteBayesianNetwork(best_model_structure.edges())

    print(f" Bayesian Network Created. Edges: {list(bn_model.edges())}")

    if len(bn_model.edges()) > 0:
        plt.figure(figsize=(12, 8))
        G = nx.DiGraph()
        G.add_edges_from(bn_model.edges())
        pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
        
        nx.draw_networkx_nodes(G, pos, node_size=3000, node_color='lightblue', 
                               alpha=0.9, linewidths=2, edgecolors='navy')
        nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')
        nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True, 
                               arrowsize=20, arrowstyle='->', width=2,
                               connectionstyle='arc3,rad=0.1')
        
        plt.title("Learned Bayesian Network Structure", fontsize=16, fontweight='bold')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig("plot_bayesian_network_graph.png", dpi=300, bbox_inches='tight')
        print(" Saved 'plot_bayesian_network_graph.png'")
        plt.clf()
        
        print("\n Bayesian Network Edges:")
        for edge in bn_model.edges():
            print(f"   {edge[0]} → {edge[1]}")
    else:
        print(" No edges to visualize")

except Exception as e:
    print(f" Error during Bayesian Network: {e}")
    import traceback
    traceback.print_exc()


INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'Total Purchase Amount': 'O', 'Quantity': 'O', 'Returns': 'O', 'Payment Method': 'C', 'Customer Age': 'O', 'Churn': 'C'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'Total Purchase Amount': 'O', 'Quantity': 'O', 'Returns': 'O', 'Payment Method': 'C', 'Customer Age': 'O', 'Churn': 'C'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'Total Purchase Amount': 'O', 'Quantity': 'O', 'Returns': 'O', 'Payment Method': 'C', 'Customer Age': 'O', 'Churn': 'C'}



--- Learning Bayesian Network Structure ---
--- Discretizing continuous variables ---
 Discretized data for Bayesian Network
--- Starting Hill Climb Search ---


  0%|          | 1/1000 [00:00<06:51,  2.43it/s]


 Bayesian Network Created. Edges: [('Customer Age', 'Total Purchase Amount')]
 Saved 'plot_bayesian_network_graph.png'

 Bayesian Network Edges:
   Customer Age → Total Purchase Amount


<Figure size 1200x800 with 0 Axes>

### STEP 9: EXPLORATORY DATA ANALYSIS

In [10]:
print("\n--- Starting Visualization ---")

df.hist(figsize=(12, 8), bins=20, color='skyblue', edgecolor='black')
plt.suptitle("Distribution of Numeric Features", fontsize=14)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("plot_histograms.png")
print(" Saved 'plot_histograms.png'")
plt.clf()

plt.figure(figsize=(12, 8)) 
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt=".1f", annot_kws={"size": 8})
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.savefig("plot_heatmap.png")
print(" Saved 'plot_heatmap.png'")
plt.clf()

if 'Churn' in df.columns:
    plt.figure(figsize=(5, 4))
    df['Churn'].value_counts().plot(kind='bar', color=['lightgreen', 'salmon'])
    plt.title("Target Class Distribution (Churn)")
    plt.xlabel("0 = Stay | 1 = Churn")
    plt.ylabel("Count")
    plt.savefig("plot_churn_distribution.png")
    print(" Saved 'plot_churn_distribution.png'")
    plt.clf()



--- Starting Visualization ---
 Saved 'plot_histograms.png'
 Saved 'plot_heatmap.png'
 Saved 'plot_churn_distribution.png'


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 500x400 with 0 Axes>

### STEP 10: FEATURE SELECTION & SCALING

In [11]:
X = df.drop('Churn', axis=1)
y = df['Churn']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns) 
print("\n Features Selected and Scaled")


 Features Selected and Scaled


### STEP 11: K-MEANS CLUSTERING

In [12]:
print("\n--- Finding Optimal K for K-Means ---")
inertia = []
K = range(1, 11)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(K, inertia, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.savefig("plot_kmeans_elbow.png")
print(" Saved 'plot_kmeans_elbow.png'")
plt.clf()


--- Finding Optimal K for K-Means ---
 Saved 'plot_kmeans_elbow.png'


<Figure size 800x500 with 0 Axes>

### STEP 12: K-MEANS SEGMENTATION

In [13]:
k_optimal = 4
kmeans = KMeans(n_clusters=k_optimal, random_state=42, n_init=10)
kmeans.fit(X_scaled)
df['Cluster'] = kmeans.labels_
print(f"\n K-Means Clustering Complete. {k_optimal} clusters created.")

plt.figure(figsize=(8, 5))
sns.countplot(x='Cluster', hue='Churn', data=df, palette=['lightgreen', 'salmon'])
plt.title('Churn Distribution Within Each Customer Cluster')
plt.xlabel('Customer Cluster')
plt.ylabel('Count')
plt.legend(title='Churn', labels=['0 = Stay', '1 = Churn'])
plt.savefig("plot_cluster_churn_distribution.png")
print(" Saved 'plot_cluster_churn_distribution.png'")
plt.clf()

INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.



 K-Means Clustering Complete. 4 clusters created.


INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


 Saved 'plot_cluster_churn_distribution.png'


<Figure size 800x500 with 0 Axes>

### STEP 13: TRAIN-TEST SPLIT

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print("\nTraining Data:", X_train.shape)
print("Testing Data:", X_test.shape)


Training Data: (200000, 8)
Testing Data: (50000, 8)


### STEP 14: MODEL TRAINING (Random Forest + Decision Tree)

In [15]:
# Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
print("\n--- Training Random Forest ---")
model.fit(X_train, y_train)
print(" Random Forest Training Complete")

# Decision Tree
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
print("--- Training Decision Tree ---")
dt_model.fit(X_train, y_train)
print(" Decision Tree Training Complete")


--- Training Random Forest ---
 Random Forest Training Complete
--- Training Decision Tree ---
 Decision Tree Training Complete


### STEP 15: MODEL EVALUATION

In [16]:
# Random Forest Evaluation
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\n Random Forest Evaluation:")
print("Accuracy:", round(acc * 100, 2), "%")
print("F1 Score:", round(f1, 3))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

plt.figure(figsize=(5, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - Random Forest")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig("plot_confusion_matrix_rf.png")
print(" Saved 'plot_confusion_matrix_rf.png'")
plt.clf()

# Decision Tree Evaluation
y_pred_dt = dt_model.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print("\n Decision Tree Accuracy:", round(acc_dt * 100, 2), "%")

plt.figure(figsize=(5, 4))
sns.heatmap(confusion_matrix(y_test, y_pred_dt), annot=True, fmt='d', cmap='Greens')
plt.title("Confusion Matrix - Decision Tree")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig("plot_confusion_matrix_dt.png")
print(" Saved 'plot_confusion_matrix_dt.png'")
plt.clf()


 Random Forest Evaluation:
Accuracy: 79.35 %
F1 Score: 0.712

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.99      0.88     40016
           1       0.20      0.01      0.02      9984

    accuracy                           0.79     50000
   macro avg       0.50      0.50      0.45     50000
weighted avg       0.68      0.79      0.71     50000

 Saved 'plot_confusion_matrix_rf.png'

 Decision Tree Accuracy: 80.03 %
 Saved 'plot_confusion_matrix_dt.png'


<Figure size 500x400 with 0 Axes>

<Figure size 500x400 with 0 Axes>

### STEP 16: FEATURE IMPORTANCE

In [None]:

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = X.columns 

plt.figure(figsize=(10, 6))
sns.barplot(x=importances[indices], y=feature_names[indices], palette='viridis')
plt.title("Feature Importance (Random Forest)")
plt.savefig("plot_feature_importance.png")
print("Saved 'plot_feature_importance.png'")
plt.clf()

# Decision Tree Visualization
plt.figure(figsize=(20, 10))
plot_tree(dt_model, feature_names=X.columns, class_names=['Stay', 'Churn'], 
            filled=True, rounded=True, fontsize=10)
plt.title('Decision Tree for Churn Prediction')
plt.savefig('plot_decision_tree.png', dpi=300, bbox_inches='tight')
print(" Saved 'plot_decision_tree.png'")
plt.clf()

print("\n --- Script Finished Successfully ---")
print("\n Summary of Outputs:")
print("  - Data Warehouse: user_behavior.db (with Time Dimension)")
print("  - OLAP Queries: 4 comprehensive queries executed")
print("  - HMM Model: Trained and transition matrix saved")
print("  - Bayesian Network: Structure learned and visualized")
print("  - Clustering: K-Means with 4 clusters")
print("  - Classification: Random Forest + Decision Tree")
print("  - Visualizations: 11 plots saved")

Saved 'plot_feature_importance.png'
 Saved 'plot_decision_tree.png'

 --- Script Finished Successfully ---

 Summary of Outputs:
  - Data Warehouse: user_behavior.db (with Time Dimension)
  - OLAP Queries: 4 comprehensive queries executed
  - HMM Model: Trained and transition matrix saved
  - Bayesian Network: Structure learned and visualized
  - Clustering: K-Means with 4 clusters
  - Classification: Random Forest + Decision Tree
  - Visualizations: 11 plots saved


<Figure size 1000x600 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>