<a href="https://colab.research.google.com/github/BianchiLuca28/FHNW-BI-LMS/blob/main/notebooks/notebook1_luca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from scipy import stats
from imblearn.over_sampling import SMOTE

# Importing dataset

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset folder used
# below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder on the mounted Drive that contains the dataset CSV.
folder_path = "/content/drive/MyDrive/__Shared/BI"

In [None]:
# Read the dataset CSV from the shared folder and preview the first rows.
dataset_csv = f"{folder_path}/preprocessed_flattened_dataset.csv"
df = pd.read_csv(dataset_csv)
df.head()

# Exploratory Data Analysis (EDA)

### Basic Data Exploration

In [None]:
# Sections rendered with the notebook's rich display: first rows, then the
# describe() summary statistics.
for heading, table in (
    ("Dataset Overview:", df.head()),
    ("\nSummary Statistics:", df.describe()),
):
    print(heading)
    display(table)

# Sections printed as plain text: column dtypes, then null counts per column.
for heading, series in (
    ("\nData Types and Missing Values:", df.dtypes),
    ("\nMissing Values Count:", df.isnull().sum()),
):
    print(heading)
    print(series)

### Missing Values Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the null mask of `df` (rows x columns), without colour bar or row labels.
missing_mask = df.isnull()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(missing_mask, cbar=False, cmap='viridis', yticklabels=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

### Correlation Heatmap for Numerical Features

In [None]:
# Restrict the frame to its int64/float64 columns and compute their pairwise correlations.
numerical_features = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = numerical_features.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features")
plt.show()

### Visualizing Feature Distributions

In [None]:
# Names of the int64/float64 columns; later plotting cells iterate over this Index.
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

# One histogram with a KDE overlay per numerical column.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel(feature)
    plt.show()

### Box Plots for Outlier Detection

In [None]:
# One horizontal box plot per numerical column to eyeball outliers.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f"Box Plot for {feature} (Outlier Detection)")
    ax.set_xlabel(feature)
    plt.show()

### Feature Interaction Plots

In [None]:
# Pair plot for numerical features. The frame is down-sampled so the grid of
# scatter plots stays tractable. The sample size is capped at the number of
# available rows (DataFrame.sample raises when n exceeds it without
# replacement) and seeded so the figure is the same on every run.
PAIRPLOT_MAX_ROWS = 5000
sampled_data = df.sample(n=min(PAIRPLOT_MAX_ROWS, len(df)), random_state=42)
sns.pairplot(sampled_data[numerical_features])
plt.show()

### Target Variable Analysis

In [None]:
# Plotting the class distribution of the target variable.
# The column is passed as `x=` explicitly: in current seaborn releases the
# first positional argument of countplot is `data`, so a positional Series is
# not interpreted as the variable to count.
plt.figure(figsize=(8, 6))
sns.countplot(x=df['service_type'])
plt.title("Class Distribution of Service Type")
plt.xlabel("Service Type")
plt.ylabel("Count")
plt.show()

#### Special class of the service type

In [None]:
# Keep only the rows whose service type equals 'Special'.
is_special = df['service_type'].eq('Special')
special_cases = df.loc[is_special]

In [None]:
# Render the 'Special' subset as the cell output.
special_cases

In [None]:
# Column names of the int64/float64 features.
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

# Text summary of the 'Special' subset: describe() statistics, then the first rows.
print("Special Cases Analysis:")
for summary in (special_cases.describe(), special_cases.head()):
    print(summary)

# Box plots of all numerical features of the 'Special' subset on a single axis.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=special_cases[numerical_features], ax=ax)
ax.set_title('Boxplot of Numerical Features for Special Service Type')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Customer price distribution per service type (one box per class).
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='service_type', y='customer_price', data=df, ax=ax)
ax.set_title('Comparison of Customer Price Across Service Types')
ax.set_xlabel('Service Type')
ax.set_ylabel('Customer Price')
plt.show()

In [None]:
# Create a new dataframe with the columns needed for the margin estimate.
# An explicit copy is taken so that adding the derived column below does not
# write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
df_margin_analysis = df[['service_type', 'customer_price', 'final_carrier_price', 'expected_carrier_price']].copy()

# Carrier cost: final_carrier_price where present, otherwise expected_carrier_price.
carrier_price = df_margin_analysis['final_carrier_price'].fillna(df_margin_analysis['expected_carrier_price'])

# Derived margin estimate: customer price minus carrier cost.
df_margin_analysis['derived_margin'] = df_margin_analysis['customer_price'] - carrier_price

In [None]:
# Derived margin distribution per service type (one box per class).
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='service_type', y='derived_margin', data=df_margin_analysis, ax=ax)
ax.set_title('Comparison of Derived Margin Across Service Types')
ax.set_xlabel('Service Type')
ax.set_ylabel('Derived Margin')
plt.show()

### Feature Importance Analysis (initial)

This code crashes because it performs `get_dummies`. Change it to work like the next one.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Downsample the data to avoid memory overload
df_sampled = df.sample(frac=0.1, random_state=42)  # Use only 10% of the data

# Separate the target column before any transformations
target_column = 'service_type'
y = df_sampled[target_column]

# Drop the target column from the feature set
X = df_sampled.drop(columns=[target_column])

# Split numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Missing values are only handled later in the notebook, so fill numerical
# gaps with the column median here. scikit-learn versions without native
# missing-value support in RandomForestClassifier raise on NaN input.
X[numerical_cols] = X[numerical_cols].fillna(X[numerical_cols].median())

# Handle categorical columns: label-encode the high-cardinality ones and
# one-hot encode the rest.
HIGH_CARDINALITY_THRESHOLD = 10  # more unique values than this => label encoding
high_cardinality_cols = [c for c in categorical_cols if X[c].nunique() > HIGH_CARDINALITY_THRESHOLD]
low_cardinality_cols = [c for c in categorical_cols if c not in high_cardinality_cols]

for col in high_cardinality_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# A single get_dummies call yields the same columns as encoding one column at
# a time, without rebuilding the whole frame on every iteration.
if low_cardinality_cols:
    X = pd.get_dummies(X, columns=low_cardinality_cols, drop_first=True)

# Encode categorical target column if not already encoded
if y.dtype == 'object':
    le_target = LabelEncoder()
    y = le_target.fit_transform(y)

# Train a RandomForestClassifier for feature importance analysis
rf = RandomForestClassifier(n_estimators=20, random_state=42)  # Reduced number of estimators to limit memory usage
rf.fit(X, y)

# Plot the ten largest feature importances
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)[:10]
plt.figure(figsize=(10, 6))
sns.barplot(x=importances, y=importances.index)
plt.title("Top 10 Feature Importances (Initial Analysis)")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.show()

### Correlation with Target Variable

In [None]:
# For every numerical feature, draw its distribution split by service type.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=df['service_type'], y=df[feature], ax=ax)
    ax.set_title(f"Relationship between {feature} and Service Type")
    ax.set_xlabel("Service Type")
    ax.set_ylabel(feature)
    plt.show()

# Preprocessing

## Removing columns that are unavailable at prediction time

In [None]:
# Columns removed from `df` before modelling (dropped in the following cell).
# Per the section heading these are not available at prediction time; the list
# covers delivery/pickup date parts, final price and margin fields, domain and
# service names, and the shipment identifier.
columns_to_exclude = [
    'year_delivery_date', 'month_delivery_date',
    'quarter_delivery_date', 'year_real_delivery_date',
    'month_real_delivery_date', 'quarter_real_delivery_date',
    'final_carrier_price',
    'margin', 'lms_plus',
    'year_pickup_date', 'month_pickup_date', 'quarter_pickup_date',
    'year_real_pickup_date', 'month_real_pickup_date', 'quarter_real_pickup_date',
    'domain_name_service',
    'domain_name_customer',
    'transport_type', 'shipment_id',
    'name_service'
]

In [None]:
# Remove the excluded columns so they cannot leak label information into the features.
# (`columns=` already selects the column axis, so no separate `axis` is needed.)
df = df.drop(columns=columns_to_exclude)

## Handling missing values

In [None]:
# Null count per column, restricted to the columns that have at least one null.
na_counts = df.isna().sum()
na_counts[na_counts > 0]

In [None]:
# Drop columns with too many missing values (more than 70% missing).
# `dropna(thresh=...)` is the minimum number of NON-missing values a column
# needs in order to be kept, so it has to be derived from the share of values
# that must be present, not from the share that may be missing.
MAX_MISSING_FRACTION = 0.7
# Largest tolerated number of missing values; the epsilon guards against
# floating-point round-off just below a whole number.
max_missing = int(np.floor(len(df) * MAX_MISSING_FRACTION + 1e-9))
threshold = len(df) - max_missing
df = df.dropna(axis=1, thresh=threshold)

# Fill numeric columns with median and categorical columns with 'missing'
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
df[categorical_cols] = df[categorical_cols].fillna('missing')

## Label Encoding

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Replace the target column with integer codes; `le_target` keeps the mapping
# back to the original class labels.
target_column = 'service_type'
le_target = LabelEncoder().fit(df[target_column])
df[target_column] = le_target.transform(df[target_column])

In [None]:
# Target vector and feature matrix (every column except the target).
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Encoding Strategy for Features
# Split categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Label Encoding for high-cardinality features, One-Hot Encoding for low-cardinality features
HIGH_CARDINALITY_THRESHOLD = 10  # more unique values than this => label encoding
high_cardinality_cols = [c for c in categorical_cols if X[c].nunique() > HIGH_CARDINALITY_THRESHOLD]
low_cardinality_cols = [c for c in categorical_cols if c not in high_cardinality_cols]

# Label-encode each high-cardinality column in place (values are cast to str first).
for col in high_cardinality_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# One-hot encode all low-cardinality columns in a single call. This produces
# the same columns, in the same order, as encoding them one at a time, but
# builds the widened frame once instead of once per column.
if low_cardinality_cols:
    X = pd.get_dummies(X, columns=low_cardinality_cols, drop_first=True)

## Train Test Split, Outliers, SMOTE, and Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from scipy import stats
import numpy as np

def custom_train_test_preprocess(
    X, y, special_class="Special", max_synthetic=200, test_size=0.2,
    threshold=3, random_state=42, min_special_test=3
):
    """
    Split, clean, rebalance and scale a dataset that contains one rare class.

    Steps:
    1. Stratified train-test split, then move special-class rows from the
       training set to the test set until the test set holds at least
       ``min_special_test`` of them (or the training set has none left).
    2. Remove outliers from the numerical features of the training set,
       never touching rows of the special class.
    3. Over-sample the special class in the training set with SMOTE.
    4. Standardise the numerical features; the scaler is fitted on the
       (resampled) training set and applied to the test set.

    Parameters:
        X (pd.DataFrame): Features dataset.
        y (pd.Series): Target variable, indexed like ``X``.
        special_class: Label of the rare class exactly as it appears in ``y``
            (e.g. the encoded integer if ``y`` has been label-encoded).
        max_synthetic (int): Number of special-class samples the training set
            is over-sampled up to. imbalanced-learn interprets the value in a
            ``sampling_strategy`` dict as the class size after resampling.
            If the class already has at least this many rows, SMOTE is skipped.
        test_size (float): Proportion of the dataset to include in the test split.
        threshold (float): Absolute z-score at or above which a value counts as an outlier.
        random_state (int): Random state for reproducibility.
        min_special_test (int): Minimum number of special-class rows wanted in
            the test set.

    Returns:
        X_train_resampled, X_test_scaled, y_train_resampled, y_test: Preprocessed data.

    Raises:
        ValueError: If ``special_class`` does not occur in ``y``. This usually
            means the label was passed in a different encoding than ``y`` uses.
    """
    if not np.asarray(y == special_class).any():
        raise ValueError(
            f"special_class {special_class!r} does not occur in y; "
            "pass the label in the same encoding as y"
        )

    # Step 1: Train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Top up the test set with special-class rows if the split left too few.
    shortfall = min_special_test - int((y_test == special_class).sum())
    if shortfall > 0:
        # Only rows that are in the training split can be moved; looking them
        # up via y_train avoids a KeyError for labels that went to the test split.
        movable = y_train[y_train == special_class].index[:shortfall]
        X_test = pd.concat([X_test, X_train.loc[movable]], axis=0)
        y_test = pd.concat([y_test, y_train.loc[movable]], axis=0)
        X_train = X_train.drop(movable)
        y_train = y_train.drop(movable)

    # Step 2: Remove outliers from non-special cases in training set
    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    special_mask = y_train == special_class
    X_train_special = X_train[special_mask]
    X_train_non_special = X_train[~special_mask]
    y_train_special = y_train[special_mask]
    y_train_non_special = y_train[~special_mask]

    if len(numerical_cols) > 0 and len(X_train_non_special) > 0:
        values = X_train_non_special[numerical_cols].to_numpy(dtype=float)
        with np.errstate(divide='ignore', invalid='ignore'):
            z_scores = np.abs(stats.zscore(values, axis=0))
        # A constant (or NaN-containing) column has NaN z-scores. NaN compares
        # False against the threshold, which would discard every row, so NaN
        # is treated as "not an outlier".
        inlier_filter = ((z_scores < threshold) | np.isnan(z_scores)).all(axis=1)
        X_train_non_special = X_train_non_special[inlier_filter]
        y_train_non_special = y_train_non_special[inlier_filter]

    # Combine special and non-special data back together
    X_train = pd.concat([X_train_special, X_train_non_special], axis=0)
    y_train = pd.concat([y_train_special, y_train_non_special], axis=0)

    # Step 3: Apply SMOTE to the training set for the special class
    n_special = int((y_train == special_class).sum())
    if n_special >= 2 and max_synthetic > n_special:
        smote = SMOTE(
            sampling_strategy={special_class: max_synthetic},
            # SMOTE needs more class members than neighbours; shrink the
            # default of 5 when the rare class is very small.
            k_neighbors=min(5, n_special - 1),
            random_state=random_state,
        )
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    else:
        # Too few rows to interpolate between, or the class is already at
        # least as large as requested: leave the training set as it is.
        X_train_resampled, y_train_resampled = X_train.copy(), y_train.copy()

    # Step 4: Scale numerical features
    # Copy first so the scaled values are not written into a frame that may
    # share data with the caller's X.
    X_test = X_test.copy()
    scaler = StandardScaler()
    if len(numerical_cols) > 0:
        X_train_resampled[numerical_cols] = scaler.fit_transform(X_train_resampled[numerical_cols])
        X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    return X_train_resampled, X_test, y_train_resampled, y_test

In [None]:
# `y` was label-encoded with `le_target` above, so the rare class has to be
# passed as its encoded value. The raw string "Special" matches no row of `y`.
special_label = le_target.transform(["Special"])[0]

X_train, X_test, y_train, y_test = custom_train_test_preprocess(
    X, y, special_class=special_label, max_synthetic=200, test_size=0.2, random_state=42
)

## Feature selection

In [None]:
# Fit a random forest on the training data purely to rank the features.
rf = RandomForestClassifier(n_estimators=50, random_state=42)  # Reduced number of estimators for simplicity
rf.fit(X_train, y_train)

# Keep the features whose importance exceeds 0.01 (adjust threshold as needed).
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
is_important = feature_importances > 0.01
important_features = feature_importances.index[is_important].tolist()

# Restrict both splits to the selected features.
X_selected_train = X_train.loc[:, important_features]
X_selected_test = X_test.loc[:, important_features]

print("Selected Features:", important_features)

# Model Training & Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# List of models to train
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    # No explicit `multi_class`: the argument is deprecated in recent
    # scikit-learn, and the default already fits a multinomial model for
    # multiclass targets with the default solver.
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
    # `scale_pos_weight=1` was dropped: 1 is the default, so it had no effect
    # and did not handle class imbalance.
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
}

In [None]:
# Prepare training and test data: from here on the models use only the
# selected features. `y_train` / `y_test` are used unchanged, so the former
# self-assignment of those two names was a no-op and has been removed.
X_train, X_test = X_selected_train, X_selected_test

In [None]:
# Per-model metrics, keyed by model name.
results = {}

for model_name, model in models.items():
    # 5-fold cross-validated accuracy on the training data.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
    print(f"{model_name} Cross-Validation Accuracy: {cv_mean:.4f} (+/- {cv_std:.4f})")

    # Fit on the full training set and predict the held-out test set.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Class probabilities when the estimator exposes them; for a two-column
    # output only the second column (positive class) is kept.
    y_pred_prob = None
    if hasattr(model, "predict_proba"):
        y_pred_prob = model.predict_proba(X_test)
        if y_pred_prob.shape[1] == 2:
            y_pred_prob = y_pred_prob[:, 1]

    # Test-set metrics.
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=True)
    confusion = confusion_matrix(y_test, y_pred)

    # ROC AUC: a plotted curve for two classes, a one-vs-rest score for more.
    n_test_classes = len(set(y_test))
    roc_auc = None
    if y_pred_prob is not None:
        if n_test_classes == 2:
            roc_auc = roc_auc_score(y_test, y_pred_prob)
            fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
            fig, ax = plt.subplots()
            ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
            ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title(f'Receiver Operating Characteristic for {model_name}')
            ax.legend(loc='lower right')
            plt.show()
        elif n_test_classes > 2:
            roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
            print(f"Multiclass ROC AUC: {roc_auc:.4f}")

    # Keep everything needed for the cross-model comparison.
    results[model_name] = {
        "Accuracy": accuracy,
        "Cross-Validation Accuracy": cv_mean,
        "Cross-Validation Std Dev": cv_std,
        "Classification Report": class_report,
        "Confusion Matrix": confusion,
        "ROC AUC": roc_auc,
    }

    # Human-readable report for this model.
    print(f"\nModel: {model_name}")
    print(f"Test Set Accuracy: {accuracy:.4f}")
    if roc_auc is not None:
        print(f"ROC AUC: {roc_auc:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion)
    print("\n" + "="*60 + "\n")

In [None]:
# Collect the scalar metrics of every model into one table for comparison.
summary_columns = ["Accuracy", "Cross-Validation Accuracy", "Cross-Validation Std Dev", "ROC AUC"]
comparison_data = [
    [model_name] + [metrics[column] for column in summary_columns]
    for model_name, metrics in results.items()
]
comparison_df = pd.DataFrame(comparison_data, columns=["Model"] + summary_columns)

print("Comparison of Model Performance:")
print(comparison_df)

In [None]:
# The previous cell already builds and prints `comparison_df`; this cell used
# to repeat it verbatim. It now reuses that table and shows the models ranked
# by test-set accuracy, using the notebook's rich display.
display(comparison_df.sort_values("Accuracy", ascending=False).reset_index(drop=True))

## Hyperparameter Tuning for XGBoost

In [None]:
# from sklearn.model_selection import GridSearchCV
# import numpy as np
# import xgboost as xgb

# # Define the XGBoost model
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, scale_pos_weight=1)

# # Define the parameter grid for hyperparameter tuning
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 7],
#     'min_child_weight': [1, 3, 5],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }

# # Use GridSearchCV to search for the best combination of hyperparameters
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# # Fit GridSearchCV on the training data
# grid_search.fit(X_selected_train, y_train)

# # Get the best parameters and best score from the grid search
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Parameters from Grid Search:", best_params)
# print("Best Cross-Validation Accuracy from Grid Search:", best_score)

In [None]:
# # Train the XGBoost model using the best parameters
# best_xgb_model = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='mlogloss', random_state=42, scale_pos_weight=1)
# best_xgb_model.fit(X_selected_train, y_train)

# # Make predictions on the test set
# y_pred_best = best_xgb_model.predict(X_selected_test)

# # Evaluate the tuned model's performance
# accuracy_best = accuracy_score(y_test, y_pred_best)
# classification_report_best = classification_report(y_test, y_pred_best)
# confusion_matrix_best = confusion_matrix(y_test, y_pred_best)

# print(f"Test Set Accuracy (Tuned XGBoost): {accuracy_best:.4f}")
# print("Classification Report (Tuned XGBoost):")
# print(classification_report_best)
# print("Confusion Matrix (Tuned XGBoost):")
# print(confusion_matrix_best)

# Analysis of Results

### Summary of Business Impact

The Random Forest model provides key insights for optimizing delivery services:
1. **Feature Importance**: The top features driving the predictions indicate which factors most impact the service type, providing insights into areas that can be optimized.
2. **Cost Matrix**: By analyzing the cost associated with false positives and false negatives, the model helps in understanding the financial implications of incorrect predictions. For example, reducing false positives might lower operational costs.
3. **Operational Efficiency**: By accurately predicting the service type, resources can be allocated more effectively, improving overall efficiency.
4. **Customer Targeting**: The insights from feature importance can help target specific customer segments with tailored services, increasing satisfaction and loyalty.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Fit the final random forest on the selected training features.
final_rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(X_selected_train, y_train)

# Test-set class probabilities and hard predictions.
y_pred_prob = final_rf.predict_proba(X_selected_test)
y_pred = final_rf.predict(X_selected_test)

In [None]:
# Test-set metrics of the final model: overall accuracy, confusion matrix,
# and the per-class report as a dict (consumed by the summary cell below).
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)

In [None]:
# Calculate ROC AUC for multiclass (one-vs-rest)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')

# Plot one one-vs-rest ROC curve per class.
# Column k of predict_proba belongs to final_rf.classes_[k], so each column is
# paired with its actual class label instead of assuming that the labels are
# exactly 0..n_classes-1.
plt.figure(figsize=(10, 6))
for column_index, class_label in enumerate(final_rf.classes_):
    fpr, tpr, _ = roc_curve(y_test == class_label, y_pred_prob[:, column_index])
    plt.plot(fpr, tpr, lw=2, label=f'Class {class_label} (area = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Random Forest (Multiclass)')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Annotated confusion-matrix heatmap; both axes are labelled with the model's classes.
class_labels = final_rf.classes_
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Random Forest')
plt.show()

In [None]:
# Cost Matrix Analysis
# Example 2x2 costs (adjust to the business context):
# [[TN cost, FP cost], [FN cost, TP cost]]
cost_matrix = np.array([[0, 10], [5, 0]])

# Map the multiclass confusion matrix onto the 2x2 costs: class index 0 is
# treated as "negative" and every other index as "positive". Note that a
# confusion between two non-zero classes therefore uses the [1, 1] entry.
binary_index = np.minimum(np.arange(len(confusion)), 1)
per_cell_cost = cost_matrix[np.ix_(binary_index, binary_index)]
cost = (confusion * per_cell_cost).sum()
print(f"Total Cost Based on Cost Matrix: {cost}")

In [None]:
# Heatmap of the 2x2 cost matrix with actual classes on rows and predicted classes on columns.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cost_matrix,
    annot=True,
    cmap='Reds',
    fmt='g',
    xticklabels=['Predicted Negative', 'Predicted Positive'],
    yticklabels=['Actual Negative', 'Actual Positive'],
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Cost Matrix Visualization')
plt.show()

In [None]:
# Ten largest feature importances of the final model, as a horizontal bar chart.
importances = (
    pd.Series(final_rf.feature_importances_, index=X_selected_train.columns)
    .sort_values(ascending=False)
    .head(10)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=importances.index, ax=ax)
ax.set_title("Top 10 Feature Importances in Random Forest Model")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
plt.show()

In [None]:
# Class Distribution Analysis: predicted classes first, then the true classes.
# The labels are passed as `x=` explicitly: in current seaborn releases the
# first positional argument of countplot is `data`, so a positional array is
# not interpreted as the variable to count.
for class_labels, plot_title in (
    (y_pred, 'Distribution of Predicted Classes'),
    (y_test, 'Distribution of True Classes'),
):
    plt.figure(figsize=(10, 5))
    sns.countplot(x=class_labels, palette="viridis")
    plt.title(plot_title)
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.show()

In [None]:
# Headline metrics of the final model.
print("\nBusiness Interpretation of Results:")
print(f"Test Set Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Classification Report:")

# Only the dict-valued entries of the report carry precision/recall/F1;
# scalar entries are skipped.
report_rows = {label: metrics for label, metrics in class_report.items() if isinstance(metrics, dict)}
for label, metrics in report_rows.items():
    print(f"{label}: Precision={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}, F1-score={metrics['f1-score']:.2f}")