# **Numerical APTOS2019 Dataset Notebook** 
###### _Dataset obtained from: https://www.kaggle.com/datasets/mariaherrerot/aptos2019/data?select=valid.csv_

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from PIL.ExifTags import TAGS
import os
import numpy as np
import cv2
from skimage import io, color
from skimage.feature import local_binary_pattern
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor

### Data Exploration:

In [None]:
# View the first 6 rows of the extracted-feature training CSV

feature_extraction_csv_file_path = os.path.join('..', '..', 'Data', 'APTOS-2019 Dataset', 'high_res_features_train.csv')

feature_extraction_df = pd.read_csv(feature_extraction_csv_file_path)

feature_extraction_df.head(6)

In [None]:
# Summary statistics describing the distribution of the training data

feature_extraction_df.describe()

In [None]:
# Per-column non-null counts and dtypes, used to spot null values
feature_extraction_df.info()


In [None]:
# Count the missing (null) values in each column
print("\nMissing values in each column:")
display(feature_extraction_df.isnull().sum())

In [None]:
# Show the dtype pandas assigned to each column
print("\nData types of each column:")
display(feature_extraction_df.dtypes)

In [None]:
# Plot one histogram per numerical column (the 'Diabetes Status' target included)
print("\nHistograms for numerical features:")
numerical_columns = ['Exudates', 'Oedema', 'Vessel Count', 'Heamotomas', 'Diabetes Status']
# Five columns on a 2x3 grid, 15 bins each
feature_extraction_df[numerical_columns].hist(bins=15, figsize=(10, 6), layout=(2, 3))
plt.tight_layout()
plt.show()

In [None]:
# Per-feature histograms with the two 'Diabetes Status' classes overlaid
print("\nHistograms for numerical features:")

# Feature columns only; the 'Diabetes Status' target is used for grouping instead
numerical_columns = ['Exudates', 'Oedema', 'Vessel Count', 'Heamotomas']

# 2x2 grid of axes, flattened so each feature maps to one axis by position
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

# Overlay the class-0 and class-1 distributions of every feature on its own axis
for i, col in enumerate(numerical_columns):
    ax = axes[i]
    non_diabetic_values = feature_extraction_df[feature_extraction_df['Diabetes Status'] == 0][col]
    ax.hist(non_diabetic_values, bins=15, alpha=0.7, label='Non-Diabetic (0)', color='blue')
    diabetic_values = feature_extraction_df[feature_extraction_df['Diabetes Status'] == 1][col]
    ax.hist(diabetic_values, bins=15, alpha=0.7, label='Diabetic (1)', color='orange')
    ax.set_title(f'{col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Plot the pairwise correlations between the columns listed in `numerical_columns`
# (the list comes from whichever earlier cell defined it last)
print("\nCorrelation matrix:")
correlation_matrix = feature_extraction_df[numerical_columns].corr()

# Annotated heatmap, values shown to two decimals
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Count how many rows fall into each 'Diabetes Status' class
print("\nClass distribution of 'Diabetes Status':")
print(feature_extraction_df['Diabetes Status'].value_counts())

In [None]:
# Pairwise scatter plots of every feature, coloured by 'Diabetes Status'.
# The target is filtered out by name rather than with `[:-1]`: `numerical_columns`
# is redefined without the target in the class-split histogram cell, so slicing
# off the last entry silently dropped 'Heamotomas' from the plot.
pairplot_columns = [col for col in numerical_columns if col != 'Diabetes Status']
sns.pairplot(feature_extraction_df, hue='Diabetes Status', vars=pairplot_columns)
plt.show()

### Preprocessing:

In [None]:
# Read the pre-extracted feature tables for the training and test splits
dataset_dir = os.path.join('..', '..', 'Data', 'APTOS-2019 Dataset')
train_data = pd.read_csv(os.path.join(dataset_dir, 'high_res_features_train.csv'))
test_data = pd.read_csv(os.path.join(dataset_dir, 'high_res_features_test.csv'))

# 'Image Name' is an identifier and 'Diabetes Status' is the target, so neither is a feature
non_feature_columns = ['Image Name', 'Diabetes Status']

X = train_data.drop(columns=non_feature_columns)
y = train_data['Diabetes Status']

X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data['Diabetes Status']

# Hold out 30% of the training table as a validation split (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Train set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")


Standardise Dataset:

In [None]:
# Step 4: Standardize the numerical features
scaler = StandardScaler()

# Learn the mean / standard deviation from the training split only ...
X_train_scaled = scaler.fit_transform(X_train)

# ... and apply those same statistics to the validation and test splits.
# The test split must NOT be passed to fit_transform: refitting would scale it with
# its own statistics, so the models would see test features on a different scale
# from the one they were trained on.
X_val_scaled = scaler.transform(X_val)
x_test_scaled = scaler.transform(X_test)

# Optional: convert the scaled arrays back into DataFrames (for readability)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X.columns)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=X.columns)

# Display the first few rows of the scaled training data
print("\nScaled training data:")
display(X_train_scaled.head())

Removing Oedema from the standardised dataset:

In [None]:
# Remove the 'Oedema' feature from the scaled training, validation and test sets.
# NOTE: not idempotent - re-running this cell raises KeyError because the column is already gone.
X_train_scaled = X_train_scaled.drop(columns=['Oedema'])
X_val_scaled = X_val_scaled.drop(columns=['Oedema'])
x_test_scaled = x_test_scaled.drop(columns=['Oedema'])

# Display the first few rows of the updated training data
print("\nTraining data after removing 'Oedema':")
display(X_train_scaled.head())

# Display the first few rows of the updated validation data
print("\nValidation data after removing 'Oedema':")
display(X_val_scaled.head())


Removing Oedema from the Original Dataset:

In [None]:
# Remove the 'Oedema' feature from the unscaled training, validation and test sets.
# NOTE: not idempotent - re-running this cell raises KeyError because the column is already gone.
X_train = X_train.drop(columns=['Oedema'])
X_val = X_val.drop(columns=['Oedema'])
X_test = X_test.drop(columns=['Oedema'])

### Machine Learning Models:

##### Reusable Functions for Plotting Model Statistics:

Function for Plotting Classifier Statistics:

In [22]:
# For tree-based classifiers, plots the confusion matrix, histogram of feature importance and ROC + Precision-Recall Curves

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
import numpy as np
import seaborn as sns

def plot_model_evaluation_statistics(model, X_val, y_val):
    """Plot evaluation diagnostics for a fitted binary classifier.

    Shows, in order: the confusion matrix, the ROC curve (with its AUC), the
    precision-recall curve and - when the model exposes
    ``feature_importances_`` - a bar chart of importances sorted in
    descending order.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.
    X_val : pandas.DataFrame
        Features to evaluate on; ``X_val.columns`` labels the importance bars.
    y_val : array-like
        True binary labels for the rows of ``X_val``.
    """
    # Column 1 of predict_proba is used as the positive-class score
    y_pred_prob = model.predict_proba(X_val)[:, 1]
    y_pred = model.predict(X_val)

    # Confusion Matrix
    plt.figure(figsize=(6, 4))
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion Matrix')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_val, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    # Diagonal reference line = chance-level classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_val, y_pred_prob)

    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.show()

    # Feature Importance - skipped for models that do not expose the attribute,
    # so the plots above are still produced instead of ending in an AttributeError
    if not hasattr(model, 'feature_importances_'):
        return

    plt.figure(figsize=(6, 4))
    feature_importances = model.feature_importances_
    indices = np.argsort(feature_importances)[::-1]  # indices of importances, largest first
    feature_names_sorted = X_val.columns[indices]  # feature names in the same order

    plt.title('Feature Importances')
    plt.bar(range(X_val.shape[1]), feature_importances[indices], color="r", align="center")
    plt.xticks(range(X_val.shape[1]), feature_names_sorted, rotation=90)
    plt.xlim([-1, X_val.shape[1]])
    plt.show()

In [23]:
# For Linear-based Classifiers, plots the confusion matrix, histogram of feature importance and ROC + Precision-Recall Curves

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
import numpy as np
import seaborn as sns

def plot_linear_model_evaluation_statistics(model, X_val, y_val):
    """Plot evaluation diagnostics for a fitted probabilistic binary classifier.

    Draws the confusion matrix, the ROC curve (with AUC) and the
    precision-recall curve. If the model has a ``coef_`` attribute, the first
    row of coefficients is also drawn as a bar chart ordered by absolute value.

    Parameters
    ----------
    model : fitted classifier providing ``predict`` and ``predict_proba``.
    X_val : pandas.DataFrame
        Evaluation features; ``X_val.columns`` labels the coefficient bars.
    y_val : array-like
        True binary labels for the rows of ``X_val``.
    """
    # Positive-class scores (second predict_proba column) and hard labels
    positive_scores = model.predict_proba(X_val)[:, 1]
    predicted_labels = model.predict(X_val)

    # --- Confusion matrix -------------------------------------------------
    plt.figure(figsize=(6, 4))
    sns.heatmap(confusion_matrix(y_val, predicted_labels), annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion Matrix')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # --- ROC curve --------------------------------------------------------
    false_pos_rate, true_pos_rate, _ = roc_curve(y_val, positive_scores)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    plt.figure(figsize=(6, 4))
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
             label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # --- Precision-recall curve -------------------------------------------
    precision_values, recall_values, _ = precision_recall_curve(y_val, positive_scores)

    plt.figure(figsize=(6, 4))
    plt.plot(recall_values, precision_values, color='blue', lw=2, label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.show()

    # --- Coefficients (only for models that expose them) --------------------
    if not hasattr(model, 'coef_'):
        return

    plt.figure(figsize=(6, 4))
    weights = model.coef_[0]
    order = np.argsort(np.abs(weights))[::-1]  # largest magnitude first
    bar_positions = range(len(order))

    plt.title('Feature Coefficients')
    plt.bar(bar_positions, weights[order], color="r", align="center")
    plt.xticks(bar_positions, X_val.columns[order], rotation=90)
    plt.xlabel('Features')
    plt.ylabel('Coefficient Value')
    plt.tight_layout()
    plt.show()

Function for Plotting Regressor Statistics:

In [24]:
# Creates the same plots as done for the classifier models, however, also plots the histogram of predictions and uses the cut-off threshold to make predictions binary

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, accuracy_score
import numpy as np
import seaborn as sns

def plot_regression_evaluation_statistics(model, X_val, y_val, cutoff_threshold=0.5):
    """Plot classification-style diagnostics for a regressor used with a cutoff.

    The continuous predictions are turned into 0/1 labels with
    ``prediction > cutoff_threshold``. The function prints the resulting
    accuracy and shows the confusion matrix (binary labels), the ROC and
    precision-recall curves (continuous predictions used as scores) and -
    when the model exposes ``feature_importances_`` - a bar chart of
    importances sorted in descending order.

    Parameters
    ----------
    model : fitted regressor providing ``predict``.
    X_val : pandas.DataFrame
        Evaluation features; ``X_val.columns`` labels the importance bars.
    y_val : array-like
        True binary labels for the rows of ``X_val``.
    cutoff_threshold : float, default 0.5
        Predictions strictly above this value are labelled 1, the rest 0.
    """
    # Continuous predictions
    y_pred_continuous = model.predict(X_val)

    # Convert continuous predictions to binary using the cutoff threshold
    y_pred_binary = np.where(y_pred_continuous > cutoff_threshold, 1, 0)

    # Calculate accuracy for the binary predictions
    accuracy = accuracy_score(y_val, y_pred_binary)
    print(f"Validation Accuracy (with cutoff {cutoff_threshold}): {accuracy:.4f}")

    # Confusion Matrix
    plt.figure(figsize=(6, 4))
    cm = confusion_matrix(y_val, y_pred_binary)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion Matrix')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # ROC Curve (threshold-free: uses the continuous predictions as scores)
    fpr, tpr, _ = roc_curve(y_val, y_pred_continuous)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_val, y_pred_continuous)

    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.show()

    # Feature Importance - only if the model supports it; without this guard a
    # model lacking the attribute raised AttributeError after the plots above
    if not hasattr(model, 'feature_importances_'):
        return

    plt.figure(figsize=(6, 4))
    feature_importances = model.feature_importances_
    indices = np.argsort(feature_importances)[::-1]  # largest importance first
    feature_names_sorted = X_val.columns[indices]

    plt.title('Feature Importances')
    plt.bar(range(X_val.shape[1]), feature_importances[indices], color="r", align="center")
    plt.xticks(range(X_val.shape[1]), feature_names_sorted, rotation=90)
    plt.xlim([-1, X_val.shape[1]])
    plt.show()

Function for Plotting Neural Network Statistics: 

In [25]:
# Creates the confusion matrix, ROC + Area Under Curve graph, precision-recall curve as well as the loss and accuracy graphs as epochs increase.

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report
import numpy as np

def analyze_neural_network_performance(model, X_train, y_train, X_val, y_val, history=None):
    """Plot validation diagnostics for a binary neural-network classifier.

    Shows the confusion matrix, ROC curve (with AUC) and precision-recall
    curve, prints a classification report and, if ``history`` is given, plots
    the training/validation accuracy and loss per epoch side by side.

    Parameters
    ----------
    model : fitted model whose ``predict(X_val)`` output is squeezed and
        thresholded at 0.5 to obtain class labels.
    X_train, y_train : accepted for interface compatibility; not read here.
    X_val, y_val : validation features and true binary labels.
    history : optional object whose ``history`` mapping holds the keys
        'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    # Scores for the validation set, thresholded at 0.5 for hard labels
    scores = model.predict(X_val).squeeze()
    predicted_labels = (scores > 0.5).astype(int)

    # --- Confusion matrix -------------------------------------------------
    plt.figure(figsize=(6, 4))
    sns.heatmap(confusion_matrix(y_val, predicted_labels), annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion Matrix')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # --- ROC curve and AUC --------------------------------------------------
    false_pos_rate, true_pos_rate, _ = roc_curve(y_val, scores)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    plt.figure(figsize=(6, 4))
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
             label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    # --- Precision-recall curve -------------------------------------------
    precision_values, recall_values, _ = precision_recall_curve(y_val, scores)
    plt.figure(figsize=(6, 4))
    plt.plot(recall_values, precision_values, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

    # --- Classification report ----------------------------------------------
    print(classification_report(y_val, predicted_labels, target_names=['Class 0', 'Class 1']))

    # --- Training curves (need the fit history) -----------------------------
    if history is None:
        return

    # (subplot position, training key, validation key, title, y-axis label)
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss', 'Model Loss', 'Loss'),
    )

    plt.figure(figsize=(6, 4))
    for position, train_key, val_key, panel_title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        plt.title(panel_title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Val'], loc='upper left')
    plt.tight_layout()
    plt.show()

#### 1. Logistic Regression:

Training the Model:

In [None]:
# Baseline logistic regression: fit on the standardised training features,
# then measure accuracy on the validation split
LR_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Hard class predictions for the validation rows
y_pred = LR_model.predict(X_val_scaled)

accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Hyperparameter Tuning using Optuna:

In [None]:
import optuna
import matplotlib.pyplot as plt
from optuna.visualization.matplotlib import plot_optimization_history
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Objective function to optimize with Optuna
def objective(trial):
    """Return the mean 5-fold cross-validated accuracy for one Optuna trial.

    Samples a solver, the inverse regularisation strength ``C`` and
    ``max_iter``, builds a LogisticRegression with them and scores it on the
    scaled training data. The score is also appended to the module-level
    ``validation_accuracies`` list.
    """
    # Suggest values for hyperparameters
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga', 'lbfgs'])
    C = trial.suggest_float('C', 1e-6, 1e3, log=True)  # Regularization strength
    max_iter = trial.suggest_int('max_iter', 100, 2000)

    # Create the Logistic Regression model with suggested hyperparameters
    model = LogisticRegression(solver=solver, C=C, max_iter=max_iter, random_state=42)

    # Perform cross-validation to get an evaluation score
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy').mean()

    # Record this trial's cross-validated accuracy
    validation_accuracies.append(score)

    return score

# Per-trial cross-validated accuracies, filled in by `objective`
validation_accuracies = []

# Create the Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=600)

# Output the best hyperparameters found by Optuna
print("Best hyperparameters:")
print(study.best_params)

# Train the Logistic Regression model using the best hyperparameters
best_params = study.best_params
LR_model_HP = LogisticRegression(**best_params, random_state=42)
LR_model_HP.fit(X_train_scaled, y_train)

# Make predictions on the validation set
y_val_pred = LR_model_HP.predict(X_val_scaled)

# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy with tuned hyperparameters: {accuracy:.4f}")

# Plot the optimization history (from Optuna)
opt_history_fig = plot_optimization_history(study)
plt.show()

# Optional custom plot of the accuracy over trials, left disabled. It is written as
# `#` comments rather than a bare triple-quoted string, because a string literal as
# the last statement of a cell is evaluated and echoed as the cell's output.
# plt.figure(figsize=(6, 4))
# plt.plot(range(1, len(validation_accuracies) + 1), validation_accuracies, marker='o', linestyle='-', color='b')
# plt.title('Validation Accuracy Over Trials')
# plt.xlabel('Trial')
# plt.ylabel('Validation Accuracy')
# plt.grid(True)
# plt.show()

Validation Statistics:

In [None]:
# Diagnostics for the baseline LR_model on the validation split
# (pass LR_model_HP instead to inspect the Optuna-tuned model)
plot_linear_model_evaluation_statistics(LR_model, X_val_scaled, y_val)

Testing Statistics:

In [None]:
# Make predictions on the testing set with the baseline logistic regression
y_test_pred = LR_model.predict(x_test_scaled)

# Evaluate the model on the test labels
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Testing Accuracy: {accuracy:.4f}")

In [None]:
# Same diagnostics as for validation, computed on the scaled test set
plot_linear_model_evaluation_statistics(LR_model, x_test_scaled, y_test)

#### 2. Random Forest:

##### 2.1 Random Forest Classifier:

Training the Model:

In [None]:
# Baseline random forest classifier (1000 trees, fixed seed), fitted on the
# standardised training features and scored on the validation split
random_forest_model = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train_scaled, y_train)

# Hard class predictions for the validation rows
y_pred = random_forest_model.predict(X_val_scaled)

accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Statistics:

In [None]:
# Confusion matrix, ROC, precision-recall and feature importances on the validation split
plot_model_evaluation_statistics(random_forest_model, X_val_scaled, y_val)

Testing Statistics:

In [None]:
# Make predictions on the testing set with the random forest classifier
y_test_pred = random_forest_model.predict(x_test_scaled)

# Evaluate the model on the test labels
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Testing Accuracy: {accuracy:.4f}")

In [None]:
# Same diagnostics as for validation, computed on the scaled test set
plot_model_evaluation_statistics(random_forest_model, x_test_scaled, y_test)

##### 2.2 Random Forest Regressor with Cutoff:

In [None]:
# Initialise the random forest regressor (1000 trees, fixed seed)
randForest_regressor = RandomForestRegressor(n_estimators=1000, random_state=42)

# Train the model on the standardised training features
randForest_regressor.fit(X_train_scaled, y_train)

# Predict continuous scores on the validation data
y_pred = randForest_regressor.predict(X_val_scaled)

# Calculate RMSE (Root Mean Squared Error) as sqrt(MSE). The `squared=False`
# keyword of mean_squared_error is deprecated in recent scikit-learn releases and
# later removed, so the square root is taken explicitly to work on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

# Calculate R^2 score (the regressor's default `score`)
r2_score = randForest_regressor.score(X_val_scaled, y_val)
print(f"Validation R^2 Score: {r2_score:.4f}")

Hyperparameter Tuning using Optuna:

In [None]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Objective function for Optuna to minimize
def objective(trial):
    """Return the mean 5-fold cross-validated RMSE for one Optuna trial.

    The value is a positive RMSE (lower is better), matching the
    ``direction='minimize'`` study created below.
    """
    # Suggest values for hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    # Create the RandomForestRegressor model with suggested hyperparameters
    candidate_regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Perform cross-validation (you can adjust `cv` for more or fewer folds).
    # scikit-learn reports this scorer as NEGATIVE RMSE (higher is better).
    neg_rmse = cross_val_score(
        candidate_regressor, X_train_scaled, y_train, cv=5, scoring='neg_root_mean_squared_error'
    ).mean()

    # Flip the sign so the study minimizes the true RMSE. Returning the negative
    # value to a 'minimize' study made Optuna prefer the trials with the LARGEST error.
    return -neg_rmse

# Create the Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Output the best hyperparameters found by Optuna
print("Best hyperparameters:")
print(study.best_params)

# Train the RandomForestRegressor model using the best hyperparameters
best_params = study.best_params
best_model = RandomForestRegressor(**best_params, random_state=42)
best_model.fit(X_train_scaled, y_train)

# Predict on the validation set
y_val_pred = best_model.predict(X_val_scaled)

# Calculate RMSE on the validation set as sqrt(MSE); the `squared=False` keyword
# is deprecated in recent scikit-learn releases and later removed
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE with tuned hyperparameters: {rmse:.4f}")

# Calculate R² score
r2_score = best_model.score(X_val_scaled, y_val)
print(f"Validation R² Score with tuned hyperparameters: {r2_score:.4f}")


Plot the Predictions to Understand the Binary Clustering and Cutoff Point:

In [None]:
# Plot histogram of predictions.
# `y_pred` is whatever an earlier cell assigned last; run top to bottom, that is the
# baseline regressor's validation predictions (the tuning cell writes `y_val_pred`).
plt.figure(figsize=(8, 6))
plt.hist(y_pred, bins=20, color='blue', alpha=0.7)
plt.xlabel('Predicted Values')
plt.ylabel('Frequency')
plt.title('Histogram of Predictions')
plt.grid(True)
plt.show()

In [None]:
# Use the median of the predictions in `y_pred` as the 0/1 cutoff
cutoff_threshold_val = np.median(y_pred)
print(cutoff_threshold_val)

In [48]:
# Label predictions strictly above the cutoff as 1, the rest as 0
binary_predictions = np.where(y_pred > cutoff_threshold_val, 1, 0)

Validation Statistics:

In [None]:
# Validation diagnostics for the baseline regressor, binarised at the median cutoff
plot_regression_evaluation_statistics(randForest_regressor, X_val_scaled, y_val, cutoff_threshold=cutoff_threshold_val)

Testing Statistics:

In [None]:
# Make predictions on the testing set
# (the plotting helper below calls `predict` itself and does not read `y_test_pred`)
y_test_pred = randForest_regressor.predict(x_test_scaled)

# Test diagnostics reuse the cutoff derived from the validation predictions
plot_regression_evaluation_statistics(randForest_regressor, x_test_scaled, y_test, cutoff_threshold=cutoff_threshold_val)

#### 3. Naive Bayes:

##### 3.1 Multinomial Naive Bayes:

Training the Model:

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit Multinomial Naive Bayes on the unscaled features (X_train, not X_train_scaled)
# NOTE(review): presumably because standardised features contain negative values,
# which MultinomialNB does not accept - confirm
mnb = MultinomialNB().fit(X_train, y_train)

# Hard class predictions for the (unscaled) validation rows
y_val_pred = mnb.predict(X_val)

accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Statistics:

In [None]:
# Validation diagnostics; the coefficient chart is drawn only if the model has `coef_`
plot_linear_model_evaluation_statistics(mnb, X_val, y_val)

Testing Statistics:

In [None]:
# Predict on the test set (unscaled features, matching how the model was fitted)
y_test_pred = mnb.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Same diagnostics as for validation, computed on the unscaled test set
plot_linear_model_evaluation_statistics(mnb, X_test, y_test)

##### 3.2 Gaussian Naive Bayes:

Training the Model:

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train the Gaussian Naive Bayes model on the standardised training set
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train_scaled, y_train)

# Make predictions on the validation set
y_pred = naive_bayes_model.predict(X_val_scaled)

# Find the validation accuracy of the model
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Statistics:

In [None]:
# Validation diagnostics; the coefficient chart is drawn only if the model has `coef_`
plot_linear_model_evaluation_statistics(naive_bayes_model, X_val_scaled, y_val)

Testing Statistics:

In [None]:
# Make predictions on the held-out test set
y_pred = naive_bayes_model.predict(x_test_scaled)

# Find the test accuracy of the model; the label previously said "Validation"
# although the score is computed against y_test
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Same diagnostics as for validation, computed on the scaled test set
plot_linear_model_evaluation_statistics(naive_bayes_model, x_test_scaled, y_test)

#### 4. XGBoost:

##### 4.1 XGBoost Classifier:

Training the Model:

In [None]:
# Baseline XGBoost classifier with library-default settings, fitted on the
# standardised training features and scored on the validation split
xgb_model = xgb.XGBClassifier().fit(X_train_scaled, y_train)

# Hard class predictions for the validation rows
y_pred = xgb_model.predict(X_val_scaled)

accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Hyper-parameter Tuning using Optuna:

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Objective function for Optuna to maximize (mean cross-validated accuracy)
def objective(trial):
    """Return the mean 5-fold cross-validated accuracy of an XGBClassifier for one trial."""
    # Suggest values for hyperparameters
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',  # Assuming binary classification
        'eval_metric': 'logloss',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }

    # Create the XGBoost model with suggested hyperparameters
    # NOTE(review): `use_label_encoder` looks deprecated in recent XGBoost releases - confirm against the installed version
    xgb_model = xgb.XGBClassifier(**param, use_label_encoder=False)

    # Perform cross-validation
    score = cross_val_score(xgb_model, X_train_scaled, y_train, cv=5, scoring='accuracy').mean()

    return score

# Create the Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Output the best hyperparameters found by Optuna
print("Best hyperparameters:")
print(study.best_params)

# Train the XGBoost model using the best hyperparameters
# (only the sampled parameters are passed; the fixed 'verbosity', 'objective' and
# 'eval_metric' entries of `param` are not part of study.best_params)
best_params = study.best_params
xgb_best_model = xgb.XGBClassifier(**best_params, use_label_encoder=False)
xgb_best_model.fit(X_train_scaled, y_train)

# Predict on the validation set
y_val_pred = xgb_best_model.predict(X_val_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy with tuned hyperparameters: {accuracy:.4f}")

Validation Statistics: 

In [None]:
# Validation diagnostics for the Optuna-tuned XGBoost classifier
plot_model_evaluation_statistics(xgb_best_model, X_val_scaled, y_val)

Testing Statistics:

In [None]:
# Predict on the held-out test set with the tuned classifier
y_test_pred = xgb_best_model.predict(x_test_scaled)

# Calculate test accuracy; the label previously said "Validation" although the
# score is computed against y_test
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy with tuned hyperparameters: {accuracy:.4f}")

In [None]:
# Same diagnostics as for validation, computed on the scaled test set
plot_model_evaluation_statistics(xgb_best_model, x_test_scaled, y_test)

##### 4.2 XGBoost Regressor:

Training the model:

In [None]:
# Initialize XGBoost regressor with library-default settings
xgb_regressor = xgb.XGBRegressor()

# Train the model on the standardised training features
xgb_regressor.fit(X_train_scaled, y_train)

# Predict continuous scores on the validation data
y_pred = xgb_regressor.predict(X_val_scaled)

# Calculate RMSE (Root Mean Squared Error) as sqrt(MSE). The `squared=False`
# keyword of mean_squared_error is deprecated in recent scikit-learn releases and
# later removed, so the square root is taken explicitly to work on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

# Calculate R^2 score (the regressor's default `score`)
r2_score = xgb_regressor.score(X_val_scaled, y_val)
print(f"Validation R^2 Score: {r2_score:.4f}")

In [None]:
# Plot histogram of the predictions currently held in `y_pred`
plt.figure(figsize=(8, 6))
plt.hist(y_pred, bins=20, color='blue', alpha=0.7)
plt.xlabel('Predicted Values')
plt.ylabel('Frequency')
plt.title('Histogram of Predictions')
plt.grid(True)
plt.show()

In [None]:
# Use the median of the predictions in `y_pred` as the 0/1 cutoff
# (overwrites the cutoff computed in the random forest section)
cutoff_threshold_val = np.median(y_pred)
print(cutoff_threshold_val)

In [14]:
# Label predictions strictly above the cutoff as 1, the rest as 0
binary_predictions = np.where(y_pred > cutoff_threshold_val, 1, 0)

Validation Statistics:

In [None]:
# Validation diagnostics for the XGBoost regressor, binarised at the median cutoff
plot_regression_evaluation_statistics(xgb_regressor, X_val_scaled, y_val, cutoff_threshold=cutoff_threshold_val)

Testing Statistics:

In [None]:
# Make predictions on the testing set
# (the plotting helper below calls `predict` itself and does not read `y_test_pred`)
y_test_pred = xgb_regressor.predict(x_test_scaled)

# Test diagnostics reuse the cutoff derived from the validation predictions
plot_regression_evaluation_statistics(xgb_regressor, x_test_scaled, y_test, cutoff_threshold=cutoff_threshold_val)

#### 5. Gradient Boosting:

##### 5.1 Gradient Boosting Classifier:

##### 5.2 Gradient Boosting Regressor:

#### 6. SVM:

##### 6.1 SVM with Linear Kernel:

##### 6.2 SVM with RBF Kernel:

#### 7. Neural Network:

#### 8. Combination Model:

#### 2. SVM Model with RBF kernel:

In [None]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Create the SVM model with an RBF kernel.
# This section is the RBF-kernel experiment, but the cell was a copy of the
# linear-kernel cell that follows it (kernel='linear'), so both sections
# trained the same model.
svm_model = SVC(kernel='rbf', random_state=42)

# Step 2: Train the model on the scaled training data
svm_model.fit(X_train_scaled, y_train)

# Step 3: Make predictions on the validation set
y_val_pred = svm_model.predict(X_val_scaled)

# Step 4: Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Step 5: Print a classification report
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

# Step 6: Plot a confusion matrix
conf_matrix = confusion_matrix(y_val, y_val_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("Confusion Matrix")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()

#### 1.2. SVM with Linear Kernel:

In [None]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Step 1: Create the SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Step 2: Train the model on the scaled training data
svm_model.fit(X_train_scaled, y_train)

# Step 3: Make predictions on the validation set
y_val_pred = svm_model.predict(X_val_scaled)

# Step 4: Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Step 5: Print a classification report
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

# Step 6: Plot a confusion matrix
conf_matrix = confusion_matrix(y_val, y_val_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("Confusion Matrix")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()

# Step 7: Extract and plot feature importance for the linear SVM
# Importance is taken as the absolute value of each coefficient
feature_importance = np.abs(svm_model.coef_).flatten()

# Pair each coefficient with its feature name. The names come from the frame the
# model was actually fitted on (X_train_scaled), not from X_train, so the labels
# cannot drift out of step if the two frames ever hold different columns.
feature_importance_df = pd.DataFrame({
    'Feature': X_train_scaled.columns,
    'Importance': feature_importance
})

# Sort the features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Step 8: Plot the feature importance
plt.figure(figsize=(6, 4))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
plt.xlabel('Feature Importance')
plt.title('Feature Importance in SVM (Linear Kernel)')
plt.gca().invert_yaxis()  # Most important feature at the top
plt.show()

