In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from pprint import pprint as pp
import csv
from pathlib import Path
import seaborn as sns
from itertools import product
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN

import gensim
from gensim import corpora

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [4]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_predict

In [5]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [6]:
pd.set_option('display.max_columns', None)

### Retrieving the data

#### DF 5

In [7]:
df5 = pd.read_csv('df5.csv')

In [8]:
df5.head(2)

Unnamed: 0.1,Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,0,0,0,4,4,1,4,4,3,2,0,3,0,9,2,1,1,2,0,1,0,1
1,1,1,7,2,4,8,0,2,1,1,0,1,0,0,2,1,0,2,0,0,0,2


#### DF 5c

In [9]:
df5c = pd.read_csv('df5c.csv')

In [10]:
df5c.head(2)

Unnamed: 0.1,Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,0,0,6,4,4,1169,4,4,4,2,0,4,0,67,2,1,2,2,1,1,0,1
1,1,1,48,2,4,5951,0,2,2,1,0,2,0,22,2,1,1,2,1,0,0,2


# Models

In [11]:
# Initialize lists to store evaluation metrics for each fold
dataset_used = []
model_used = []
data_balancing_technique = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
conf_matrices = []

In [12]:
#combined_metrics = pd.DataFrame()

combined_metrics = pd.DataFrame(columns=['dataset', 'model', 'data balancing technique', 'fold', 'precision_1','precision_2','recall_1','recall_2','f1-score_1','f1-score_2','support_1','support_2','TP','FP','TN','FN'])

# Logistic Regression

## DF5

In [13]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'None',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.75
Mean Precision: 0.74
Mean Recall: 0.75
Mean F1-Score: 0.74


In [14]:
combined_metrics[(combined_metrics['data balancing technique'] == 'None') & (combined_metrics['dataset'] == 'DF5')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
0,DF5,Logistic Regression,,1,0.820513,0.681818,0.901408,0.517241,0.85906,0.588235,71.0,29.0,64,14,15,7
1,DF5,Logistic Regression,,2,0.764706,0.666667,0.928571,0.333333,0.83871,0.444444,70.0,30.0,65,20,10,5


### Logistic Regression with df5 data and imbalance data tackling (SMOTE)

In [15]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'SMOTE',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.74
Mean Precision: 0.74
Mean Recall: 0.74
Mean F1-Score: 0.74


In [16]:
combined_metrics[(combined_metrics['data balancing technique'] == 'SMOTE') & (combined_metrics['dataset'] == 'DF5')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
10,DF5,Logistic Regression,SMOTE,1,0.9,0.575,0.760563,0.793103,0.824427,0.666667,71.0,29.0,54,6,23,17
11,DF5,Logistic Regression,SMOTE,2,0.808824,0.53125,0.785714,0.566667,0.797101,0.548387,70.0,30.0,55,13,17,15


### Logistic Regression with df5 data and imbalance data tackling (RandomUnderSampler)

In [17]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Undersampling the majority class
    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'Random Under Sampler',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.73
Mean Precision: 0.75
Mean Recall: 0.73
Mean F1-Score: 0.73


In [18]:
combined_metrics[(combined_metrics['data balancing technique'] == 'Random Under Sampler') & (combined_metrics['dataset'] == 'DF5')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
20,DF5,Logistic Regression,Random Under Sampler,1,0.898305,0.560976,0.746479,0.793103,0.815385,0.657143,71.0,29.0,53,6,23,18
21,DF5,Logistic Regression,Random Under Sampler,2,0.826087,0.580645,0.814286,0.6,0.820144,0.590164,70.0,30.0,57,12,18,13


### Logistic Regression with df5 data and imbalance data tackling (RandomOverSampler)

In [19]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Oversampling the majority class
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'Random Over Sampler',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.73
Mean Precision: 0.75
Mean Recall: 0.73
Mean F1-Score: 0.73


In [20]:
combined_metrics[(combined_metrics['data balancing technique'] == 'Random Over Sampler') & (combined_metrics['dataset'] == 'DF5')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
30,DF5,Logistic Regression,Random Over Sampler,1,0.913793,0.571429,0.746479,0.827586,0.821705,0.676056,71.0,29.0,53,5,24,18
31,DF5,Logistic Regression,Random Over Sampler,2,0.790323,0.447368,0.7,0.566667,0.742424,0.5,70.0,30.0,49,13,17,21


### Logistic Regression with df5 data and imbalance data tackling (SMOTEENN)

In [21]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Apply SMOTEENN for combined over and undersampling
    smote_enn = SMOTEENN(random_state=42)
    X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'SMOTEENN',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.75
Mean Recall: 0.72
Mean F1-Score: 0.72


In [22]:
combined_metrics[(combined_metrics['data balancing technique'] == 'SMOTEENN') & (combined_metrics['dataset'] == 'DF5')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
40,DF5,Logistic Regression,SMOTEENN,1,0.94,0.52,0.661972,0.896552,0.77686,0.658228,71.0,29.0,47,3,26,24
41,DF5,Logistic Regression,SMOTEENN,2,0.789474,0.418605,0.642857,0.6,0.708661,0.493151,70.0,30.0,45,12,18,25


### Logistic Regression with df5 data and imbalance data tackling (SMOTETOMEK)

In [23]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Apply SMOTETomek for combined over and undersampling
    SMOTE_Tomek = SMOTETomek(random_state=42)
    X_train_resampled, y_train_resampled = SMOTE_Tomek.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'SMOTETomek',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.75
Mean Recall: 0.72
Mean F1-Score: 0.72


In [24]:
combined_metrics[(combined_metrics['data balancing technique'] == 'SMOTETomek') & (combined_metrics['dataset'] == 'DF5')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
50,DF5,Logistic Regression,SMOTETomek,1,0.887097,0.578947,0.774648,0.758621,0.827068,0.656716,71.0,29.0,55,7,22,16
51,DF5,Logistic Regression,SMOTETomek,2,0.80597,0.515152,0.771429,0.566667,0.788321,0.539683,70.0,30.0,54,13,17,16


## DF5c - Logistic Regression

In [25]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5c.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5c['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5c',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'None',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.73
Mean Precision: 0.75
Mean Recall: 0.73
Mean F1-Score: 0.73


In [26]:
combined_metrics[(combined_metrics['data balancing technique'] == 'None') & (combined_metrics['dataset'] == 'DF5c')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
60,DF5c,Logistic Regression,,1,0.802469,0.684211,0.915493,0.448276,0.855263,0.541667,71.0,29.0,65,16,13,6
61,DF5c,Logistic Regression,,2,0.767442,0.714286,0.942857,0.333333,0.846154,0.454545,70.0,30.0,66,20,10,4


### Logistic Regression with df5c data and imbalance data tackling (SMOTE)

In [27]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5c.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5c['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5c',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'SMOTE',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.75
Mean Recall: 0.72
Mean F1-Score: 0.73


In [28]:
combined_metrics[(combined_metrics['data balancing technique'] == 'SMOTE') & (combined_metrics['dataset'] == 'DF5c')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
70,DF5c,Logistic Regression,SMOTE,1,0.919355,0.631579,0.802817,0.827586,0.857143,0.716418,71.0,29.0,57,5,24,14
71,DF5c,Logistic Regression,SMOTE,2,0.786667,0.56,0.842857,0.466667,0.813793,0.509091,70.0,30.0,59,16,14,11


### Logistic Regression with df5c data and imbalance data tackling (Random Under Sampling)

In [29]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5c.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5c['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Apply RUS to the training data
    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5c',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'Random Under Sampler',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.75
Mean Recall: 0.72
Mean F1-Score: 0.73


In [30]:
combined_metrics[(combined_metrics['data balancing technique'] == 'Random Under Sampler') & (combined_metrics['dataset'] == 'DF5c')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
80,DF5c,Logistic Regression,Random Under Sampler,1,0.915254,0.585366,0.760563,0.827586,0.830769,0.685714,71.0,29.0,54,5,24,17
81,DF5c,Logistic Regression,Random Under Sampler,2,0.819672,0.487179,0.714286,0.633333,0.763359,0.550725,70.0,30.0,50,11,19,20


### Logistic Regression with df5c data and imbalance data tackling (Random Over Sampling)

In [31]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5c.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5c['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Apply ROS to the training data
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5c',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'Random Over Sampler',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.75
Mean Recall: 0.72
Mean F1-Score: 0.73


In [32]:
combined_metrics[(combined_metrics['data balancing technique'] == 'Random Over Sampler') & (combined_metrics['dataset'] == 'DF5c')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
90,DF5c,Logistic Regression,Random Over Sampler,1,0.918033,0.615385,0.788732,0.827586,0.848485,0.705882,71.0,29.0,56,5,24,15
91,DF5c,Logistic Regression,Random Over Sampler,2,0.822581,0.5,0.728571,0.633333,0.772727,0.558824,70.0,30.0,51,11,19,19


### Logistic Regression with df5c data and imbalance data tackling (SMOTEENN)

In [33]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5c.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5c['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Apply SMOTEENN to the training data
    SMOTEEnn = SMOTEENN(random_state=42)
    X_train_resampled, y_train_resampled = SMOTEEnn.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5c',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'SMOTEENN',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.75
Mean Recall: 0.72
Mean F1-Score: 0.73


In [34]:
combined_metrics[(combined_metrics['data balancing technique'] == 'SMOTEENN') & (combined_metrics['dataset'] == 'DF5c')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
100,DF5c,Logistic Regression,SMOTEENN,1,0.920635,0.648649,0.816901,0.827586,0.865672,0.727273,71.0,29.0,58,5,24,13
101,DF5c,Logistic Regression,SMOTEENN,2,0.818182,0.529412,0.771429,0.6,0.794118,0.5625,70.0,30.0,54,12,18,16


### Logistic Regression with df5c data and imbalance data tackling (SMOTETomek)

In [35]:
# Assuming 'df5' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5c.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5c['target']  # Target column to predict

# Initialize Logistic Regression model
model = LogisticRegression()

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Apply SMOTEENN to the training data
    SMOTETOMEK = SMOTETomek(random_state=42)
    X_train_resampled, y_train_resampled = SMOTETOMEK.fit_resample(X_train, y_train)
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF5c',
    'model' : 'Logistic Regression',
    'data balancing technique' : 'SMOTETomek',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.75
Mean Recall: 0.72
Mean F1-Score: 0.73


In [36]:
combined_metrics[(combined_metrics['data balancing technique'] == 'SMOTETomek') & (combined_metrics['dataset'] == 'DF5c')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
110,DF5c,Logistic Regression,SMOTETomek,1,0.892308,0.628571,0.816901,0.758621,0.852941,0.6875,71.0,29.0,58,7,22,13
111,DF5c,Logistic Regression,SMOTETomek,2,0.805556,0.571429,0.828571,0.533333,0.816901,0.551724,70.0,30.0,58,14,16,12


## Performance metrics for the Logistic regression

In [37]:
combined_metrics

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
0,DF5,Logistic Regression,,1,0.820513,0.681818,0.901408,0.517241,0.859060,0.588235,71.0,29.0,64,14,15,7
1,DF5,Logistic Regression,,2,0.764706,0.666667,0.928571,0.333333,0.838710,0.444444,70.0,30.0,65,20,10,5
2,DF5,Logistic Regression,,3,0.766234,0.608696,0.867647,0.437500,0.813793,0.509091,68.0,32.0,59,18,14,9
3,DF5,Logistic Regression,,4,0.790698,0.642857,0.931507,0.333333,0.855346,0.439024,73.0,27.0,68,18,9,5
4,DF5,Logistic Regression,,5,0.769231,0.500000,0.845070,0.379310,0.805369,0.431373,71.0,29.0,60,18,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,DF5c,Logistic Regression,SMOTETomek,6,0.758065,0.605263,0.758065,0.605263,0.758065,0.605263,62.0,38.0,47,15,23,15
116,DF5c,Logistic Regression,SMOTETomek,7,0.836066,0.333333,0.662338,0.565217,0.739130,0.419355,77.0,23.0,51,10,13,26
117,DF5c,Logistic Regression,SMOTETomek,8,0.838710,0.578947,0.764706,0.687500,0.800000,0.628571,68.0,32.0,52,10,22,16
118,DF5c,Logistic Regression,SMOTETomek,9,0.875000,0.472222,0.746667,0.680000,0.805755,0.557377,75.0,25.0,56,8,17,19


In [38]:
# Assuming df is your DataFrame
combined_metrics.to_csv('Output Data/Logistic Regression.csv')