In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from pprint import pprint as pp
import csv
from pathlib import Path
import seaborn as sns
from itertools import product
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN
import fasttext

import gensim
from gensim import corpora

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [4]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_predict

In [5]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [6]:
pd.set_option('display.max_columns', None)

### Retrieving the data

#### DF 4

In [7]:
df4 = pd.read_csv('df4.csv')

In [8]:
df4.head(2)

Unnamed: 0.1,Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,0,A11,A20,A34,A43,A51,A65,A75,A84,A93,A101,A114,A121,A139,A143,A152,A162,A173,A181,A192,A201,1
1,1,A12,A27,A32,A43,A58,A61,A73,A82,A92,A101,A112,A121,A130,A143,A152,A161,A173,A181,A191,A201,2


# Models

In [9]:
# Initialize lists to store evaluation metrics for each fold
dataset_used = []
model_used = []
data_balancing_technique = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
conf_matrices = []

In [10]:
#combined_metrics = pd.DataFrame()

combined_metrics = pd.DataFrame(columns=['dataset', 'model', 'data balancing technique', 'fold', 'precision_1','precision_2','recall_1','recall_2','f1-score_1','f1-score_2','support_1','support_2','TP','FP','TN','FN'])

# Fast Text 
### train_supervised

## DF4

In [11]:
# Assuming 'df2' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df4.drop(columns=['target'])  # Drop the target column to get the feature columns
target = '__label__' + df4['target'].astype(str)

# Define hyperparameters
dim = 300  # Dimension of word vectors
min_count = 1  # Minimum frequency of words
loss = 'ns'  # Hierarchical softmax loss #'hs', 'ns', 'softmax'
epoch = 100  # Number of training epochs
bucket = 2000000  # Number of buckets used for hashing n-grams
word_ngrams = 1  # Maximum length of word n-grams
lr = 0.5 #0.1, 0.01, 0.001 #learning rate

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Create a new column "content" by concatenating values from selected columns
    X_train['content'] = y_train + ' ' + X_train.apply(lambda row: ' '.join(str(val) for val in row), axis=1)
    X_test['content'] = X_test.apply(lambda row: ' '.join(str(val) for val in row), axis=1)
    
    X_train = X_train['content']
    X_test = X_test['content']
    
    X_train_list = [value for value in X_train]
    
    # Save training data to a file
    with open("train_german_fast_text.txt", "w") as f:
        for line in X_train_list:
            f.write(line + "\n")

    # Training data file path
    train_data_path = "train_german_fast_text.txt"
    
    # Train the supervised model
    model = fasttext.train_supervised(input=train_data_path, dim=dim, minCount=min_count,bucket=bucket, loss=loss, epoch=epoch, wordNgrams=word_ngrams)#lr=lr, 
                                       #lr=lr,bucket=bucket,
    
    # Predict on the testing set
    # Initialize an empty list to store the predicted labels
    y_pred_list = []

    # Iterate over each data point in X_test and predict the label
    for text in X_test:
        # Predict the label for the current text
        predicted_label, _ = model.predict(text)
        # Append the predicted label to the list
        y_pred_list.append(predicted_label[0])

    # Convert the list of predicted labels to a pandas Series
    y_pred = pd.Series(y_pred_list)
    
    # Extract the numeric label using a lambda function
    y_test = y_test.apply(lambda x: int(x.split("__label__")[1]))  ## uncomment for next run
    y_pred = y_pred.apply(lambda x: int(x.split("__label__")[1]))
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred.round())
    precision = precision_score(y_test, y_pred.round(), average='weighted')
    recall = recall_score(y_test, y_pred.round(), average='weighted')
    f1 = f1_score(y_test, y_pred.round(), average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred.round())
    
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred.round(), output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred.round()))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF4',
    'model' : 'Fast Text - train_supervised',
    'data balancing technique' : 'None',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.75
Mean Precision: 0.75
Mean Recall: 0.75
Mean F1-Score: 0.74


In [12]:
combined_metrics[(combined_metrics['data balancing technique'] == 'None') & (combined_metrics['dataset'] == 'DF4')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
0,DF4,Fast Text - train_supervised,,1,0.855072,0.612903,0.830986,0.655172,0.842857,0.633333,71.0,29.0,59,10,19,12
1,DF4,Fast Text - train_supervised,,2,0.818182,0.695652,0.9,0.533333,0.857143,0.603774,70.0,30.0,63,14,16,7


### FastText train_supervised with df4 data and imbalance data tackling (RandomUnderSampler)

In [13]:
# Assuming 'df4' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df4.drop(columns=['target'])  # Drop the target column to get the feature columns
target = '__label__' + df4['target'].astype(str)

# Define hyperparameters
dim = 300  # Dimension of word vectors
min_count = 1  # Minimum frequency of words
loss = 'ns'  # Hierarchical softmax loss #'hs', 'ns', 'softmax'
epoch = 100  # Number of training epochs
bucket = 2000000  # Number of buckets used for hashing n-grams
word_ngrams = 1  # Maximum length of word n-grams
lr = 0.5 #0.1, 0.01, 0.001 #learning rate

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Oversampling the minority class using SMOTE
    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    
    # Create a new column "content" by concatenating values from selected columns
    X_train_resampled['content'] = y_train_resampled + ' ' + X_train_resampled.apply(lambda row: ' '.join(str(val) for val in row), axis=1)
    X_test['content'] = X_test.apply(lambda row: ' '.join(str(val) for val in row), axis=1)
    
    X_train = X_train_resampled['content']
    X_test = X_test['content']
    
    X_train_list = [value for value in X_train]
    
    # Save training data to a file
    with open("train_german_fast_text.txt", "w") as f:
        for line in X_train_list:
            f.write(line + "\n")

    # Training data file path
    train_data_path = "train_german_fast_text.txt"
    
    # Train the supervised model
    model = fasttext.train_supervised(input=train_data_path, dim=dim, minCount=min_count,bucket=bucket, loss=loss, epoch=epoch, wordNgrams=word_ngrams)#lr=lr, 
                                       #lr=lr,bucket=bucket,
    
    # Predict on the testing set
    # Initialize an empty list to store the predicted labels
    y_pred_list = []

    # Iterate over each data point in X_test and predict the label
    for text in X_test:
        # Predict the label for the current text
        predicted_label, _ = model.predict(text)
        # Append the predicted label to the list
        y_pred_list.append(predicted_label[0])

    # Convert the list of predicted labels to a pandas Series
    y_pred = pd.Series(y_pred_list)
    
    # Extract the numeric label using a lambda function
    y_test = y_test.apply(lambda x: int(x.split("__label__")[1]))  ## uncomment for next run
    y_pred = y_pred.apply(lambda x: int(x.split("__label__")[1]))
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred.round())
    precision = precision_score(y_test, y_pred.round(), average='weighted')
    recall = recall_score(y_test, y_pred.round(), average='weighted')
    f1 = f1_score(y_test, y_pred.round(), average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred.round())
    
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred.round(), output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred.round()))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF4',
    'model' : 'Fast Text - train_supervised',
    'data balancing technique' : 'Random Under Sampler',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.74
Mean Recall: 0.72
Mean F1-Score: 0.72


In [14]:
combined_metrics[(combined_metrics['data balancing technique'] == 'Random Under Sampler') & (combined_metrics['dataset'] == 'DF4')]#.head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
10,DF4,Fast Text - train_supervised,Random Under Sampler,1,0.923077,0.520833,0.676056,0.862069,0.780488,0.649351,71.0,29.0,48,4,25,23
11,DF4,Fast Text - train_supervised,Random Under Sampler,2,0.820896,0.545455,0.785714,0.6,0.80292,0.571429,70.0,30.0,55,12,18,15
12,DF4,Fast Text - train_supervised,Random Under Sampler,3,0.830508,0.536585,0.720588,0.6875,0.771654,0.60274,68.0,32.0,49,10,22,19
13,DF4,Fast Text - train_supervised,Random Under Sampler,4,0.878788,0.558824,0.794521,0.703704,0.834532,0.622951,73.0,27.0,58,8,19,15
14,DF4,Fast Text - train_supervised,Random Under Sampler,5,0.830189,0.425532,0.619718,0.689655,0.709677,0.526316,71.0,29.0,44,9,20,27
15,DF4,Fast Text - train_supervised,Random Under Sampler,6,0.705882,0.469388,0.580645,0.605263,0.637168,0.528736,62.0,38.0,36,15,23,26
16,DF4,Fast Text - train_supervised,Random Under Sampler,7,0.84127,0.351351,0.688312,0.565217,0.757143,0.433333,77.0,23.0,53,10,13,24
17,DF4,Fast Text - train_supervised,Random Under Sampler,8,0.894737,0.604651,0.75,0.8125,0.816,0.693333,68.0,32.0,51,6,26,17
18,DF4,Fast Text - train_supervised,Random Under Sampler,9,0.892857,0.431818,0.666667,0.76,0.763359,0.550725,75.0,25.0,50,6,19,25
19,DF4,Fast Text - train_supervised,Random Under Sampler,10,0.829787,0.509434,0.6,0.771429,0.696429,0.613636,65.0,35.0,39,8,27,26


### FastText train_supervised with df2 data and imbalance data tackling (RandomOverSampler)

In [15]:
# Assuming 'df4' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df4.drop(columns=['target'])  # Drop the target column to get the feature columns
target = '__label__' + df4['target'].astype(str)

# Define hyperparameters
dim = 300  # Dimension of word vectors
min_count = 1  # Minimum frequency of words
loss = 'ns'  # Hierarchical softmax loss #'hs', 'ns', 'softmax'
epoch = 100  # Number of training epochs
bucket = 2000000  # Number of buckets used for hashing n-grams
word_ngrams = 1  # Maximum length of word n-grams
lr = 0.5 #0.1, 0.01, 0.001 #learning rate

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    # Oversampling the minority class using SMOTE
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    
    # Create a new column "content" by concatenating values from selected columns
    X_train_resampled['content'] = y_train_resampled + ' ' + X_train_resampled.apply(lambda row: ' '.join(str(val) for val in row), axis=1)
    X_test['content'] = X_test.apply(lambda row: ' '.join(str(val) for val in row), axis=1)
    
    X_train = X_train_resampled['content']
    X_test = X_test['content']
    
    X_train_list = [value for value in X_train]
    
    # Save training data to a file
    with open("train_german_fast_text.txt", "w") as f:
        for line in X_train_list:
            f.write(line + "\n")

    # Training data file path
    train_data_path = "train_german_fast_text.txt"
    
    # Train the supervised model
    model = fasttext.train_supervised(input=train_data_path, dim=dim, minCount=min_count,bucket=bucket, loss=loss, epoch=epoch, wordNgrams=word_ngrams)#lr=lr, 
                                       #lr=lr,bucket=bucket,
    
    # Predict on the testing set
    # Initialize an empty list to store the predicted labels
    y_pred_list = []

    # Iterate over each data point in X_test and predict the label
    for text in X_test:
        # Predict the label for the current text
        predicted_label, _ = model.predict(text)
        # Append the predicted label to the list
        y_pred_list.append(predicted_label[0])

    # Convert the list of predicted labels to a pandas Series
    y_pred = pd.Series(y_pred_list)
    
    # Extract the numeric label using a lambda function
    y_test = y_test.apply(lambda x: int(x.split("__label__")[1]))  ## uncomment for next run
    y_pred = y_pred.apply(lambda x: int(x.split("__label__")[1]))
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred.round())
    precision = precision_score(y_test, y_pred.round(), average='weighted')
    recall = recall_score(y_test, y_pred.round(), average='weighted')
    f1 = f1_score(y_test, y_pred.round(), average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred.round())
    
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred.round(), output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred.round()))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF4',
    'model' : 'Fast Text - train_supervised',
    'data balancing technique' : 'Random Over Sampler',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.73
Mean Precision: 0.75
Mean Recall: 0.73
Mean F1-Score: 0.73


In [16]:
combined_metrics[(combined_metrics['data balancing technique'] == 'Random Over Sampler') & (combined_metrics['dataset'] == 'DF4')]#.head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
20,DF4,Fast Text - train_supervised,Random Over Sampler,1,0.865672,0.606061,0.816901,0.689655,0.84058,0.645161,71.0,29.0,58,9,20,13
21,DF4,Fast Text - train_supervised,Random Over Sampler,2,0.842857,0.633333,0.842857,0.633333,0.842857,0.633333,70.0,30.0,59,11,19,11
22,DF4,Fast Text - train_supervised,Random Over Sampler,3,0.765432,0.684211,0.911765,0.40625,0.832215,0.509804,68.0,32.0,62,19,13,6
23,DF4,Fast Text - train_supervised,Random Over Sampler,4,0.804878,0.611111,0.90411,0.407407,0.851613,0.488889,73.0,27.0,66,16,11,7
24,DF4,Fast Text - train_supervised,Random Over Sampler,5,0.830769,0.514286,0.760563,0.62069,0.794118,0.5625,71.0,29.0,54,11,18,17
25,DF4,Fast Text - train_supervised,Random Over Sampler,6,0.722222,0.642857,0.83871,0.473684,0.776119,0.545455,62.0,38.0,52,20,18,10
26,DF4,Fast Text - train_supervised,Random Over Sampler,7,0.824324,0.384615,0.792208,0.434783,0.807947,0.408163,77.0,23.0,61,13,10,16
27,DF4,Fast Text - train_supervised,Random Over Sampler,8,0.861538,0.657143,0.823529,0.71875,0.842105,0.686567,68.0,32.0,56,9,23,12
28,DF4,Fast Text - train_supervised,Random Over Sampler,9,0.861111,0.535714,0.826667,0.6,0.843537,0.566038,75.0,25.0,62,10,15,13
29,DF4,Fast Text - train_supervised,Random Over Sampler,10,0.742857,0.566667,0.8,0.485714,0.77037,0.523077,65.0,35.0,52,18,17,13


## Performance metrics for the Linear Regression

In [17]:
combined_metrics

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
0,DF4,Fast Text - train_supervised,,1,0.855072,0.612903,0.830986,0.655172,0.842857,0.633333,71.0,29.0,59,10,19,12
1,DF4,Fast Text - train_supervised,,2,0.818182,0.695652,0.9,0.533333,0.857143,0.603774,70.0,30.0,63,14,16,7
2,DF4,Fast Text - train_supervised,,3,0.759036,0.705882,0.926471,0.375,0.834437,0.489796,68.0,32.0,63,20,12,5
3,DF4,Fast Text - train_supervised,,4,0.809524,0.6875,0.931507,0.407407,0.866242,0.511628,73.0,27.0,68,16,11,5
4,DF4,Fast Text - train_supervised,,5,0.814286,0.533333,0.802817,0.551724,0.808511,0.542373,71.0,29.0,57,13,16,14
5,DF4,Fast Text - train_supervised,,6,0.702703,0.615385,0.83871,0.421053,0.764706,0.5,62.0,38.0,52,22,16,10
6,DF4,Fast Text - train_supervised,,7,0.807692,0.363636,0.818182,0.347826,0.812903,0.355556,77.0,23.0,63,15,8,14
7,DF4,Fast Text - train_supervised,,8,0.828571,0.666667,0.852941,0.625,0.84058,0.645161,68.0,32.0,58,12,20,10
8,DF4,Fast Text - train_supervised,,9,0.858974,0.636364,0.893333,0.56,0.875817,0.595745,75.0,25.0,67,11,14,8
9,DF4,Fast Text - train_supervised,,10,0.733333,0.6,0.846154,0.428571,0.785714,0.5,65.0,35.0,55,20,15,10


In [18]:
# Assuming df is your DataFrame
combined_metrics.to_csv('Output Data/Fast Text - train_supervised2.csv')