In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from pprint import pprint as pp
import csv
from pathlib import Path
import seaborn as sns
from itertools import product
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN
import fasttext

import gensim
from gensim import corpora

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [4]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
from gensim.models import FastText
import numpy as np
np.random.seed(42)

In [5]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_predict

In [6]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [7]:
pd.set_option('display.max_columns', None)

### Retrieving the data

#### DF 7

In [8]:
#df7 = pd.read_csv('df7.csv')
df7 = pd.read_csv('df7.csv', index_col=0)

In [9]:
df7.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target,content
0,A11,A20,A34,A43,A51,A65,A75,A84,A93,A101,A114,A121,A139,A143,A152,A162,A173,A181,A192,A201,__label__1,__label__1 A11 A20 A34 A43 A51 A65 A75 A84 A93...
1,A12,A27,A32,A43,A58,A61,A73,A82,A92,A101,A112,A121,A130,A143,A152,A161,A173,A181,A191,A201,__label__2,__label__2 A12 A27 A32 A43 A58 A61 A73 A82 A92...


# Models

In [10]:
# Initialize lists to store evaluation metrics for each fold
dataset_used = []
model_used = []
data_balancing_technique = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
conf_matrices = []

In [11]:
#combined_metrics = pd.DataFrame()

combined_metrics = pd.DataFrame(columns=['dataset', 'model', 'data balancing technique', 'fold', 'precision_1','precision_2','recall_1','recall_2','f1-score_1','f1-score_2','support_1','support_2','TP','FP','TN','FN'])

# Fast Text 
### Linear Regression

## DF7

In [12]:
# Assuming 'df7' is your DataFrame with features and the target column
features = df7['content'].apply(lambda x: x.split(' ', 1)[1])  # Drop the target column to get the feature columns
target = df7['target'].apply(lambda x: int(x.split("__label__")[1]))  # Target column to predict

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    X_train = X_train.apply(preprocess)
    X_test = X_test.apply(preprocess)

    sentences = [sentence.split() for sentence in X_train]
    
    # Train the FastText model
    fasttext_model = FastText(sentences, vector_size=100, window=20, min_count=2, workers=4, seed=42)
    
    def vectorize(sentence):
        words = sentence.split()
        words_vecs = [fasttext_model.wv[word] for word in words if word in fasttext_model.wv]
        if len(words_vecs) == 0:
            return np.zeros(100)
        words_vecs = np.array(words_vecs)
        return words_vecs.mean(axis=0)

    X_train = np.array([vectorize(sentence) for sentence in X_train])
    X_test = np.array([vectorize(sentence) for sentence in X_test])
    
    #clf = LogisticRegression()
    clf = LinearRegression()
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    y_pred = np.round(y_pred)
    
    y_pred[y_pred <= 0] = 1
    y_pred[y_pred >= 2] = 2
    
    # Model evaluation
    accuracy = accuracy_score(y_test, y_pred.round())
    precision = precision_score(y_test, y_pred.round(), average='weighted')
    recall = recall_score(y_test, y_pred.round(), average='weighted')
    f1 = f1_score(y_test, y_pred.round(), average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred.round())
    
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test, y_pred.round(), output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred.round()))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF7',
    'model' : 'Fast Text - Linear Regression',
    'data balancing technique' : 'None',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.72
Mean Precision: 0.72
Mean Recall: 0.72
Mean F1-Score: 0.71


In [13]:
combined_metrics[(combined_metrics['data balancing technique'] == 'None') & (combined_metrics['dataset'] == 'DF7')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
0,DF7,Fast Text - Linear Regression,,1,0.828571,0.566667,0.816901,0.586207,0.822695,0.576271,71.0,29.0,58,12,17,13
1,DF7,Fast Text - Linear Regression,,2,0.797101,0.516129,0.785714,0.533333,0.791367,0.52459,70.0,30.0,55,14,16,15


### FastText Linear Regression with df7 data and imbalance data tackling (RandomUnderSampler)

In [14]:
# Assuming 'df7' is your DataFrame with features and the target column
features = df7['content'].apply(lambda x: x.split(' ', 1)[1])  # Drop the target column to get the feature columns
target = df7['target'].apply(lambda x: int(x.split("__label__")[1]))  # Target column to predict

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    X_train = X_train.apply(preprocess)
    X_test = X_test.apply(preprocess)

    sentences = [sentence.split() for sentence in X_train]
    
    # Train the FastText model
    fasttext_model = FastText(sentences, vector_size=100, window=20, min_count=2, workers=4, seed=42)
    
    def vectorize(sentence):
        words = sentence.split()
        words_vecs = [fasttext_model.wv[word] for word in words if word in fasttext_model.wv]
        if len(words_vecs) == 0:
            return np.zeros(100)
        words_vecs = np.array(words_vecs)
        return words_vecs.mean(axis=0)

    X_train = np.array([vectorize(sentence) for sentence in X_train])
    X_test = np.array([vectorize(sentence) for sentence in X_test])
    
    # Apply random undersampling
    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    
    #clf = LogisticRegression()
    clf = LinearRegression()
    clf.fit(X_train_resampled, y_train_resampled)
    
    # Apply random undersampling to test data
    X_test_resampled, y_test_resampled = rus.fit_resample(X_test, y_test)
    
    y_pred = clf.predict(X_test_resampled)
    
    y_pred = np.round(y_pred)
    
    y_pred[y_pred <= 0] = 1
    y_pred[y_pred >= 2] = 2
    
    # Model evaluation
    accuracy = accuracy_score(y_test_resampled, y_pred.round())
    precision = precision_score(y_test_resampled, y_pred.round(), average='weighted')
    recall = recall_score(y_test_resampled, y_pred.round(), average='weighted')
    f1 = f1_score(y_test_resampled, y_pred.round(), average='weighted')
    conf_matrix = confusion_matrix(y_test_resampled, y_pred.round())
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test_resampled, y_pred.round(), output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test_resampled, y_pred.round()))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF7',
    'model' : 'Fast Text - Linear Regression',
    'data balancing technique' : 'Random Under Sampler',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.68
Mean Precision: 0.69
Mean Recall: 0.68
Mean F1-Score: 0.67


In [15]:
combined_metrics[(combined_metrics['data balancing technique'] == 'Random Under Sampler') & (combined_metrics['dataset'] == 'DF7')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
10,DF7,Fast Text - Linear Regression,Random Under Sampler,1,0.842105,0.666667,0.551724,0.896552,0.666667,0.764706,29.0,29.0,16,3,26,13
11,DF7,Fast Text - Linear Regression,Random Under Sampler,2,0.648649,0.73913,0.8,0.566667,0.716418,0.641509,30.0,30.0,24,13,17,6


### FastText Linear Regression with df7 data and imbalance data tackling (RandomOverSampler)

In [16]:
# Assuming 'df7' is your DataFrame with features and the target column
features = df7['content'].apply(lambda x: x.split(' ', 1)[1])  # Drop the target column to get the feature columns
target = df7['target'].apply(lambda x: int(x.split("__label__")[1]))  # Target column to predict

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Initialize KFold with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 1

# Perform k-fold cross-validation
for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    X_train = X_train.apply(preprocess)
    X_test = X_test.apply(preprocess)

    sentences = [sentence.split() for sentence in X_train]
    
    # Train the FastText model
    fasttext_model = FastText(sentences, vector_size=100, window=20, min_count=2, workers=4, seed=42)
    
    def vectorize(sentence):
        words = sentence.split()
        words_vecs = [fasttext_model.wv[word] for word in words if word in fasttext_model.wv]
        if len(words_vecs) == 0:
            return np.zeros(100)
        words_vecs = np.array(words_vecs)
        return words_vecs.mean(axis=0)

    X_train = np.array([vectorize(sentence) for sentence in X_train])
    X_test = np.array([vectorize(sentence) for sentence in X_test])
    
    # Apply random undersampling
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    
    #clf = LogisticRegression()
    clf = LinearRegression()
    clf.fit(X_train_resampled, y_train_resampled)
    
    # Apply random undersampling to test data
    X_test_resampled, y_test_resampled = rus.fit_resample(X_test, y_test)
    
    y_pred = clf.predict(X_test_resampled)
    
    y_pred = np.round(y_pred)
    
    y_pred[y_pred <= 0] = 1
    y_pred[y_pred >= 2] = 2
    
    # Model evaluation
    accuracy = accuracy_score(y_test_resampled, y_pred.round())
    precision = precision_score(y_test_resampled, y_pred.round(), average='weighted')
    recall = recall_score(y_test_resampled, y_pred.round(), average='weighted')
    f1 = f1_score(y_test_resampled, y_pred.round(), average='weighted')
    conf_matrix = confusion_matrix(y_test_resampled, y_pred.round())
    
    # Convert classification report to DataFrame
    report_df = pd.DataFrame(classification_report(y_test_resampled, y_pred.round(), output_dict=True)).transpose()
    # Convert confusion matrix to DataFrame
    matrix_df = pd.DataFrame(confusion_matrix(y_test_resampled, y_pred.round()))
    
    # Extract metrics for class 1
    metrics_1 = report_df.loc['1', ['precision', 'recall', 'f1-score', 'support']]

    # Extract metrics for class 2
    metrics_2 = report_df.loc['2', ['precision', 'recall', 'f1-score', 'support']]

    # Extract TP, TN, FP, FN counts from the confusion matrix DataFrame
    TP = matrix_df.loc[0, 0]
    TN = matrix_df.loc[1, 1]
    FP = matrix_df.loc[1, 0]
    FN = matrix_df.loc[0, 1]
    
    new_metric_row = {
    'dataset': 'DF7',
    'model' : 'Fast Text - Linear Regression',
    'data balancing technique' : 'Random Over Sampler',
    'fold' : i,
    'precision_1': metrics_1['precision'],
    'precision_2': metrics_2['precision'],
    'recall_1': metrics_1['recall'],
    'recall_2': metrics_2['recall'],
    'f1-score_1': metrics_1['f1-score'],
    'f1-score_2': metrics_2['f1-score'],
    'support_1': metrics_1['support'],
    'support_2': metrics_2['support'],
    'TP' : TP,
    'FP' : FP,
    'TN' : TN,
    'FN' : FN
    }
    
    i = i + 1
    
    #combined_metrics = combined_metrics.append(new_metric_row, ignore_index=True)
    combined_metrics.loc[len(combined_metrics)] = new_metric_row
    
    
    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean evaluation metrics across all folds
mean_accuracy = np.mean(accuracy_scores) #sum(accuracy_scores) / len(accuracy_scores)
mean_precision = np.mean(precision_scores) #sum(precision_scores) / len(precision_scores)
mean_recall = np.mean(recall_scores) #sum(recall_scores) / len(recall_scores)
mean_f1 = np.mean(f1_scores) #sum(f1_scores) / len(f1_scores)

print('Mean Accuracy: {:.2f}'.format(mean_accuracy))
print('Mean Precision: {:.2f}'.format(mean_precision))
print('Mean Recall: {:.2f}'.format(mean_recall))
print('Mean F1-Score: {:.2f}'.format(mean_f1))

Mean Accuracy: 0.67
Mean Precision: 0.68
Mean Recall: 0.67
Mean F1-Score: 0.66


In [17]:
combined_metrics[(combined_metrics['data balancing technique'] == 'Random Over Sampler') & (combined_metrics['dataset'] == 'DF7')].head(2)

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
20,DF7,Fast Text - Linear Regression,Random Over Sampler,1,0.782609,0.685714,0.62069,0.827586,0.692308,0.75,29.0,29.0,18,5,24,11
21,DF7,Fast Text - Linear Regression,Random Over Sampler,2,0.52,0.6,0.866667,0.2,0.65,0.3,30.0,30.0,26,24,6,4


## Performance metrics for the Linear Regression

In [18]:
combined_metrics

Unnamed: 0,dataset,model,data balancing technique,fold,precision_1,precision_2,recall_1,recall_2,f1-score_1,f1-score_2,support_1,support_2,TP,FP,TN,FN
0,DF7,Fast Text - Linear Regression,,1,0.828571,0.566667,0.816901,0.586207,0.822695,0.576271,71.0,29.0,58,12,17,13
1,DF7,Fast Text - Linear Regression,,2,0.797101,0.516129,0.785714,0.533333,0.791367,0.52459,70.0,30.0,55,14,16,15
2,DF7,Fast Text - Linear Regression,,3,0.732558,0.642857,0.926471,0.28125,0.818182,0.391304,68.0,32.0,63,23,9,5
3,DF7,Fast Text - Linear Regression,,4,0.76087,0.625,0.958904,0.185185,0.848485,0.285714,73.0,27.0,70,22,5,3
4,DF7,Fast Text - Linear Regression,,5,0.765432,0.526316,0.873239,0.344828,0.815789,0.416667,71.0,29.0,62,19,10,9
5,DF7,Fast Text - Linear Regression,,6,0.705882,0.469388,0.580645,0.605263,0.637168,0.528736,62.0,38.0,36,15,23,26
6,DF7,Fast Text - Linear Regression,,7,0.837838,0.423077,0.805195,0.478261,0.821192,0.44898,77.0,23.0,62,12,11,15
7,DF7,Fast Text - Linear Regression,,8,0.7875,0.75,0.926471,0.46875,0.851351,0.576923,68.0,32.0,63,17,15,5
8,DF7,Fast Text - Linear Regression,,9,0.853333,0.56,0.853333,0.56,0.853333,0.56,75.0,25.0,64,11,14,11
9,DF7,Fast Text - Linear Regression,,10,0.725,0.65,0.892308,0.371429,0.8,0.472727,65.0,35.0,58,22,13,7


In [19]:
# Assuming df is your DataFrame
combined_metrics.to_csv('Output Data/Fast Text - Linear Regression.csv')