# Table of Contents <a class = "anchor" id = "toc"></a>
* [Data Importation](#data_impo)
* [Feature Engineering](#featureeng)
* [Classification Models](#models)
* [Evaluation](#evaluation)

# Imports

In [65]:
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('punkt')
nltk.download('opinion_lexicon')
nltk.download('wordnet')
nltk.download('stopwords')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import pickle

from tqdm import tqdm
from gensim.models import Word2Vec

from sklearn.metrics import roc_curve, roc_auc_score, f1_score, classification_report, auc, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold, train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertModel
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.models import Model

from sentence_transformers import SentenceTransformer
from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import emoji
import langid
import regex
import string

import re
from tqdm import tqdm
import regex
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import hstack
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/catarinasilva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/catarinasilva/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/catarinasilva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/catarinasilva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Importation <a class = "anchor" id = "data_impo"></a>
[Back to the Table of Contents](#toc)

In [6]:
# Review-level text; the train file carries `index` and `cleaned_comments` columns (used below).
# NOTE(review): the test file presumably has the same layout -- confirm.
train_reviews = pd.read_csv("../data/cleaned/train_reviews_vf.csv")
test_reviews = pd.read_csv("../data/cleaned/test_reviews_vf.csv")

# Listing-level text (cleaned description / host text keyed by `index`).
train= pd.read_csv("../data/cleaned/train_vf.csv")
test= pd.read_csv("../data/cleaned/test_vf.csv")

# Airbnb exports; `airbnb_train` supplies the `unlisted` target joined in below.
# The remaining three frames are not used in the cells visible in this section.
airbnb_train = pd.read_csv('../data/cleaned/airbnb_train_vf.csv')
airbnb_train_reviews = pd.read_csv('../data/cleaned/airbnb_train_reviews_cleaned.csv')
airbnb_test = pd.read_csv('../data/cleaned/airbnb_test_vf.csv')
airbnb_test_reviews = pd.read_csv('../data/cleaned/airbnb_test_reviews_cleaned.csv')

# Feature Engineering <a class = "anchor" id = "featureeng"></a>
[Back to the Table of Contents](#toc)

### Feature extraction 

Bag-of-Words (BoW): Represent each review as a vector of word frequencies.

TF-IDF (Term Frequency-Inverse Document Frequency): Weigh the importance of words based on their frequency in the document and across all documents.

Word embeddings: Represent words as dense vectors in a high-dimensional space.

#### joining the datasets

In [3]:
# Collapse the review rows into one space-separated comment string per listing `index`.
train_reviews_joined = (
    train_reviews
    .groupby('index')['cleaned_comments']
    .apply(' '.join)
    .reset_index()
)
train_reviews_joined.head()

Unnamed: 0,index,cleaned_comments
0,1,cozy comfortable house stay never worry safety...
1,5,très bel appartemer magnifique voir plage mer ...
2,8,shani helpful throughout process thank answeri...
3,9,passer très bon séjour chez isabel helder appa...
4,10,outpost beautiful place stay azenhas mar prope...


In [7]:
# Outer merge keeps every listing; listings with no row in train_reviews_joined get NaN comments.
train_joined = pd.merge(train, train_reviews_joined, on='index', how='outer')
# NOTE(review): DataFrame.join aligns on the row index, not on the `index` column, so the
# labels are attached by row position. This assumes airbnb_train has the same row order as
# the merged frame -- TODO confirm, or merge on an explicit key instead.
train_joined = train_joined.join(airbnb_train[['unlisted']], how='outer')
train_joined.head()

Unnamed: 0.1,Unnamed: 0,cleaned_description,cleaned_host,index,cleaned_comments,unlisted
0,0,shared mixed room hostel shared bathroom locat...,alojamento local registro,1,cozy comfortable house stay never worry safety...,0
1,1,espaço ficar perto parque eduardo Vii saldanha...,friendly host try always around need anything ...,2,,1
2,2,trafaria house cozy familiar villa facility ne...,social person liking communicate reading trave...,3,,1
3,3,apartamento charmoso chiar largo Carmo travess...,hello portuguese love meet people around word ...,4,,0
4,4,joli appartemer bordure mer min avoir pied pla...,famille deux enfant an habité pendant plusieur...,5,très bel appartemer magnifique voir plage mer ...,0


In [8]:
# Missing values per column; NaN comments are expected for listings without any merged review.
train_joined.isna().sum()

Unnamed: 0                0
cleaned_description       0
cleaned_host              0
index                     0
cleaned_comments       2012
unlisted                  0
dtype: int64

In [9]:
# Listings without reviews get the placeholder token 'empty' so the vectorizers receive a string.
# The result is assigned back instead of calling fillna(inplace=True) on the column selection:
# that form is chained assignment, which pandas 2.x deprecates and which leaves the frame
# unchanged under Copy-on-Write (the FutureWarning is hidden by warnings.filterwarnings).
train_joined['cleaned_comments'] = train_joined['cleaned_comments'].fillna('empty')

In [10]:
# Use the listing identifier as the row index. The change is made in place and removes the
# `index` column, so this cell cannot be re-run without rebuilding train_joined first.
train_joined.set_index('index', inplace=True)
train_joined.head()

Unnamed: 0_level_0,Unnamed: 0,cleaned_description,cleaned_host,cleaned_comments,unlisted
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,shared mixed room hostel shared bathroom locat...,alojamento local registro,cozy comfortable house stay never worry safety...,0
2,1,espaço ficar perto parque eduardo Vii saldanha...,friendly host try always around need anything ...,empty,1
3,2,trafaria house cozy familiar villa facility ne...,social person liking communicate reading trave...,empty,1
4,3,apartamento charmoso chiar largo Carmo travess...,hello portuguese love meet people around word ...,empty,0
5,4,joli appartemer bordure mer min avoir pied pla...,famille deux enfant an habité pendant plusieur...,très bel appartemer magnifique voir plage mer ...,0


In [11]:
# Target vector: the `unlisted` flag, indexed by listing id.
y_final = train_joined['unlisted']

In [12]:
# Quick look at the target series (values and length).
y_final

index
1       0
2       1
3       1
4       0
5       0
       ..
6244    0
6245    0
6246    1
6247    0
6248    0
Name: unlisted, Length: 6248, dtype: int64

In [13]:
# Remove the target from the feature frame (in place; re-running raises KeyError once it is gone).
train_joined.drop('unlisted', axis=1, inplace=True)

#### Train-test split (to be able to evaluate the results)

In [14]:
# Hold out 25% of the listings for validation; stratify on the target so both parts keep the
# same class ratio, and fix the seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_joined, y_final, stratify=y_final, random_state=42, test_size=0.25
)

In [15]:
# Sanity check: feature and target row counts must agree within each part of the split.
print(f"Train shapes: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Validation shapes: X_val: {X_val.shape}, y_val: {y_val.shape}")

Train shapes: X_train: (4686, 4), y_train: (4686,)
Validation shapes: X_val: (1562, 4), y_val: (1562,)


In [16]:
# Short aliases for the three text columns of the training part, plus its target:
# X1 = guest comments, X2 = host text, X3 = listing description.
X1 = X_train['cleaned_comments']
X2 = X_train['cleaned_host']
X3 = X_train['cleaned_description']
y = y_train

In [17]:
# All three text columns and the target should have the same number of rows.
print(X1.shape)
print(X2.shape)
print(X3.shape)
print(y.shape)

(4686,)
(4686,)
(4686,)
(4686,)


In [18]:
# Class balance of the training target.
y.value_counts()

unlisted
0    3405
1    1281
Name: count, dtype: int64

In [20]:
# Shared 5-fold stratified splitter (shuffled, fixed seed) used by every feature comparison below.
kf = StratifiedKFold(n_splits=5, shuffle= True, random_state=42)

## BoW

In [25]:
# One independent CountVectorizer per text column, so each column gets its own vocabulary.
bow_vectorizer1, bow_vectorizer2, bow_vectorizer3 = (CountVectorizer() for _ in range(3))

# Per-fold scores for the BoW representation, one list per metric.
results_bow = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1')}

In [32]:
# Perform BoW feature extraction for each feature
# NOTE(review): the vectorizers are fitted on all of X1/X2/X3 before the folds are made, so
# every fold's vocabulary has already seen that fold's held-out rows.
X1_bow = bow_vectorizer1.fit_transform(X1)
X2_bow = bow_vectorizer2.fit_transform(X2)
X3_bow = bow_vectorizer3.fit_transform(X3)

# Combine BoW features. CSR is requested explicitly because the folds below select rows with
# integer index arrays, which needs a row-indexable sparse format rather than whatever
# hstack picks by default.
X_combined_bow = hstack([X1_bow, X2_bow, X3_bow], format='csr')

for train_index, test_index in kf.split(X_combined_bow,y):
    # Split data (positional indices, hence .iloc on the target Series)
    X_train_, X_test_ = X_combined_bow[train_index], X_combined_bow[test_index]
    y_train_, y_test_ = y.iloc[train_index], y.iloc[test_index]

    # Model Training and Evaluation
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_, y_train_)
    y_pred = model.predict(X_test_)

    # Calculate evaluation metrics
    results_bow['accuracy'].append(accuracy_score(y_test_, y_pred))
    results_bow['precision'].append(precision_score(y_test_, y_pred))
    results_bow['recall'].append(recall_score(y_test_, y_pred))
    results_bow['f1'].append(f1_score(y_test_, y_pred))

In [33]:
# Average each metric over the folds collected in results_bow.
print(f"Mean Accuracy for BoW: {np.mean(results_bow['accuracy'])}")
print(f"Mean Precision for BoW: {np.mean(results_bow['precision'])}")
print(f"Mean Recall for BoW: {np.mean(results_bow['recall'])}")
print(f"Mean F1 Score for BoW: {np.mean(results_bow['f1'])}")

Mean Accuracy for BoW: 0.8404466136635447
Mean Precision for BoW: 0.7005094339773551
Mean Recall for BoW: 0.7330222519455253
Mean F1 Score for BoW: 0.7169515389423088


## TF-IDF

In [34]:
# One independent TfidfVectorizer per text column, so each column gets its own vocabulary and IDF weights.
tfidf_vectorizer1, tfidf_vectorizer2, tfidf_vectorizer3 = (TfidfVectorizer() for _ in range(3))

# Per-fold scores for the TF-IDF representation, one list per metric.
results_tfidf = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1')}

In [35]:
# Perform TF-IDF feature extraction for each feature
# NOTE(review): the vectorizers are fitted on all of X1/X2/X3 before the folds are made, so
# every fold's vocabulary and IDF weights have already seen that fold's held-out rows.
X1_tfidf = tfidf_vectorizer1.fit_transform(X1)
X2_tfidf = tfidf_vectorizer2.fit_transform(X2)
X3_tfidf = tfidf_vectorizer3.fit_transform(X3)

# Combine TF-IDF features. CSR is requested explicitly because the folds below select rows
# with integer index arrays, which needs a row-indexable sparse format rather than whatever
# hstack picks by default. This matrix is also saved and reused by the model searches later.
X_combined_tfidf = hstack([X1_tfidf, X2_tfidf, X3_tfidf], format='csr')

# Loop through the kfold splits
for train_index, test_index in kf.split(X_combined_tfidf,y):
    # Split data (positional indices, hence .iloc on the target Series)
    X_train_, X_test_ = X_combined_tfidf[train_index], X_combined_tfidf[test_index]
    y_train_, y_test_ = y.iloc[train_index], y.iloc[test_index]

    # Model Training and Evaluation
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_, y_train_)
    y_pred = model.predict(X_test_)

    # Calculate evaluation metrics
    results_tfidf['accuracy'].append(accuracy_score(y_test_, y_pred))
    results_tfidf['precision'].append(precision_score(y_test_, y_pred))
    results_tfidf['recall'].append(recall_score(y_test_, y_pred))
    results_tfidf['f1'].append(f1_score(y_test_, y_pred))

# Calculate average evaluation metrics
avg_accuracy = np.mean(results_tfidf['accuracy'])
avg_precision = np.mean(results_tfidf['precision'])
avg_recall = np.mean(results_tfidf['recall'])
avg_f1 = np.mean(results_tfidf['f1'])

In [36]:
# Average each metric over the folds collected in results_tfidf.
print(f"Mean Accuracy for TF-IDF: {np.mean(results_tfidf['accuracy'])}")
print(f"Mean Precision for TF-IDF: {np.mean(results_tfidf['precision'])}")
print(f"Mean Recall for TF-IDF: {np.mean(results_tfidf['recall'])}")
print(f"Mean F1 Score for TF-IDF: {np.mean(results_tfidf['f1'])}")

Mean Accuracy for TF-IDF: 0.8819880624321599
Mean Precision for TF-IDF: 0.7586289449342438
Mean Recall for TF-IDF: 0.8345117947470817
Mean F1 Score for TF-IDF: 0.7944333556378655


## Word2Vec

In [37]:
# Per-fold scores for the Word2Vec representation, one independent list per metric.
results_word2vec = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1')}

In [38]:
def train_word2vec_model(data):
    """Fit a Word2Vec model on tokenised documents and return it.

    Args:
        data: Iterable of token lists, one list per document.

    Returns:
        The trained gensim Word2Vec model (100-dimensional vectors, context window of 5,
        every token kept because min_count=1, four worker threads).
    """
    return Word2Vec(data, vector_size=100, window=5, min_count=1, workers=4)

# Convert text to Word2Vec vectors
def text_to_word2vec(text, model):
    """Return the average Word2Vec vector of the whitespace-separated tokens in `text`.

    Tokens missing from `model.wv` are skipped. If no token is known (including an empty
    string), the all-zero vector of length `model.vector_size` is returned.
    """
    known_vectors = [model.wv[token] for token in text.split() if token in model.wv]

    # Accumulate into a fresh zero array, one vector at a time.
    total = np.zeros(model.vector_size)
    for token_vector in known_vectors:
        total += token_vector

    if not known_vectors:
        return total
    return total / len(known_vectors)

# Fit one Word2Vec model per text column (comments, host, description), on whitespace tokens.
model1, model2, model3 = (
    train_word2vec_model([doc.split() for doc in column])
    for column in (X1, X2, X3)
)

# Turn every document into the average of its word vectors, using its own column's model.
X1_word2vec, X2_word2vec, X3_word2vec = (
    np.array([text_to_word2vec(doc, w2v) for doc in column])
    for column, w2v in ((X1, model1), (X2, model2), (X3, model3))
)

# Place the three per-column blocks side by side: one row per listing.
X_word2vec = np.hstack((X1_word2vec, X2_word2vec, X3_word2vec))

In [39]:
# A single estimator instance is refitted from scratch on each fold's training part.
classifier = LogisticRegression(max_iter = 1000)

# NumPy copy of the target so the positional fold indices can be used directly.
y_array = y.to_numpy()

# Perform cross-validation
accuracies = []
for train_index, test_index in kf.split(X_word2vec, y_array):
    X_train_ = X_word2vec[train_index]
    X_test_ = X_word2vec[test_index]
    y_train_ = y_array[train_index]
    y_test_ = y_array[test_index]

    # Fit on the training part and predict the held-out part
    classifier.fit(X_train_, y_train_)
    y_pred = classifier.predict(X_test_)

    # Score the fold with the four metrics, in a fixed order
    accuracy, precision, recall, f1 = (
        score(y_test_, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Record the fold scores
    results_word2vec['accuracy'].append(accuracy)
    results_word2vec['precision'].append(precision)
    results_word2vec['recall'].append(recall)
    results_word2vec['f1'].append(f1)

In [40]:
# Calculate the average metrics across all folds
# (each list in results_word2vec holds one score per CV fold)
average_accuracy = np.mean(results_word2vec['accuracy'])
average_precision = np.mean(results_word2vec['precision'])
average_recall = np.mean(results_word2vec['recall'])
average_f1 = np.mean(results_word2vec['f1'])

print("Average Accuracy:", average_accuracy)
print("Average Precision:", average_precision)
print("Average Recall:", average_recall)
print("Average F1-score:", average_f1)

Average Accuracy: 0.8734526786709841
Average Precision: 0.7372060972650913
Average Recall: 0.8360712548638134
Average F1-score: 0.7832417324156268


## Embeddings - BERT

In [43]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

def embedding(texts, batch_size=8):
    """Embed texts with multilingual BERT, returning one mean-pooled vector per text.

    Reads the module-level `tokenizer` and `model` at call time, so it must be called
    while `model` still refers to the BertModel loaded above.

    Args:
        texts: Sliceable sequence of strings, e.g. a list.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A list of NumPy vectors, one per input text, in input order.
    """
    # The notebook's import cell only pulls names out of torch.utils.data, so the bare
    # `torch` name used below is imported here.
    import torch

    embeddings = []
    # Initialize tqdm to create the progress bar
    with tqdm(total=len(texts), desc="Embedding Progress") as pbar:
        for i in range(0, len(texts), batch_size):
            # Hand the tokenizer a plain list of strings, whatever container was sliced.
            batch = list(texts[i:i+batch_size])
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
            with torch.no_grad():  # Disable gradient calculation
                outputs = model(**inputs)
            # Mean pooling over every token position.
            # NOTE(review): with padding=True the padded positions are averaged in as well;
            # an attention-mask-weighted mean would exclude them.
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings.extend(batch_embeddings.cpu().numpy())  # Move to CPU and convert to numpy array
            # Update the progress bar
            pbar.update(len(batch))
    return embeddings

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

#### X's

#### Saving embedding results

#### load

In [49]:
# Load the cached BERT embeddings for X1 (comments) and keep a list-of-lists copy.
# NOTE(review): presumably written by a removed cell using embedding(); confirm the rows
# still match the current X_train order.
X1_emb_array = np.load('../project_data/X1_emb.npy')
X1_emb = X1_emb_array.tolist()

In [50]:
# Load the cached BERT embeddings for X2 (host text) and keep a list-of-lists copy.
X2_emb_array = np.load('../project_data/X2_emb.npy')
X2_emb = X2_emb_array.tolist()

In [51]:
# Load the cached BERT embeddings for X3 (description) and keep a list-of-lists copy.
X3_emb_array = np.load('../project_data/X3_emb.npy')
X3_emb = X3_emb_array.tolist()

### Evaluating the embeddings

In [52]:
# Per-fold scores for the BERT-embedding representation, one independent list per metric.
results_emb = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1')}

In [53]:
# Place the three per-column embedding blocks side by side: one row per listing.
combined_embeddings = np.concatenate([X1_emb, X2_emb, X3_emb], axis=1)

In [54]:
# Shape check: rows should equal the number of training listings.
combined_embeddings.shape

(4686, 2304)

In [55]:
# NumPy copy of the target so the positional fold indices can be used directly.
y_array = y.to_numpy()

for train_index, test_index in kf.split(combined_embeddings, y_array):
    X_train_, X_test_ = combined_embeddings[train_index], combined_embeddings[test_index]
    y_train_, y_test_ = y_array[train_index], y_array[test_index]

    # Train a logistic regression model. It is bound to `clf` (as in the LaBSE evaluation)
    # rather than `model`, so the global `model` that embedding() reads is not replaced
    # by a classifier.
    clf = LogisticRegression(max_iter = 1000)
    clf.fit(X_train_, y_train_)

    # Predict on the test set
    y_pred = clf.predict(X_test_)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test_, y_pred)
    precision = precision_score(y_test_, y_pred)
    recall = recall_score(y_test_, y_pred)
    f1 = f1_score(y_test_, y_pred)

    # Store evaluation metrics
    results_emb['accuracy'].append(accuracy)
    results_emb['precision'].append(precision)
    results_emb['recall'].append(recall)
    results_emb['f1'].append(f1)

In [56]:
# Mean and standard deviation of each metric across the CV folds, rounded to 3 decimals.
print("Mean Accuracy:", round(np.mean(results_emb['accuracy']), 3))
print("Standard Deviation of Accuracy:", round(np.std(results_emb['accuracy']), 3))
print()
print("Mean Precision:", round(np.mean(results_emb['precision']), 3))
print("Standard Deviation of Precision:", round(np.std(results_emb['precision']), 3))
print()
print("Mean Recall:", round(np.mean(results_emb['recall']), 3))
print("Standard Deviation of Recall:", round(np.std(results_emb['recall']), 3))
print()
print("Mean F1 Score:", round(np.mean(results_emb['f1']), 3))
print("Standard Deviation of F1 Score:", round(np.std(results_emb['f1']), 3))

Mean Accuracy: 0.861
Standard Deviation of Accuracy: 0.007

Mean Precision: 0.751
Standard Deviation of Precision: 0.018

Mean Recall: 0.738
Standard Deviation of Recall: 0.011

Mean F1 Score: 0.744
Standard Deviation of F1 Score: 0.012


## LaBSE - sentence embedding
The LaBSE model from sentence-transformers is specifically optimized for creating high-quality sentence embeddings across multiple languages, and it includes all necessary preprocessing steps internally (padding and tokenization).

In [57]:
from sentence_transformers import SentenceTransformer

# Load LaBSE model (rebinds the global name `model`)
model = SentenceTransformer('sentence-transformers/LaBSE')

# Function to generate embeddings using LaBSE
def embedding(texts, batch_size=8):
    """Encode `texts` with the global LaBSE `model`, `batch_size` texts at a time.

    Returns a list holding one embedding per input text, in input order. This definition
    replaces any earlier function named `embedding` in the notebook.
    """
    embeddings = []
    # tqdm progress bar counted in texts, not batches
    with tqdm(total=len(texts), desc="Embedding Progress") as pbar:
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            embeddings.extend(model.encode(chunk, show_progress_bar=False))
            # Advance by the number of texts just encoded
            pbar.update(len(chunk))
    return embeddings

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

#### Saving embedding results

#### load

In [58]:
# Load the cached LaBSE embeddings for X1 (comments) and keep a list-of-lists copy.
# NOTE(review): presumably written by a removed cell using embedding(); confirm the rows
# still match the current X_train order.
X1_emb_array_labse = np.load('../project_data/X1_emb_array_labse.npy')
X1_emb_ = X1_emb_array_labse.tolist()

In [59]:
# Load the cached LaBSE embeddings for X2 (host text) and keep a list-of-lists copy.
X2_emb_array_labse = np.load('../project_data/X2_emb_array_labse.npy')
X2_emb_ = X2_emb_array_labse.tolist()

In [60]:
# Load the cached LaBSE embeddings for X3 (description) and keep a list-of-lists copy.
X3_emb_array_labse = np.load('../project_data/X3_emb_array_labse.npy')
X3_emb_ = X3_emb_array_labse.tolist()

In [61]:
# Feature matrix: the three LaBSE blocks side by side, one row per listing.
combined_embeddings_labse = np.concatenate([X1_emb_, X2_emb_, X3_emb_], axis=1)

# NumPy copy of the target so the positional fold indices can be used directly.
y_array = y.to_numpy()

# Fresh per-fold score lists (this rebinds `results_emb`, replacing any earlier contents).
results_emb = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1')}

# Cross-validation
for train_index, test_index in kf.split(combined_embeddings_labse, y_array):
    X_train_ = combined_embeddings_labse[train_index]
    X_test_ = combined_embeddings_labse[test_index]
    y_train_ = y_array[train_index]
    y_test_ = y_array[test_index]

    # Fit a fresh logistic regression on the training part (fit returns the estimator)
    clf = LogisticRegression(max_iter=1000).fit(X_train_, y_train_)

    # Predict the held-out part
    y_pred = clf.predict(X_test_)

    # Score the fold with the four metrics, in a fixed order
    accuracy, precision, recall, f1 = (
        score(y_test_, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Record the fold scores
    results_emb['accuracy'].append(accuracy)
    results_emb['precision'].append(precision)
    results_emb['recall'].append(recall)
    results_emb['f1'].append(f1)

# Print results: mean and standard deviation over the folds, rounded to 3 decimals
print("Mean Accuracy:", round(np.mean(results_emb['accuracy']), 3))
print("Standard Deviation of Accuracy:", round(np.std(results_emb['accuracy']), 3))
print()
print("Mean Precision:", round(np.mean(results_emb['precision']), 3))
print("Standard Deviation of Precision:", round(np.std(results_emb['precision']), 3))
print()
print("Mean Recall:", round(np.mean(results_emb['recall']), 3))
print("Standard Deviation of Recall:", round(np.std(results_emb['recall']), 3))
print()
print("Mean F1 Score:", round(np.mean(results_emb['f1']), 3))
print("Standard Deviation of F1 Score:", round(np.std(results_emb['f1']), 3))

Mean Accuracy: 0.877
Standard Deviation of Accuracy: 0.007

Mean Precision: 0.753
Standard Deviation of Precision: 0.015

Mean Recall: 0.818
Standard Deviation of Recall: 0.02

Mean F1 Score: 0.784
Standard Deviation of F1 Score: 0.012


##### Save data to test on models

In [68]:
# Persist the data artefacts (no fitted model is saved here) so the modelling steps can
# reload them: the combined TF-IDF matrix with its target ...
joblib.dump(X_combined_tfidf, '../project_data/X_combined_tfidf.joblib')
joblib.dump(y, '../project_data/y.joblib')

# ... the combined LaBSE embedding matrix with the target as a NumPy array ...
joblib.dump(combined_embeddings_labse, '../project_data/combined_embeddings_labse.joblib')
joblib.dump(y_array, '../project_data/y_array.joblib')

# ... and the raw train/validation split.
joblib.dump(X_train, '../project_data/X_train.joblib')
joblib.dump(X_val, '../project_data/X_val.joblib')
joblib.dump(y_train, '../project_data/y_train.joblib')
joblib.dump(y_val, '../project_data/y_val.joblib')

['../project_data/y_val.joblib']

### Logistic Regression - sklearn  (using TF-IDF)

This approach involves performing hyperparameter tuning using a randomized search with cross-validation for three different logistic regression models. Each logistic regression model has different hyperparameters defined in the hyperparameters list. Here's an explanation of each part:

**Models:** Three logistic regression configurations are defined: two use the `saga` solver (each paired with a different penalty search space) and one uses `liblinear`. This lets us compare how the solver and penalty combinations perform.

**Hyperparameters:** For each logistic regression model, a dictionary of hyperparameters is defined. These hyperparameters will be tuned during the randomized search. Each dictionary contains different combinations of hyperparameters to explore.

**Cross-validation:** Stratified k-fold cross-validation is used to evaluate the performance of each model and hyperparameter combination, as in the rest of the project. It ensures that each fold preserves the percentage of samples for each class.

**Randomized Search:** For each model, a RandomizedSearchCV object is created. It performs a randomized search over the hyperparameter space defined in the corresponding hyperparameters dictionary. The scoring parameter is set to 'f1', indicating that the F1 score will be used as the evaluation metric.

**Results:** After the randomized search is complete for each model, the best score and best parameters are recorded in a DataFrame named `test_scores_lg_tfidf`. This DataFrame contains the performance of each model along with the corresponding best hyperparameters.

In [None]:
# Three configurations, paired by position with `hyperparameters` below:
# saga + l2, liblinear + l1/l2, saga + l1/elasticnet.
models = [
    LogisticRegression(class_weight='balanced', random_state=42, solver='saga'),
    LogisticRegression(class_weight='balanced', random_state=42, solver='liblinear'),
    LogisticRegression(class_weight='balanced', random_state=42, solver='saga')
]


# Search spaces; keys carry the 'model__' prefix because the estimator is the pipeline
# step named 'model'.
hyperparameters = [
    {
        'model__penalty': ['l2'],
        'model__C': np.logspace(-2, 2, 50) 
    },
    {
        'model__penalty': ['l1', 'l2'],
        'model__C': np.logspace(-2, 2, 50)  
    },
    {
        'model__penalty': ['l1','elasticnet'],
        'model__C': np.logspace(-2, 2, 50),  
        'model__l1_ratio': [0.1, 0.5, 0.9]  
    }
]

# Initialize cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=19)

results = []
cv_results_dfs = []

# Loop through each model and its hyperparameters.
# The loop variable is `estimator`, not `model`, so the global `model` that the
# embedding() helper reads is not rebound to a classifier.
for estimator, params in zip(models, hyperparameters):
    pipeline = ImbPipeline([('model', estimator)])

    # Perform random search; random_state fixes which candidates are sampled so a re-run
    # evaluates the same 10 settings.
    rs = RandomizedSearchCV(pipeline, params, cv=stratified_kfold, n_iter=10, scoring='f1',
                            verbose=10, random_state=42)
    rs.fit(X_combined_tfidf, y)
    
    results.append([rs.best_score_, rs.best_params_])
    cv_results_df = pd.DataFrame(rs.cv_results_)
    cv_results_dfs.append(cv_results_df)

# Create DataFrame with results (row labels 0-2 follow the order of `models`;
# sorting reorders the rows but keeps those labels)
test_scores_lg_tfidf = pd.DataFrame(results, columns=['best_score', 'best_params'])
test_scores_lg_tfidf = test_scores_lg_tfidf.sort_values('best_score', ascending=False)

In [None]:
# Best CV F1 and parameters per configuration, best first.
test_scores_lg_tfidf

In [None]:
# [1] is a row-label lookup, not a rank: label 1 is the second entry of `models` (liblinear).
print(test_scores_lg_tfidf['best_params'][1])
print(test_scores_lg_tfidf['best_score'][1])

In [None]:
# Row label 2: the third entry of `models` (saga with the l1/elasticnet search space).
print(test_scores_lg_tfidf['best_params'][2])
print(test_scores_lg_tfidf['best_score'][2])

In [None]:
# Row label 0: the first entry of `models` (saga with the l2-only search space).
print(test_scores_lg_tfidf['best_params'][0])
print(test_scores_lg_tfidf['best_score'][0])

### KNN - sklearn  (using TF-IDF)

In [None]:
# Base estimator for the search. It was previously bound to `nn_model` while the search
# below referenced `knn_model`, which raised NameError.
knn_model = KNeighborsClassifier()

# Define the hyperparameters for randomized search
hyperparameters = {
    'n_neighbors': range(1, 31),  # Test range of neighbors from 1 to 30
    'weights': ['uniform', 'distance'],  # Test uniform and distance-based weights
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithms for computing neighbors
    'p': [1, 2]  # Parameter for distance metric (1 for Manhattan, 2 for Euclidean)
}

# Set up randomized search with cross-validation.
# `stratified_kfold` is whichever splitter the most recently run cell defined.
# random_state fixes which 40 candidates are sampled so a re-run evaluates the same settings.
rs = RandomizedSearchCV(knn_model, hyperparameters, cv=stratified_kfold, n_iter=40, scoring='f1',
                        verbose=10, random_state=42)

# Perform the randomized search for best hyperparameters
rs.fit(X_combined_tfidf, y)

# Extract the results
best_params_knn = rs.best_params_
best_score_knn = rs.best_score_

In [None]:
# Display the best parameters and F1 score
# (mean cross-validated F1 of the best sampled candidate)
print("Best parameters:", best_params_knn)
print("Best F1 Score:", best_score_knn)

### MLP - sklearn  (using TF-IDF)

In [None]:
# (step name, estimator) pairs; the step name is the prefix of the search-space keys below.
models = [
    ('MLP', MLPClassifier(random_state=42))
]
  
hyperparameters = [
    {
        'MLP__hidden_layer_sizes': [(10,), (20,), (30,), (40,)],
        'MLP__activation': ['tanh', 'relu'],
        'MLP__solver': ['sgd', 'adam'],
        'MLP__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'MLP__alpha': np.logspace(-4, 4, 100),
        'MLP__learning_rate_init': [0.001,0.01],
        'MLP__max_iter': np.arange(100, 300),
        'MLP__batch_size': np.arange(50, 150)
    }
]

# 10-fold splitter for this search (rebinds the shared `stratified_kfold` name).
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

results = []
cv_results_dfs = []

# The loop variable is `step` (a (name, estimator) tuple), not `model`, so the global
# `model` that the embedding() helper reads is not rebound.
for step, params in zip(models, hyperparameters):
    pipeline = ImbPipeline([step])
    
    # random_state fixes which 20 candidates are sampled so a re-run evaluates the same settings.
    rs = RandomizedSearchCV(pipeline, params, cv=stratified_kfold, n_iter=20, scoring='f1',
                            verbose=10, random_state=42)
    rs.fit(X_combined_tfidf, y)
    
    results.append([rs.best_score_, rs.best_params_])
    cv_results_df = pd.DataFrame(rs.cv_results_)
    cv_results_dfs.append(cv_results_df)

test_scores_mlp_tfidf = pd.DataFrame(results, columns=['best_score', 'best_params'])
test_scores_mlp_tfidf = test_scores_mlp_tfidf.sort_values('best_score', ascending=False)

In [None]:
# Best CV F1 and parameters found for the MLP.
test_scores_mlp_tfidf

In [None]:
# Row label 0 is the only configuration in `models`; print its parameters in full.
print(test_scores_mlp_tfidf['best_params'][0])
print(test_scores_mlp_tfidf['best_score'][0])

### XGBoost  (using  TF-IDF)

In [None]:
# (step name, estimator) pairs; the step name is the prefix of the search-space keys below.
models = [
    ('XGBoost', XGBClassifier(random_state=19, objective='binary:logistic', eval_metric='logloss'))
]

hyperparameters = [
    {
        'XGBoost__n_estimators': [300, 350, 400],
        'XGBoost__max_depth': [15, 20, 25],
        'XGBoost__subsample': [1.0],
        'XGBoost__min_child_weight': [1, 5, 10],
        'XGBoost__learning_rate': [0.01, 0.1, 1.0]
    }
]

# 4-fold splitter for this search (rebinds the shared `stratified_kfold` name).
stratified_kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=19)

results = []
cv_results_dfs = []

# The loop variable is `step` (a (name, estimator) tuple), not `model`, so the global
# `model` that the embedding() helper reads is not rebound.
for step, params in zip(models, hyperparameters):
    pipeline = ImbPipeline([step])
    
    # random_state fixes which 30 candidates are sampled so a re-run evaluates the same settings.
    rs = RandomizedSearchCV(pipeline, params, n_iter= 30, cv=stratified_kfold, scoring='f1',
                            verbose=10, random_state=19)
    rs.fit(X_combined_tfidf, y)
    
    results.append([rs.best_score_, rs.best_params_])
    cv_results_df = pd.DataFrame(rs.cv_results_)
    cv_results_dfs.append(cv_results_df)

test_scores_xgb = pd.DataFrame(results, columns=['best_score', 'best_params'])
test_scores_xgb = test_scores_xgb.sort_values('best_score', ascending=False)

In [None]:
# Best CV F1 and parameters found for XGBoost.
test_scores_xgb

In [None]:
# Row label 0 is the only configuration in `models`; print its parameters in full.
print(test_scores_xgb['best_params'][0])
print(test_scores_xgb['best_score'][0])

## Model testing with LaBSE embeddings

### Logistic Regression - sklearn  (using labse embeddings)

This approach involves performing hyperparameter tuning using a randomized search with cross-validation for three different logistic regression models. Each logistic regression model has different hyperparameters defined in the hyperparameters list. Here's an explanation of each part:

**Models:** Three logistic regression configurations are defined: two use the `saga` solver (each paired with a different penalty search space) and one uses `liblinear`. This lets us compare how the solver and penalty combinations perform.

**Hyperparameters:** For each logistic regression model, a dictionary of hyperparameters is defined. These hyperparameters will be tuned during the randomized search. Each dictionary contains different combinations of hyperparameters to explore.

**Cross-validation:** Stratified k-fold cross-validation is used to evaluate the performance of each model and hyperparameter combination, as in the rest of the project. It ensures that each fold preserves the percentage of samples for each class.

**Randomized Search:** For each model, a RandomizedSearchCV object is created. It performs a randomized search over the hyperparameter space defined in the corresponding hyperparameters dictionary. The scoring parameter is set to 'f1', indicating that the F1 score will be used as the evaluation metric.

**Results:** After the randomized search is complete for each model, the best score and best parameters are recorded in a DataFrame named `test_scores_lg1`. This DataFrame contains the performance of each model along with the corresponding best hyperparameters.

In [None]:
# Three configurations, paired by position with `hyperparameters` below:
# saga + l2, liblinear + l1/l2, saga + l1/elasticnet.
models = [
    LogisticRegression(class_weight='balanced', random_state=42, solver='saga'),
    LogisticRegression(class_weight='balanced', random_state=42, solver='liblinear'),
    LogisticRegression(class_weight='balanced', random_state=42, solver='saga')
]


# Search spaces; keys carry the 'model__' prefix because the estimator is the pipeline
# step named 'model'.
hyperparameters = [
    {
        'model__penalty': ['l2'],
        'model__C': np.logspace(-2, 2, 50) 
    },
    {
        'model__penalty': ['l1', 'l2'],
        'model__C': np.logspace(-2, 2, 50)  
    },
    {
        'model__penalty': ['l1','elasticnet'],
        'model__C': np.logspace(-2, 2, 50),  
        'model__l1_ratio': [0.1, 0.5, 0.9]  
    }
]

# Initialize cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=19)

results = []
cv_results_dfs = []

# Loop through each model and its hyperparameters.
# The loop variable is `estimator`, not `model`, so the global `model` that the
# embedding() helper reads is not rebound to a classifier.
for estimator, params in zip(models, hyperparameters):
    pipeline = ImbPipeline([('model', estimator)])

    # Perform random search; random_state fixes which candidates are sampled so a re-run
    # evaluates the same 10 settings.
    rs = RandomizedSearchCV(pipeline, params, cv=stratified_kfold, n_iter=10, scoring='f1',
                            verbose=10, random_state=42)
    rs.fit(combined_embeddings_labse, y_array)
    
    results.append([rs.best_score_, rs.best_params_])
    cv_results_df = pd.DataFrame(rs.cv_results_)
    cv_results_dfs.append(cv_results_df)

# Create DataFrame with results (row labels 0-2 follow the order of `models`;
# sorting reorders the rows but keeps those labels)
test_scores_lg1 = pd.DataFrame(results, columns=['best_score', 'best_params'])
test_scores_lg1 = test_scores_lg1.sort_values('best_score', ascending=False)

In [None]:
# Display the logistic-regression search results, ranked by best F1 score
test_scores_lg1

In [None]:
# Label lookup: row 1 is the second estimator in `models`, whatever the sort order
print(test_scores_lg1.loc[1, 'best_params'])
print(test_scores_lg1.loc[1, 'best_score'])

In [None]:
# Label lookup: row 2 is the third estimator in `models`, whatever the sort order
print(test_scores_lg1.loc[2, 'best_params'])
print(test_scores_lg1.loc[2, 'best_score'])

In [None]:
# Label lookup: row 0 is the first estimator in `models`, whatever the sort order
print(test_scores_lg1.loc[0, 'best_params'])
print(test_scores_lg1.loc[0, 'best_score'])

### KNN - sklearn  (using labse embeddings)

In [None]:
# Base estimator handed to the randomized search
nn_model = KNeighborsClassifier()

# Define the hyperparameters for randomized search
hyperparameters = {
    'n_neighbors': range(1, 31),  # Test range of neighbors from 1 to 30
    'weights': ['uniform', 'distance'],  # Test uniform and distance-based weights
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithms for computing neighbors
    'p': [1, 2]  # Parameter for distance metric (1 for Manhattan, 2 for Euclidean)
}

# Set up randomized search with cross-validation.
# The estimator passed in must be the one defined above (`nn_model`); the
# random_state pins the sampled candidates so the search is reproducible.
rs = RandomizedSearchCV(nn_model, hyperparameters, cv=stratified_kfold, n_iter=40,
                        scoring='f1', verbose=10, random_state=42)

# Perform the randomized search for best hyperparameters
rs.fit(combined_embeddings_labse, y_array)

# Extract the results
best_params_knn = rs.best_params_
best_score_knn = rs.best_score_

In [None]:
# Display the best KNN parameters and the mean cross-validated F1 score they achieved
print("Best parameters:", best_params_knn)
print("Best F1 Score:", best_score_knn)

### MLP - sklearn  (using labse embeddings)

**Approach 2:**

**Model:** Utilizes a scikit-learn MLPClassifier.\
**Hyperparameter Tuning:** Performs hyperparameter tuning using Randomized Search Cross Validation (RandomizedSearchCV).\
**Hyperparameters:** Searches through a predefined set of hyperparameters for the MLPClassifier using a randomized search strategy.\
**Evaluation:** Collects and analyzes the results of hyperparameter tuning, including the best score and best parameters found during the search.

In [None]:
# (step name, estimator) pairs; the step name is the prefix of the search keys below
models = [
    ('MLP', MLPClassifier(random_state=42))
]

# Search space for the MLP, keyed by '<step name>__<parameter>'
hyperparameters = [
    {
        'MLP__hidden_layer_sizes': [(10,), (20,), (30,), (40,)],
        'MLP__activation': ['tanh', 'relu'],
        'MLP__solver': ['sgd', 'adam'],
        'MLP__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'MLP__alpha': np.logspace(-4, 4, 100),
        'MLP__learning_rate_init': [0.001, 0.01],
        'MLP__max_iter': np.arange(100, 300),
        'MLP__batch_size': np.arange(50, 150)
    }
]

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []
cv_results_dfs = []

for model, params in zip(models, hyperparameters):
    pipeline = ImbPipeline([model])

    # random_state pins the sampled candidates so the search is reproducible
    rs = RandomizedSearchCV(pipeline, params, cv=stratified_kfold, n_iter=10,
                            scoring='f1', verbose=10, random_state=42)
    rs.fit(combined_embeddings_labse, y_array)

    results.append([rs.best_score_, rs.best_params_])
    cv_results_df = pd.DataFrame(rs.cv_results_)
    cv_results_dfs.append(cv_results_df)

# Best score / parameters per searched model, best first
test_scores_mlp = pd.DataFrame(results, columns=['best_score', 'best_params'])
test_scores_mlp = test_scores_mlp.sort_values('best_score', ascending=False)

In [None]:
# Display the MLP search results, ranked by best F1 score
test_scores_mlp

In [None]:
# Label lookup: row 0 is the only (first) entry of `models`
print(test_scores_mlp.loc[0, 'best_params'])
print(test_scores_mlp.loc[0, 'best_score'])

### XGBoost  (using LaBSE embeddings)

In [None]:
# Single (step name, estimator) pair; the step name prefixes the grid keys below
models = [
    (
        'XGBoost',
        XGBClassifier(random_state=19, objective='binary:logistic', eval_metric='logloss'),
    )
]

# Exhaustive grid, keyed by '<step name>__<parameter>'
hyperparameters = [
    {
        'XGBoost__n_estimators': [360, 370, 380],
        'XGBoost__max_depth': [15, 17, 20],
        'XGBoost__subsample': [1.0],
        'XGBoost__min_child_weight': [1, 5, 10],
        'XGBoost__learning_rate': [0.01, 0.1, 1.0],
    }
]

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=19)

results, cv_results_dfs = [], []

for model, params in zip(models, hyperparameters):
    # Wrap the named step in a pipeline so the prefixed grid keys resolve
    pipeline = ImbPipeline([model])

    rs = GridSearchCV(pipeline, params, cv=stratified_kfold, scoring='f1', verbose=10)
    rs.fit(combined_embeddings_labse, y_array)

    # Keep the winning configuration and the full per-candidate table
    results.append([rs.best_score_, rs.best_params_])
    cv_results_df = pd.DataFrame(rs.cv_results_)
    cv_results_dfs.append(cv_results_df)

# Best score / parameters per searched model, best first
test_scores_xgb = (
    pd.DataFrame(results, columns=['best_score', 'best_params'])
    .sort_values('best_score', ascending=False)
)

In [None]:
# Display the XGBoost grid-search results, ranked by best F1 score
test_scores_xgb

In [None]:
# Label lookup: row 0 is the only (first) entry of `models`
print(test_scores_xgb.loc[0, 'best_params'])
print(test_scores_xgb.loc[0, 'best_score'])

## DistilBERT - Transformer (for model testing) 

In [None]:
# Give every split a fresh 0..n-1 index so features and labels line up row by row
y_train, y_val = (labels.reset_index(drop=True) for labels in (y_train, y_val))
X_train, X_val = (features.reset_index(drop=True) for features in (X_train, X_val))

In [None]:
def prepare_data(X, y):
    """Build the text/label frame fed to the transformer dataset.

    Args:
        X: DataFrame with string columns 'cleaned_comments', 'cleaned_host'
            and 'cleaned_description'.
        y: labels, aligned with ``X`` by index.

    Returns:
        DataFrame with a 'text' column (the three fields joined by ' [SEP] ')
        and a 'label' column.
    """
    separator = ' [SEP] '
    joined_text = (
        X['cleaned_comments'] + separator + X['cleaned_host'] + separator + X['cleaned_description']
    )
    return pd.DataFrame({'text': joined_text, 'label': y})

In [None]:
# Text/label frames for the training and validation splits
train_data, val_data = prepare_data(X_train, y_train), prepare_data(X_val, y_val)

In [None]:
class BinaryClassificationDataset(Dataset):
    """Torch dataset that tokenizes one text row at a time for sequence classification.

    Args:
        dataframe: frame with a 'text' column and a 'label' column.
        tokenizer: object exposing ``encode_plus`` that returns 'input_ids' and
            'attention_mask' tensors when called with ``return_tensors='pt'``.
        max_len: length every sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Positional access: the sampler hands out positions 0..len-1, which only
        # coincide with index labels when the frame has a default RangeIndex.
        text = str(self.data['text'].iloc[index])
        label = self.data['label'].iloc[index]

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension added by return_tensors='pt'
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

max_length = 512  # every example is padded / truncated to this many tokens

# Wrap the prepared text/label frames so the Trainer receives tokenized tensors.
# Without this the datasets referenced below are undefined in this section.
train_dataset = BinaryClassificationDataset(train_data, tokenizer, max_length)
val_dataset = BinaryClassificationDataset(val_data, tokenizer, max_length)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.001,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",  # evaluate and checkpoint once per epoch
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,  # best checkpoint is chosen by the metric below
    metric_for_best_model='f1',
    greater_is_better=True,
)

# NOTE(review): compute_metrics is not defined in this section; it is assumed to come
# from an earlier cell and to report an 'f1' entry (required by metric_for_best_model) - confirm.

# Initialize Trainer
trainer = Trainer(
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate on validation set
results = trainer.evaluate()

print("Validation Results:")
for key, value in results.items():
    print(f"{key}: {value}")

In [None]:
# Run the fine-tuned model over the validation set, then split out the
# true labels and the predicted class (index of the largest logit)
val_predictions = trainer.predict(val_dataset)
val_y_true = val_predictions.label_ids
val_y_pred = np.argmax(val_predictions.predictions, axis=-1)

In [None]:
# Per-class precision / recall / F1 of DistilBERT on the validation set
print(classification_report(val_y_true, val_y_pred))

In [None]:
# Compute ROC curve for the transformer.
# With two output logits, P(class 1) is a monotonic function of the margin
# (logit_1 - logit_0), so the margin is the score that matches the classifier's
# decisions; the class-1 logit alone does not rank examples the same way.
transformer_logits = val_predictions.predictions
transformer_scores = transformer_logits[:, 1] - transformer_logits[:, 0]
fpr_transformer, tpr_transformer, _ = roc_curve(val_y_true, transformer_scores)
roc_auc_transformer = auc(fpr_transformer, tpr_transformer)

In [None]:
# Validation metrics for DistilBERT
f1_distilbert = f1_score(val_y_true, val_y_pred)
accuracy_distilbert = accuracy_score(val_y_true, val_y_pred)
# ROC-AUC is a ranking metric, so it is scored on the continuous logit margin
# (class 1 minus class 0) rather than on the already-thresholded labels.
roc_distilbert = roc_auc_score(
    val_y_true,
    val_predictions.predictions[:, 1] - val_predictions.predictions[:, 0],
)
precision_distilbert = precision_score(val_y_true, val_y_pred)
recall_distilbert = recall_score(val_y_true, val_y_pred)

In [None]:
# One TF-IDF vocabulary per text field, learned from the training split only
tfidf_vectorizer1 = TfidfVectorizer()
tfidf_vectorizer2 = TfidfVectorizer()
tfidf_vectorizer3 = TfidfVectorizer()

# Fit each vectorizer and vectorize the training data in a single pass
X1_tfidf_train = tfidf_vectorizer1.fit_transform(X_train['cleaned_comments'])
X2_tfidf_train = tfidf_vectorizer2.fit_transform(X_train['cleaned_host'])
X3_tfidf_train = tfidf_vectorizer3.fit_transform(X_train['cleaned_description'])

# Validation data goes through the already-fitted vectorizers
X1_tfidf_val = tfidf_vectorizer1.transform(X_val['cleaned_comments'])
X2_tfidf_val = tfidf_vectorizer2.transform(X_val['cleaned_host'])
X3_tfidf_val = tfidf_vectorizer3.transform(X_val['cleaned_description'])

# Place the three feature blocks side by side, in the same order for both splits
X_combined_tfidf_train = hstack([X1_tfidf_train, X2_tfidf_train, X3_tfidf_train])
X_combined_tfidf_val = hstack([X1_tfidf_val, X2_tfidf_val, X3_tfidf_val])

# Both shapes must report the same number of columns
# (the original note expected (4686, 162797) for the training matrix)
print(X_combined_tfidf_train.shape)
print(X_combined_tfidf_val.shape)

## Classification Report

In [None]:
from sklearn.metrics import confusion_matrix
def final_metrics(y_data, y_pred):
    """Print a banner followed by the classification report and confusion matrix.

    Args:
        y_data: true labels.
        y_pred: predicted labels.

    Returns:
        None; everything is written to stdout.
    """
    banner_lines = (
        '___________________________________________________________________________________________________________',
        '                              Classification Report & Confusion Matrix                                     ',
        '-----------------------------------------------------------------------------------------------------------',
    )
    for banner_line in banner_lines:
        print(banner_line)
    for metric in (classification_report, confusion_matrix):
        print(metric(y_data, y_pred))

### Logistic Regression

In [None]:
# Logistic regression with fixed hyperparameters, trained on the combined TF-IDF features
lg_model = LogisticRegression(
    C=8.68511373751352,
    penalty='l2',
    solver='saga',
    random_state=42,
).fit(X_combined_tfidf_train, y_train)

# Hard class predictions on both splits
y_train_pred, y_val_pred = (
    lg_model.predict(features) for features in (X_combined_tfidf_train, X_combined_tfidf_val)
)

train_f1_score = f1_score(y_train, y_train_pred)
test_f1_score = f1_score(y_val, y_val_pred)  # measured on the validation split

print("Training F1 Score:", train_f1_score)
print("Testing F1 Score:", test_f1_score)

In [None]:
# Classification report for training data
print("Classification Report for Training Data:")
# final_metrics prints its own output and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line).
final_metrics(y_train, y_train_pred)

In [None]:
# Classification report for validation data
print("Classification Report for Validation Data:")
# final_metrics prints its own output and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line).
final_metrics(y_val, y_val_pred)

In [None]:
# Validation metrics for logistic regression
f1_logit = f1_score(y_val, y_val_pred)
accuracy_logit = accuracy_score(y_val, y_val_pred)
# ROC-AUC is a ranking metric: score it with the positive-class probability,
# not with the already-thresholded labels.
roc_logit = roc_auc_score(y_val, lg_model.predict_proba(X_combined_tfidf_val)[:, 1])
precision_logit = precision_score(y_val, y_val_pred)
recall_logit = recall_score(y_val, y_val_pred)

### KNN Classifier

In [None]:
# KNN with fixed hyperparameters, trained on the combined TF-IDF features
knn_model = KNeighborsClassifier(
    weights='distance',
    p=2,
    n_neighbors=25,
    algorithm='auto',
).fit(X_combined_tfidf_train, y_train)

# Hard class predictions on both splits
y_train_pred, y_val_pred = (
    knn_model.predict(features) for features in (X_combined_tfidf_train, X_combined_tfidf_val)
)

train_f1_score = f1_score(y_train, y_train_pred)
test_f1_score = f1_score(y_val, y_val_pred)  # measured on the validation split

print("Training F1 Score:", train_f1_score)
print("Testing F1 Score:", test_f1_score)

In [None]:
# Classification report for training data
print("Classification Report for Training Data:")
# final_metrics prints its own output and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line).
final_metrics(y_train, y_train_pred)

In [None]:
# Classification report for validation data
print("Classification Report for Validation Data:")
# final_metrics prints its own output and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line).
final_metrics(y_val, y_val_pred)

In [None]:
# Validation metrics for KNN
f1_knn = f1_score(y_val, y_val_pred)
accuracy_knn = accuracy_score(y_val, y_val_pred)
# ROC-AUC is a ranking metric: score it with the positive-class probability,
# not with the already-thresholded labels.
roc_knn = roc_auc_score(y_val, knn_model.predict_proba(X_combined_tfidf_val)[:, 1])
precision_knn = precision_score(y_val, y_val_pred)
recall_knn = recall_score(y_val, y_val_pred)

### MLP Classifier

In [None]:
# MLP with fixed hyperparameters, trained on the combined TF-IDF features
mlp_model = MLPClassifier(
    max_iter=250,
    learning_rate_init=0.01,
    solver='sgd',
    learning_rate='adaptive',
    hidden_layer_sizes=(30,),
    batch_size=143,
    alpha=0.3593813663804629,
    activation='tanh',
    random_state=42,
).fit(X_combined_tfidf_train, y_train)

# Hard class predictions on both splits
y_train_pred, y_val_pred = (
    mlp_model.predict(features) for features in (X_combined_tfidf_train, X_combined_tfidf_val)
)

train_f1_score = f1_score(y_train, y_train_pred)
test_f1_score = f1_score(y_val, y_val_pred)  # measured on the validation split

print("Training F1 Score:", train_f1_score)
print("Testing F1 Score:", test_f1_score)

In [None]:
# Classification report for training data
print("Classification Report for Training Data:")
# final_metrics prints its own output and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line).
final_metrics(y_train, y_train_pred)

In [None]:
# Classification report for validation data
print("Classification Report for Validation Data:")
# final_metrics prints its own output and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line).
final_metrics(y_val, y_val_pred)

In [None]:
# Validation metrics for the MLP
f1_mlp = f1_score(y_val, y_val_pred)
accuracy_mlp = accuracy_score(y_val, y_val_pred)
# ROC-AUC is a ranking metric: score it with the positive-class probability,
# not with the already-thresholded labels.
roc_mlp = roc_auc_score(y_val, mlp_model.predict_proba(X_combined_tfidf_val)[:, 1])
precision_mlp = precision_score(y_val, y_val_pred)
recall_mlp = recall_score(y_val, y_val_pred)

### XGBoost Classifier

In [None]:
# XGBoost with fixed hyperparameters, trained on the combined TF-IDF features
xgboost_model = XGBClassifier(
    random_state=19,
    objective='binary:logistic',
    eval_metric='logloss',
    subsample=1.0,
    n_estimators=350,
    min_child_weight=5,
    max_depth=15,
    learning_rate=0.01,
)
xgboost_model.fit(X_combined_tfidf_train, y_train)

# Hard class predictions on both splits
y_train_pred, y_val_pred = (
    xgboost_model.predict(features) for features in (X_combined_tfidf_train, X_combined_tfidf_val)
)

train_f1_score = f1_score(y_train, y_train_pred)
test_f1_score = f1_score(y_val, y_val_pred)  # measured on the validation split

print("Training F1 Score:", train_f1_score)
print("Testing F1 Score:", test_f1_score)

In [None]:
# Classification report for training data
print("Classification Report for Training Data:")
# final_metrics prints its own output and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line).
final_metrics(y_train, y_train_pred)

In [None]:
# Classification report for validation data
print("Classification Report for Validation Data:")
# final_metrics prints its own output and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line).
final_metrics(y_val, y_val_pred)

In [None]:
# Validation metrics for XGBoost
f1_xgboost = f1_score(y_val, y_val_pred)
accuracy_xgboost = accuracy_score(y_val, y_val_pred)
# ROC-AUC is a ranking metric: score it with the positive-class probability,
# not with the already-thresholded labels.
roc_xgboost = roc_auc_score(y_val, xgboost_model.predict_proba(X_combined_tfidf_val)[:, 1])
precision_xgboost = precision_score(y_val, y_val_pred)
recall_xgboost = recall_score(y_val, y_val_pred)

## ROC curve

In [None]:
# Legend label -> estimator already fitted on the training split in the sections above.
# Scoring these on the validation set keeps the curves comparable with the
# transformer curve and with the results table; re-fitting fresh copies with
# cross-validation on the validation split alone would evaluate different models.
models = [
    ('Logistic Regression', lg_model),
    ('MLP Classifier', mlp_model),
    ('XGBoost', xgboost_model),
    ('KNN Classifier', knn_model),
]

plt.figure(figsize=(10, 6))

plt.plot(fpr_transformer, tpr_transformer, lw=2, label='Transformer ROC (area = %0.2f)' % roc_auc_transformer)

# `estimator` (not `model`) so the DistilBERT `model` variable is not overwritten
for name, estimator in models:
    # Column 1 of predict_proba is the positive-class probability
    y_probas = estimator.predict_proba(X_combined_tfidf_val)
    fpr, tpr, thresholds = roc_curve(y_val, y_probas[:, 1])
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label='%s ROC (area = %0.2f)' % (name, roc_auc))

# Chance-level diagonal for reference
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC')
plt.legend(loc="lower right")
plt.show()

### Table with the models' results

In [None]:
# One row per model; each list below follows the same model order as 'Model'
validation_scores = {
    'F1 Score - validation': [f1_logit, f1_knn, f1_mlp, f1_xgboost, f1_distilbert],
    'Accuracy Score - validation': [accuracy_logit, accuracy_knn, accuracy_mlp, accuracy_xgboost, accuracy_distilbert],
    'ROC-AUC Score - validation': [roc_logit, roc_knn, roc_mlp, roc_xgboost, roc_distilbert],
    'Precision Score - validation': [precision_logit, precision_knn, precision_mlp, precision_xgboost, precision_distilbert],
    'Recall Score - validation': [recall_logit, recall_knn, recall_mlp, recall_xgboost, recall_distilbert],
}
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'KNN Classifier', 'MLP Classifier', 'XGBoost Classifier', 'DistilBERT'],
    **validation_scores,
})

# Show the table with the best validation F1 first
models.sort_values(by='F1 Score - validation', ascending=False)

In [None]:
# Transform the test data with the vectorizers fitted on the training split
# NOTE(review): X1_test / X2_test / X3_test are not defined in this section; presumably the
# test-set comments, host and description texts in that order - confirm.
X1_tfidf_test = tfidf_vectorizer1.transform(X1_test)
X2_tfidf_test = tfidf_vectorizer2.transform(X2_test)
X3_tfidf_test = tfidf_vectorizer3.transform(X3_test)

# Combine TF-IDF features for the test data (same block order as the training matrix)
X_combined_tfidf_test = hstack([X1_tfidf_test, X2_tfidf_test, X3_tfidf_test])

# Ensure that the number of features match
print(X_combined_tfidf_test.shape)  # column count should equal that of X_combined_tfidf_train

In [None]:
# Predict on the TEST feature matrix; predicting on the validation matrix here
# would export validation predictions under the test file name.
test_pred_array = mlp_model.predict(X_combined_tfidf_test)

In [None]:
# Wrap the predicted labels in a Series and write them to disk without the row index
test_pred = pd.Series(test_pred_array)
test_pred.to_csv('Test_predictions.csv', index=False)

In [None]:
# Number of exported predictions per class
test_pred.value_counts()