In [4]:
#Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


#Functionalities
from collections import Counter
import sys, os
import warnings
warnings.filterwarnings('ignore')

#NLP
import string
import re
import nltk

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV

#Metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_auc_score, roc_curve

# Custom Transformer
sys.path.append(os.path.abspath('..'))
from src.preprocess.preprocessor import TextPreprocessor
from src.models.train import train_model, optimize_model


# Models
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from src.models.metrics import evaluate_model 

import time

# Optimization
We once again load our data in order to further optimize our models (LightGBM and SGD).

The decision for the best combination of hyperparameters will be taken via GridSearchCV (faster than RandomizedSearchCV)
Optimization is performed with the function `optimize_model`, which also saves the trained model into the `models` folder.

In [5]:
# Read the cleaned dataset from the processed-data folder
df = pd.read_csv('../data/processed/fake_or_real_news_clean.csv')

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed so the split is repeatable
texts = df['text_clean']
labels = df['label']
xtrain, xtest, ytrain, ytest = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=0,
    stratify=labels,
)

## LightGBM
We tune the following hyperparameters:
- Ngram_range: whether the vectorizer uses unigrams only, unigrams and bigrams, or n-grams up to trigrams
- Min_df & max_df: whether or not we drop words that appear in too few or too many documents before vectorization
- Learning rate
- Number of estimators (more estimators means a more complex model)
- Importance type: how feature importances are computed (number of splits vs. total gain); this affects interpretation only, not how the trees are grown
- Number of leaves: the maximum number of leaves allowed in each decision tree

In [4]:
# Hyperparameter grid for the TF-IDF + LightGBM search.
# Keys follow scikit-learn's "<step>__<param>" convention; the steps are
# presumably named "vectorizer" and "clf" inside optimize_model — confirm there.
param_grid = {
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    # min_df / max_df must be floats to be read as document *proportions*.
    # An int is an absolute document count: max_df=1 would discard every term
    # that occurs in more than one document (and clashes with min_df=0.01),
    # while an integer min_df of 0 may be rejected by parameter validation in
    # newer scikit-learn releases.
    "vectorizer__min_df": [0.01, 0.0],
    "vectorizer__max_df": [0.99, 1.0],
    'clf__n_estimators': [200, 300, 400],
    # NOTE(review): importance_type only selects how feature_importances_ is
    # reported; it does not change the fitted trees, so searching over it
    # doubles the run time without affecting the score. Consider pinning it.
    'clf__importance_type': ['split', 'gain'],
    'clf__num_leaves': [7, 14, 21, 28, 31, 50],
    'clf__learning_rate': [0.1, 0.03, 0.003],
}

# 3-fold cross-validated search scored on accuracy; results are reported on the
# held-out test split.
optimize_model(
    clf=LGBMClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    xtrain=xtrain,
    xtest=xtest,
    ytrain=ytrain,
    ytest=ytest,
)

Execution time: 22702.06s
ROC-AUC score of the model: 0.9792119268336176
Accuracy of the model: 0.9318541996830428

Classification report: 
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       631
           1       0.95      0.92      0.93       631

    accuracy                           0.93      1262
   macro avg       0.93      0.93      0.93      1262
weighted avg       0.93      0.93      0.93      1262


Confusion matrix: 
[[598  33]
 [ 53 578]]

------------------------------------------------------


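## SGD
We tune the following hyperparameters:
- Ngram_range
- Vector normalization: whether the TF-IDF vectors are normalized with the l1 or the l2 norm
- Loss function: hinge or log

Min_df & max_df are fixed at 0.0 and 1.0 (no terms are filtered by document frequency), and English stop words are always removed.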

In [6]:
# Hyperparameter grid for the TF-IDF + SGD search.
# Single-valued entries pin a setting instead of searching over it.
param_grid = {
    "vectorizer__min_df": [0.0],
    "vectorizer__max_df": [1.0],
    "vectorizer__stop_words": ['english'],
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vectorizer__norm": ["l1", "l2"],
    # NOTE(review): newer scikit-learn releases rename "log" to "log_loss";
    # update this value when upgrading the environment.
    "clf__loss": ["hinge", "log"],
}

# SGD shuffles the training data, so seed it (same seed as the train/test
# split) to make the search reproducible. The estimator is passed by keyword
# to match the LightGBM call.
optimize_model(
    clf=SGDClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    xtrain=xtrain,
    xtest=xtest,
    ytrain=ytrain,
    ytest=ytest,
)

Predict_Proba not available
Execution time: 206.86s
Accuracy of the model: 0.9429477020602218

Classification report: 
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       631
           1       0.94      0.94      0.94       631

    accuracy                           0.94      1262
   macro avg       0.94      0.94      0.94      1262
weighted avg       0.94      0.94      0.94      1262


Confusion matrix: 
[[596  35]
 [ 37 594]]

------------------------------------------------------


# Conclusions
The best-performing algorithm when it comes to predictions is SGD (test accuracy of about 0.943, versus about 0.932 for LightGBM). However, for the sake of entertainment and more interesting interpretability, we are choosing the LightGBM model to move past this stage.
Again, SGD can only do so much for us when it comes to interpretability, because it does not let us inspect the probabilities behind its predictions. (That is, `predict_proba` is not available for the selected SGD model.)