In [2]:
#Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


#Functionalities
from collections import Counter
import sys, os
import warnings
warnings.filterwarnings('ignore')

#NLP
import string
import re
import nltk


# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import train_test_split

#Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Custom Transformer
#sys.path.append(os.path.abspath('..'))
#sys.path.append(os.path.abspath('../src'))

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier

import time

from catboost import CatBoostClassifier

sys.path.append(os.path.abspath('..'))
from src.preprocess.preprocessor import TextPreprocessor
from src.models.train import train_model, optimize_model
from src.models.metrics import evaluate_model

# Modelling

## Read data / Train-test split
The test set is 20% of the dataset. The split is stratified on the label so that the train and test sets keep the same target-class proportions.

In [3]:
# Read the cleaned news dataset from the processed-data folder
df = pd.read_csv('../data/processed/fake_or_real_news_clean.csv')

# Hold out 20% of the rows for testing. The split is stratified on the label
# so both partitions keep the same class proportions, and the seed is fixed
# so the same rows land in each partition on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['text_clean'],
    df['label'],
    test_size=0.2,
    random_state=0,
    stratify=df['label'],
)

## Model options / Baseline
Because the two classes in our dataset are almost perfectly balanced, we take 50% accuracy and 50% recall (roughly what random guessing would achieve) as our baseline. Any model that scores below these figures will be discarded.

Models chosen are:
- LightGBM: Fast Gradient Boosting
- CatBoostClassifier: Similar to LightGBM and XGBoost
- RandomForestClassifier: Most common Bagging algorithm
- XGBoost
- Stochastic Gradient Descent

The pipeline that we are feeding the data through consists of:
- Text preprocessing & cleaning
- Tf-Idf vectorization
- Model-fitting

In [5]:
# Candidate models to compare.
# SGDClassifier and RandomForestClassifier involve randomness while fitting,
# so they are given an explicit seed (the same value used for the train/test
# split) to make their reported scores reproducible across runs.
# NOTE(review): the boosting libraries are left on their own default seeds —
# confirm they are deterministic in the installed versions.
classifiers = [
    LGBMClassifier(),
    CatBoostClassifier(logging_level='Silent'),  # suppress per-iteration training logs
    SGDClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    XGBClassifier(),
]

We loop through the different classifiers, applying the `train_model` function to each one. It automatically fits the model, displays a basic table of results and saves the model to the specified path.

In [6]:
# Start from an empty list so that re-running this cell does not keep
# rows from a previous run.
results = []

# Fit and evaluate every candidate on the same split. The shared `results`
# list is handed to train_model through its `list` keyword; presumably it
# appends one summary row per classifier — verify in src/models/train.py.
for classifier in classifiers:
    train_model(
        classifier,
        xtrain=xtrain,
        ytrain=ytrain,
        xtest=xtest,
        ytest=ytest,
        list=results,
    )

Classifier: LGBMClassifier()
Execution time: 40.28s
ROC-AUC score of the model: 0.9763387172525686
Accuracy of the model: 0.919175911251981

Classification report: 
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       631
           1       0.93      0.90      0.92       631

    accuracy                           0.92      1262
   macro avg       0.92      0.92      0.92      1262
weighted avg       0.92      0.92      0.92      1262


Confusion matrix: 
[[590  41]
 [ 61 570]]

------------------------------------------------------
Classifier: <catboost.core.CatBoostClassifier object at 0x00000235EEA69A90>
Execution time: 438.39s
ROC-AUC score of the model: 0.975964496773918
Accuracy of the model: 0.9160063391442155

Classification report: 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       631
           1       0.93      0.90      0.91       631

    accuracy                    

### Comparison
Overall, SGD seems to have beaten all the other algorithms. However, LightGBM has also performed quite well once runtime is taken into account.
One drawback of SGDClassifier is that, with its default hinge loss, it behaves like a linear SVM and therefore does not provide per-sample class probabilities, which leaves us no room to optimize the decision threshold.
Despite this drawback, SGD and LightGBM are the two models we will be optimizing in the next notebook.

In [7]:
# Summarise the collected results as a table: each entry of `results`
# becomes one row under the three column labels below.
pd.DataFrame(
    results,
    columns=['Model', 'Accuracy', 'Execution time'],
)

Unnamed: 0,Model,Accuracy,Execution time
0,LGBMClassifier,92.0%,40
1,CatBoostClassifier,92.0%,438
2,SGDClassifier,94.0%,22
3,RandomForestClassifier,90.0%,33
4,XGBClassifier,90.0%,35
