# Experiment One: Classify with `dataset_style.json`

In [1]:
#imports
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/home/elenaruiz/Documents/FNC')
import pandas as pd
import numpy as np
from src.utils import io
from src.fake_news_detector.core.classificators import LDA, QDA

## Import style dataset

In [2]:
# Read the style-feature dataset and turn its 'articles' entry into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data dir.
articles = io.read_json_file(
    '/home/elenaruiz/Documents/FNC/src/data/dataset_style.json'
)
df = pd.DataFrame(articles['articles'])

## Variables to classify

In [3]:
# Preview the first five rows to see which feature columns are available.
df.head(n=5)

Unnamed: 0,fake,mean_character_per_word,mean_noun_phrases,mean_words_per_sentence,n_quotes,n_sentences,n_words,pert_different_words,pert_total_adj,pert_total_conj_prep,pert_total_negative_words,pert_total_nouns,pert_total_positive_words,pert_total_verbs,sentiment,title_n_words,title_pert_total_conj_prep,title_pert_total_negative_words,title_pert_total_positive_words,title_sentiment
0,True,1.0,,4.521569,,7,255,0.486275,0.070588,0.011765,0.039216,0.356863,0.003922,0.160784,-0.417143,10,0.011765,0.1,0.0,-0.5719
1,True,1.0,,4.840456,,4,351,0.444444,0.082621,0.031339,0.019943,0.321937,0.014245,0.133903,-0.1177,12,0.031339,0.0,0.0,-0.1027
2,True,1.0,,4.285106,,11,235,0.553191,0.06383,0.017021,0.025532,0.310638,0.029787,0.148936,0.016064,14,0.017021,0.071429,0.0,-0.4588
3,True,1.0,,4.12,,5,125,0.6,0.088,0.04,0.04,0.272,0.048,0.24,-0.14866,12,0.04,0.083333,0.083333,-0.6202
4,True,1.0,,4.536232,,4,138,0.594203,0.072464,0.028986,0.036232,0.311594,0.028986,0.152174,-0.02335,12,0.028986,0.083333,0.0,-0.4767


Variables with the highest correlation:
    - mean_words_per_sentence
    - pert_total_nouns
    - pert_total_verbs
    - n_words
    - pert_different_words
    - title_n_words
    - title_sentiment

In [4]:
# Keep only the candidate predictors plus the target column.
# The frame is built with .assign so that df_two is a new, independent DataFrame:
# writing a column into a column-subset of df is the pattern that raises pandas'
# SettingWithCopyWarning (hidden in this notebook by warnings.filterwarnings('ignore')).
# NOTE(review): mean_words_per_sentence is listed among the correlated variables
# but is not selected here - confirm whether that is intentional.
df_two = df[[
    'pert_total_nouns',
    'pert_total_verbs',
    'n_words',
    'pert_different_words',
    'title_n_words',
    'title_sentiment',
    'fake',
]].assign(fake=lambda frame: frame['fake'] * 1)  # True/False -> 1/0


In [5]:
# Check the reduced frame: selected features plus the 0/1 target.
df_two.head(n=5)

Unnamed: 0,pert_total_nouns,pert_total_verbs,n_words,pert_different_words,title_n_words,title_sentiment,fake
0,0.356863,0.160784,255,0.486275,10,-0.5719,1
1,0.321937,0.133903,351,0.444444,12,-0.1027,1
2,0.310638,0.148936,235,0.553191,14,-0.4588,1
3,0.272,0.24,125,0.6,12,-0.6202,1
4,0.311594,0.152174,138,0.594203,12,-0.4767,1


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
df_train, df_test = train_test_split(df_two, random_state=42, test_size=0.2)
df_train.head(n=5)

Unnamed: 0,pert_total_nouns,pert_total_verbs,n_words,pert_different_words,title_n_words,title_sentiment,fake
89,0.318584,0.134956,452,0.486726,6,0.0,0
26,0.290244,0.156098,410,0.429268,8,-0.7845,1
42,0.266949,0.182203,236,0.495763,10,-0.1027,1
70,0.379518,0.13253,166,0.596386,13,-0.128,0
15,0.310056,0.153631,358,0.502793,9,-0.6908,1


In [7]:
def do_test(model, df_train, df_test, labels_x, label_tag, scaler, type_m):
    """Run one train/test evaluation of ``model`` on the chosen feature columns.

    Prints the feature names, then delegates to ``LDA.run_performance`` when
    ``type_m`` is ``'LDA'`` and to ``QDA.run_performance`` for any other value.

    Args:
        model: classifier object, forwarded unchanged to ``run_performance``.
        df_train: DataFrame with the training rows.
        df_test: DataFrame with the evaluation rows.
        labels_x: feature column names; joined for display and used to index
            both frames.
        label_tag: name of the target column in both frames.
        scaler: forwarded unchanged to ``run_performance`` (presumably toggles
            feature scaling - confirm in the classificators module).
        type_m: ``'LDA'`` selects the LDA runner; anything else selects QDA.

    Returns:
        Whatever the selected ``run_performance`` call returns.
    """
    print('For labels:', ', '.join(labels_x))
    # Only the module providing run_performance differs between the two paths.
    runner = LDA if type_m == 'LDA' else QDA
    return runner.run_performance(
        model,
        df_train[labels_x], df_train[label_tag],
        df_test[labels_x], df_test[label_tag],
        scaler,
    )
        
# Empty container for experiment results; the cells shown here never write to it.
results = dict()

### LDA

In [8]:
# First LDA run: three features; n_components is handed to the project's LDA factory.
labels_x = ['pert_total_nouns', 'n_words', 'title_sentiment']
scaler = True
n_components = 1
lda_model = LDA.create_LDA_model(n_components)
res = do_test(lda_model, df_train, df_test, labels_x, 'fake', scaler, 'LDA')

For labels: pert_total_nouns, n_words, title_sentiment
Confusion matrix:
[[ 6  2]
 [ 2 11]]
REPORT:
              precision    recall  f1-score   support

           0       0.75      0.75      0.75         8
           1       0.85      0.85      0.85        13

   micro avg       0.81      0.81      0.81        21
   macro avg       0.80      0.80      0.80        21
weighted avg       0.81      0.81      0.81        21

Train precision score: 0.75
Test precision score: 0.8461538461538461


In [9]:
# Second LDA run: reuses lda_model and scaler from the previous cell with four features.
labels_x = [
    'pert_total_verbs',
    'n_words',
    'title_sentiment',
    'pert_different_words',
]
res = do_test(lda_model, df_train, df_test, labels_x, 'fake', scaler, 'LDA')

For labels: pert_total_verbs, n_words, title_sentiment, pert_different_words
Confusion matrix:
[[ 4  4]
 [ 2 11]]
REPORT:
              precision    recall  f1-score   support

           0       0.67      0.50      0.57         8
           1       0.73      0.85      0.79        13

   micro avg       0.71      0.71      0.71        21
   macro avg       0.70      0.67      0.68        21
weighted avg       0.71      0.71      0.70        21

Train precision score: 0.7125
Test precision score: 0.7333333333333333


### QDA

In [10]:
# QDA run on the same three features as the first LDA experiment.
# NOTE(review): despite its name, lda_model is rebound to a QDA model here and
# overwrites the LDA model created earlier; re-running an earlier LDA cell after
# this one would pass the QDA model to the LDA runner. Consider a dedicated name.
labels_x = ['pert_total_nouns', 'n_words', 'title_sentiment']
scaler = True
lda_model = QDA.create_QDA_model()
res = do_test(lda_model, df_train, df_test, labels_x, 'fake', scaler, 'QDA')

For labels: pert_total_nouns, n_words, title_sentiment
Confusion matrix:
[[ 6  2]
 [ 2 11]]
REPORT:
              precision    recall  f1-score   support

           0       0.75      0.75      0.75         8
           1       0.85      0.85      0.85        13

   micro avg       0.81      0.81      0.81        21
   macro avg       0.80      0.80      0.80        21
weighted avg       0.81      0.81      0.81        21

Train precision score: 0.75
Test precision score: 0.8461538461538461
