# Experiment 2: BOW/TFIDF

In [1]:
# Load for Jupyter Notebook
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('/home/elenaruiz/Documents/FNC')
import pandas as pd 
import numpy as np 
from src.utils import io

from src.fake_news_detector.core.encoders import bow as b
from src.fake_news_detector.core.encoders import tfidf as t

## 1. Import `dataset_raw.json`

In [2]:
articles = io.read_json_file('/home/elenaruiz/Documents/FNC/src/data/dataset_content.json')
df = pd.DataFrame(data=articles['articles']) # Put in pandas dataframe

In [3]:
# Get useful info from our dataset:
dataset = pd.DataFrame()
list_f = ['all_word', 'title_word', 'subtitle_word', 'negative_words','positive_words', 'adjective_words', 'common_noun_words', 'verb_words']
dataset = df[list_f]
dataset['label'] = df['fake']*1
dataset.head()

Unnamed: 0,all_word,title_word,subtitle_word,negative_words,positive_words,adjective_words,common_noun_words,verb_words,label
0,"[find, corpse, vegetarian, restaurant, Bangkok...","[find, corpse, vegetarian, restaurant, Bangkok]","[find, make, speculate, international, media, ...","[corpse, kill, fire, complain, deny, victim, v...",[like],"[vegetarian, make, speculate, international, h...","[corpse, restaurant, Bangkok, media, establish...","[find, serve, clarify, kill, collect, complain...",1
1,"[switzerland, warn, authorize, extradition, po...","[switzerland, warn, authorize, extradition, po...","[spokesman, Swiss, government, clarify, case, ...","[stress, offences, offences, accuse, argue, pe...","[Justice, like, Supreme, fair, perfectly]","[extradition, political, Swiss, priori, legal,...","[switzerland, warn, crimes, A, spokesman, gove...","[authorize, analyze, say, seem, link, A, add, ...",1
2,"[navarre, censor, Songs, Amaral, Shakira, song...","[navarre, censor, Songs, Amaral, Shakira, song...","[may, use, school]","[censor, violence, ban, bad, fight, shit]","[promote, great, hope, promote, like, affectio...","[navarre, educational, decide, several, great,...","[censor, Songs, Amaral, Shakira, song, Madman,...","[use, find, song, give, hope, promote, center,...",1
3,"[woman, pretend, blind, years, greet, people, ...","[woman, pretend, blind, years, greet, people]","[truth, discover]","[blind, blind, injury, blind, avoid]","[greet, truth, friends, truth, greet, justice]","[greet, discover, Spanish, social, avoid, many...","[woman, years, people, Carmen, Jiménez, family...","[pretend, truth, tell, tire, see, stand, say, ...",1
4,"[arrested, ejaculate, boss, coffee, last, four...","[arrested, ejaculate, boss, coffee, last, four...","[recognize, action, discussion]","[arrested, detain, arrest, prison, abuse]","[United, strong, superior, proud]","[ejaculate, last, unveil, despicable, strong, ...","[boss, coffee, years, action, discussion, Lewi...","[arrested, recognize, detain, ejaculate, bring...",1


## 2. Dictionary creation and word vectorization

In [4]:
def join_lists(dataset, word_lists):
    result = []
    for _, row in dataset.iterrows():
        text_join = ""
        for feature in word_lists:
            doc_list = row[feature]
            text_join += ' '.join(doc_list)
        result.append(text_join)
    return result  

In [5]:
dataset['all_joined'] = join_lists(dataset, ['all_word'])
dataset['all_joined'].head()

0    find corpse vegetarian restaurant Bangkok find...
1    switzerland warn authorize extradition politic...
2    navarre censor Songs Amaral Shakira song Madma...
3    woman pretend blind years greet people Now tru...
4    arrested ejaculate boss coffee last four years...
Name: all_joined, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

### 2.1 TF-IDF for all text

In [7]:
# VARIABLES
X_train = df_train['all_joined'].values
Y_train = df_train['label'].values
X_test = df_test['all_joined'].values
Y_test = df_test['label'].values

In [8]:
cv = CountVectorizer()
X_train_counts = cv.fit_transform(X_train)
X_train_counts.shape

(109, 5817)

In [9]:
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(109, 5817)

### 2.2 TF-IDF for sentiment features

In [10]:
X_train_sent = join_lists(df_train, ['positive_words', 'negative_words'])
Y_train = df_train['label'].values
X_test_sent = join_lists(df_test, ['positive_words', 'negative_words'])
Y_test = df_test['label'].values

In [11]:
cv_sent = CountVectorizer()
X_train_counts_sent = cv_sent.fit_transform(X_train_sent)
X_train_counts_sent.shape

(109, 718)

In [12]:
tf_transformer_sent = TfidfTransformer(use_idf=True).fit(X_train_counts_sent)
X_train_tf_sent = tf_transformer_sent.transform(X_train_counts_sent)
X_train_tf_sent.shape

(109, 718)

### 2.3 TF-IDF withoun conjuntion and preposition words

In [13]:
label_list = ['adjective_words', 'common_noun_words', 'verb_words']
X_train_word = join_lists(df_train, label_list)
Y_train = df_train['label'].values
X_test_word = join_lists(df_test, label_list)
Y_test = df_test['label'].values

In [14]:
cv_word = CountVectorizer()
X_train_counts_word = cv_word.fit_transform(X_train_sent)
X_train_counts_word.shape

(109, 718)

In [15]:
tf_transformer_word = TfidfTransformer(use_idf=True).fit(X_train_counts_word)
X_train_tf_word = tf_transformer_word.transform(X_train_counts_word)
X_train_tf_word.shape

(109, 718)

### 2.3 TF-IDF title + subtitle

In [16]:
label_list = ['title_word', 'subtitle_word']
X_train_tnst = join_lists(df_train, label_list)
Y_train = df_train['label'].values
X_test_tnst = join_lists(df_test, label_list)
Y_test = df_test['label'].values

In [17]:
cv_tnst = CountVectorizer()
X_train_counts_tnst = cv_tnst.fit_transform(X_train_sent)
X_train_counts_tnst.shape

(109, 718)

In [18]:
tf_transformer_tnst = TfidfTransformer(use_idf=True).fit(X_train_counts_word)
X_train_tf_tnst = tf_transformer_tnst.transform(X_train_counts_tnst)
X_train_tf_tnst.shape

(109, 718)

## 3. Classification

In [19]:
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from src.fake_news_detector.core.classificators import SupportVectorMachine as s
from src.fake_news_detector.core.classificators import helpers

In [20]:
def svc_param_selection(X, y, nfolds, kernel):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel=kernel), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

### 3.1 TF-IDF all text

#### 3.1.1 Tranform test data

In [21]:
X_test_counts = cv.transform(X_test)
X_test_tf = tf_transformer.transform(X_test_counts)
X_test_tf.shape

(28, 5817)

#### 3.1.2 Search best parameters for SVC models

In [22]:
print('For rbf:', svc_param_selection(X_train_tf, Y_train, 2, 'rbf'))
print('For linear:', svc_param_selection(X_train_tf, Y_train, 2, 'linear'))
print('For poly:', svc_param_selection(X_train_tf, Y_train, 2, 'poly'))
print('For sigmoid:', svc_param_selection(X_train_tf, Y_train, 2, 'sigmoid'))



For rbf: {'C': 10, 'gamma': 0.1}




For linear: {'C': 10, 'gamma': 0.001}
For poly: {'C': 10, 'gamma': 1}
For sigmoid: {'C': 10, 'gamma': 1}




#### 3.1.3 Train models

In [23]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= 10, gamma=0.1)
models['linear']  = svm.SVC(kernel='linear', C= 10, gamma=0.001)
models['poly']  = svm.SVC(kernel='poly', C= 10, gamma=1)
models['sigmoid'] = svm.SVC(kernel='sigmoid', C= 10, gamma=1)

In [24]:
scores = s.run_models(models, X_train_tf, Y_train, X_test_tf, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.9908256880733946. Test score: 0.6875
For model linear
Training score: 0.9908256880733946. Test score: 0.6875
For model poly
Training score: 0.9908256880733946. Test score: 0.4642857142857143
For model sigmoid
Training score: 0.9908256880733946. Test score: 0.6875


### 3.1 TF-IDF positive and negative

#### 3.1.1 Tranform test data

In [25]:
X_test_counts_sent = cv_sent.transform(X_test_sent)
X_test_tf_sent = tf_transformer_sent.transform(X_test_counts_sent)
X_test_tf_sent.shape

(28, 718)

#### 3.1.2 Search best parameters for SVC models

In [26]:
print('For rbf:', svc_param_selection(X_train_tf_sent, Y_train, 2, 'rbf'))
print('For linear:', svc_param_selection(X_train_tf_sent, Y_train, 2, 'linear'))
print('For poly:', svc_param_selection(X_train_tf_sent, Y_train, 2, 'poly'))
print('For sigmoid:', svc_param_selection(X_train_tf_sent, Y_train, 2, 'sigmoid'))



For rbf: {'C': 10, 'gamma': 1}




For linear: {'C': 1, 'gamma': 0.001}
For poly: {'C': 10, 'gamma': 1}
For sigmoid: {'C': 10, 'gamma': 0.1}


In [27]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= 10, gamma=1)
models['linear']  = svm.SVC(kernel='linear', C= 1, gamma=0.001)
models['poly']  = svm.SVC(kernel='poly', C= 10, gamma=1)
models['sigmoid'] = svm.SVC(kernel='sigmoid', C= 10, gamma=0.1)

In [28]:
scores = s.run_models(models, X_train_tf_sent, Y_train, X_test_tf_sent, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.9908256880733946. Test score: 0.5625
For model linear
Training score: 0.9908256880733946. Test score: 0.5294117647058824
For model poly
Training score: 0.9908256880733946. Test score: 0.48148148148148145
For model sigmoid
Training score: 0.9908256880733946. Test score: 0.5294117647058824


### 3.1 TF-IDF verbs +  adjectives + similarity


In [29]:
X_test_counts_word = cv_word.transform(X_test_word)
X_test_tf_word = tf_transformer_word.transform(X_test_counts_word)
X_test_tf_word.shape

(28, 718)

In [30]:
print('For rbf:', svc_param_selection(X_train_tf_word, Y_train, 2, 'rbf'))
print('For linear:', svc_param_selection(X_train_tf_word, Y_train, 2, 'linear'))
print('For poly:', svc_param_selection(X_train_tf_word, Y_train, 2, 'poly'))
print('For sigmoid:', svc_param_selection(X_train_tf_word, Y_train, 2, 'sigmoid'))



For rbf: {'C': 10, 'gamma': 1}




For linear: {'C': 1, 'gamma': 0.001}
For poly: {'C': 10, 'gamma': 1}
For sigmoid: {'C': 10, 'gamma': 0.1}


In [31]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= 10, gamma=1)
models['linear']  = svm.SVC(kernel='linear', C= 1, gamma=0.001)
models['poly']  = svm.SVC(kernel='poly', C= 10, gamma=1)
models['sigmoid'] = svm.SVC(kernel='sigmoid', C= 10, gamma=0.1)

In [32]:
scores = s.run_models(models, X_train_tf_word, Y_train, X_test_tf_word, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.9908256880733946. Test score: 0.5
For model linear
Training score: 0.9908256880733946. Test score: 0.5
For model poly
Training score: 0.9908256880733946. Test score: 0.4642857142857143
For model sigmoid
Training score: 0.9908256880733946. Test score: 0.5


### 3.1 TF-IDF title and subtitle

In [33]:
X_test_counts_tnst = cv_tnst.transform(X_test_tnst)
X_test_tf_tnst = tf_transformer_tnst.transform(X_test_counts_tnst)
X_test_tf_tnst.shape

(28, 718)

In [34]:
print('For rbf:', svc_param_selection(X_train_tf_tnst, Y_train, 2, 'rbf'))
print('For linear:', svc_param_selection(X_train_tf_tnst, Y_train, 2, 'linear'))
print('For poly:', svc_param_selection(X_train_tf_tnst, Y_train, 2, 'poly'))
print('For sigmoid:', svc_param_selection(X_train_tf_tnst, Y_train, 2, 'sigmoid'))



For rbf: {'C': 10, 'gamma': 1}




For linear: {'C': 1, 'gamma': 0.001}
For poly: {'C': 10, 'gamma': 1}
For sigmoid: {'C': 10, 'gamma': 0.1}


In [35]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= 10, gamma=1)
models['linear']  = svm.SVC(kernel='linear', C= 1, gamma=0.001)
models['poly']  = svm.SVC(kernel='poly', C= 10, gamma=1)
models['sigmoid'] = svm.SVC(kernel='sigmoid', C= 10, gamma=0.1)

In [36]:
scores = s.run_models(models, X_train_tf_tnst, Y_train, X_test_tf_tnst, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.9908256880733946. Test score: 0.5
For model linear
Training score: 0.9908256880733946. Test score: 0.5
For model poly
Training score: 0.9908256880733946. Test score: 0.48148148148148145
For model sigmoid
Training score: 0.9908256880733946. Test score: 0.5
