# Experiment 2: BOW/TFIDF

In [1]:
# Load for Jupyter Notebook
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('/home/elenaruiz/Documents/FNC')
import pandas as pd 
import numpy as np 
from src.utils import io

from src.fake_news_detector.core.encoders import bow as b
from src.fake_news_detector.core.encoders import tfidf as t

## 1. Import `dataset_raw.json`

In [2]:
articles = io.read_json_file('/home/elenaruiz/Documents/FNC/src/data/dataset_content.json')
df = pd.DataFrame(data=articles['articles']) # Put in pandas dataframe

In [14]:
# Get useful info from our dataset:
dataset = pd.DataFrame()
list_f = ['all_joined', 'positive_words', 'negative_words', 'adjective_words', 'noun_phrases_words']
dataset = df[list_f]
dataset['label'] = df['fake']*1
dataset.head()

Unnamed: 0,all_joined,positive_words,negative_words,adjective_words,noun_phrases_words,label
0,The Thai police have clarified to the middle A...,[like],"[corpse, kill, fire, complain, deny, victim, v...","[vegetarian, international, human, middle, ope...","[corpse, vegetarian restaurant, Bangkok, find,...",1
1,"The Swiss government has said Tuesday that ""a ...","[Justice, like, Supreme, fair, perfectly]","[no, stress, offences, offences, accuse, argue...","[political, Swiss, legal, eventual, underline,...","[switzerland warn, extradition, political crim...",1
2,"The Government of Navarra, within the Skolae p...","[promote, great, hope, promote, like, affectio...","[censor, violence, ban, bad, fight, shit]","[navarre, educational, several, great, last, f...","[navarre censor Songs, Amaral Shakira, song, M...",1
3,Carmen Jiménez told her family and friends 28 ...,"[greet, truth, friends, truth, greet, justice]","[blind, blind, injury, blind, avoid]","[Spanish, i, social, pose, many, hard, whole, ...","[woman pretend, years, people, truth, Carmen J...",1
4,"Lewis Williams, a worker at an engineering fir...","[United, strong, superior, proud]","[arrested, detain, arrest, prison, abuse]","[last, past, despicable, strong, superior, ass...","[ejaculate, boss coffee, years, action, discus...",1


## 2. Dictionary creation and word vectorization

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

In [18]:
def join_lists(dataset, word_lists):
    result = []
    for _, row in dataset.iterrows():
        text_join = ""
        for feature in word_lists:
            doc_list = row[feature]
            text_join += ' '.join(doc_list)
        result.append(text_join)
    return result  

### 2.1 TF-IDF for all text

In [19]:
# VARIABLES
X_train = df_train['all_joined'].values
Y_train = df_train['label'].values
X_test = df_test['all_joined'].values
Y_test = df_test['label'].values

In [20]:
cv = CountVectorizer()
X_train_counts = cv.fit_transform(X_train)
X_train_counts.shape

(90, 6505)

In [21]:
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(90, 6505)

### 2.2 TF-IDF for sentiment features

In [22]:
X_train_sent = join_lists(df_train, ['positive_words', 'negative_words'])
Y_train = df_train['label'].values
X_test_sent = join_lists(df_test, ['positive_words', 'negative_words'])
Y_test = df_test['label'].values

In [23]:
cv_sent = CountVectorizer()
X_train_counts_sent = cv_sent.fit_transform(X_train_sent)
X_train_counts_sent.shape

(90, 648)

In [24]:
tf_transformer_sent = TfidfTransformer(use_idf=True).fit(X_train_counts_sent)
X_train_tf_sent = tf_transformer_sent.transform(X_train_counts_sent)
X_train_tf_sent.shape

(90, 648)

### 2.3 TF-IDF withoun conjuntion and preposition words

In [25]:
label_list = ['adjective_words', 'noun_phrases_words']
X_train_word = join_lists(df_train, label_list)
Y_train = df_train['label'].values
X_test_word = join_lists(df_test, label_list)
Y_test = df_test['label'].values

In [26]:
cv_word = CountVectorizer()
X_train_counts_word = cv_word.fit_transform(X_train_sent)
X_train_counts_word.shape

(90, 648)

In [45]:
tf_transformer_word = TfidfTransformer(use_idf=True).fit(X_train_counts_word)
X_train_tf_word = tf_transformer_word.transform(X_train_counts_word)
X_train_tf_word.shape

(90, 648)

## 3. Classification

In [28]:
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from src.fake_news_detector.core.classificators import SupportVectorMachine as s
from src.fake_news_detector.core.classificators import helpers

In [29]:
def svc_param_selection(X, y, nfolds, kernel):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel=kernel), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

### 3.1 TF-IDF all text

#### 3.1.1 Tranform test data

In [30]:
X_test_counts = cv.transform(X_test)
X_test_tf = tf_transformer.transform(X_test_counts)
X_test_tf.shape

(23, 6505)

#### 3.1.2 Search best parameters for SVC models

In [31]:
print('For rbf:', svc_param_selection(X_train_tf, Y_train, 2, 'rbf'))
print('For linear:', svc_param_selection(X_train_tf, Y_train, 2, 'linear'))
print('For poly:', svc_param_selection(X_train_tf, Y_train, 2, 'poly'))
print('For sigmoid:', svc_param_selection(X_train_tf, Y_train, 2, 'sigmoid'))

For rbf: {'C': 10, 'gamma': 0.1}
For linear: {'C': 10, 'gamma': 0.001}
For poly: {'C': 10, 'gamma': 1}
For sigmoid: {'C': 10, 'gamma': 1}


#### 3.1.3 Train models

In [35]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= 10, gamma=0.1)
models['linear']  = svm.SVC(kernel='linear', C= 10, gamma=0.001)
models['poly']  = svm.SVC(kernel='poly', C= 10, gamma=1)
models['sigmoid'] = svm.SVC(kernel='sigmoid', C= 10, gamma=1)

In [36]:
scores = s.run_models(models, X_train_tf, Y_train, X_test_tf, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.9888888888888889. Test score: 0.7142857142857143
For model linear
Training score: 0.9888888888888889. Test score: 0.7142857142857143
For model poly
Training score: 0.9888888888888889. Test score: 0.625
For model sigmoid
Training score: 0.9888888888888889. Test score: 0.6153846153846154


### 3.1 TF-IDF positive and negative

#### 3.1.1 Tranform test data

In [37]:
X_test_counts_sent = cv_sent.transform(X_test_sent)
X_test_tf_sent = tf_transformer_sent.transform(X_test_counts_sent)
X_test_tf_sent.shape

(23, 648)

#### 3.1.2 Search best parameters for SVC models

In [38]:
print('For rbf:', svc_param_selection(X_train_tf_sent, Y_train, 2, 'rbf'))
print('For linear:', svc_param_selection(X_train_tf_sent, Y_train, 2, 'linear'))
print('For poly:', svc_param_selection(X_train_tf_sent, Y_train, 2, 'poly'))
print('For sigmoid:', svc_param_selection(X_train_tf_sent, Y_train, 2, 'sigmoid'))

For rbf: {'C': 10, 'gamma': 0.1}
For linear: {'C': 1, 'gamma': 0.001}
For poly: {'C': 0.001, 'gamma': 0.001}
For sigmoid: {'C': 10, 'gamma': 1}


In [40]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= 10, gamma=0.1)
models['linear']  = svm.SVC(kernel='linear', C= 1, gamma=0.001)
models['poly']  = svm.SVC(kernel='poly', C= 0.001, gamma=0.001)
models['sigmoid'] = svm.SVC(kernel='sigmoid', C= 10, gamma=1)

In [41]:
scores = s.run_models(models, X_train_tf_sent, Y_train, X_test_tf_sent, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.9888888888888889. Test score: 0.5384615384615384
For model linear
Training score: 0.9888888888888889. Test score: 0.5
For model poly
Training score: 0.5555555555555556. Test score: 0.5217391304347826
For model sigmoid
Training score: 0.9888888888888889. Test score: 0.5


In [43]:
X_test_counts_word = cv_word.transform(X_test_word)
X_test_tf_word = tf_transformer_word.transform(X_test_counts_word)
X_test_tf_word.shape

(23, 648)

In [46]:
print('For rbf:', svc_param_selection(X_train_tf_word, Y_train, 2, 'rbf'))
print('For linear:', svc_param_selection(X_train_tf_word, Y_train, 2, 'linear'))
print('For poly:', svc_param_selection(X_train_tf_word, Y_train, 2, 'poly'))
print('For sigmoid:', svc_param_selection(X_train_tf_word, Y_train, 2, 'sigmoid'))

For rbf: {'C': 10, 'gamma': 0.1}
For linear: {'C': 10, 'gamma': 0.001}
For poly: {'C': 0.001, 'gamma': 0.001}
For sigmoid: {'C': 10, 'gamma': 1}


In [47]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= 10, gamma=0.1)
models['linear']  = svm.SVC(kernel='linear', C= 1, gamma=0.001)
models['poly']  = svm.SVC(kernel='poly', C= 0.001, gamma=0.001)
models['sigmoid'] = svm.SVC(kernel='sigmoid', C= 10, gamma=1)

In [48]:
scores = s.run_models(models, X_train_tf_word, Y_train, X_test_tf_word, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.9888888888888889. Test score: 0.6153846153846154
For model linear
Training score: 0.9888888888888889. Test score: 0.5294117647058824
For model poly
Training score: 0.5555555555555556. Test score: 0.5217391304347826
For model sigmoid
Training score: 0.9888888888888889. Test score: 0.6666666666666666
