# Experiment 5: DOC2VEC

In [15]:
# Load for Jupyter Notebook
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('/home/elenaruiz/Documents/FNC')
import pandas as pd 
import numpy as np 
from src.utils import io

## Import `dataset_content.json`

In [16]:
articles = io.read_json_file('/home/elenaruiz/Documents/FNC/src/data/dataset_content.json')
df = pd.DataFrame(data=articles['articles']) # Put in pandas dataframe

In [17]:
def join_lists(dataset, word_lists):
    result = []
    for _, row in dataset.iterrows():
        text_join = ""
        for feature in word_lists:
            doc_list = row[feature]
            text_join += ' '.join(doc_list)
        result.append(text_join)
    return result  

In [27]:
dataset = pd.DataFrame()
dataset['text'] = df['all_word']
dataset['label'] = df['fake']*1
dataset.head()

Unnamed: 0,text,label
0,"[find, corpse, vegetarian, restaurant, Bangkok...",1
1,"[switzerland, warn, authorize, extradition, po...",1
2,"[navarre, censor, Songs, Amaral, Shakira, song...",1
3,"[woman, pretend, blind, years, greet, people, ...",1
4,"[arrested, ejaculate, boss, coffee, last, four...",1


## Split dataset for training and validation

In [28]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

In [37]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tqdm
train_tagged = df_train.apply(
    lambda r: TaggedDocument(words=r['text'], tags=[r.label]), axis=1)
test_tagged = df_test.apply(
    lambda r: TaggedDocument(words=r['text'], tags=[r.label]), axis=1)

## Create Doc2Vec model

In [40]:
model = Doc2Vec(dm = 1, min_count=1, window=10, size=150, sample=1e-4, negative=10)
model.build_vocab(train_tagged.values)

In [42]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [49]:
Y_train, X_train = vec_for_learning(model, train_tagged)
Y_test, X_test = vec_for_learning(model, test_tagged)

## Classification with SVM

In [50]:
from src.fake_news_detector.core.classificators import SupportVectorMachine as svm_controller
from sklearn import svm

In [51]:
rbf_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'rbf')
linear_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'linear')
poly_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'poly')
sigmoid_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'sigmoid')

In [52]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= rbf_values['C'], gamma=rbf_values['gamma'])
models['linear']  = svm.SVC(kernel='linear', C= linear_values['C'], gamma=linear_values['gamma'])
models['poly']  = svm.SVC(kernel='poly',C=poly_values['C'], gamma=poly_values['gamma'])
models['sigmoid'] = svm.SVC(kernel='sigmoid', C=sigmoid_values['C'], gamma=sigmoid_values['gamma'])

In [54]:
scores = svm_controller.run_models(models, X_train, Y_train, X_test, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model linear
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model poly
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model sigmoid
Training score: 0.5229357798165137. Test score: 0.4642857142857143


## Create Doc2Vec model version 2

In [72]:
model = Doc2Vec(dm = 0, min_count=3, window=10, size=150, sample=1e-4, negative=10)
model.build_vocab(train_tagged.values)

In [77]:
Y_train, X_train = vec_for_learning(model, train_tagged)
Y_test, X_test = vec_for_learning(model, test_tagged)

In [74]:
rbf_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'rbf')
linear_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'linear')
poly_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'poly')
sigmoid_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'sigmoid')

In [70]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= rbf_values['C'], gamma=rbf_values['gamma'])
models['linear']  = svm.SVC(kernel='linear', C= linear_values['C'], gamma=linear_values['gamma'])
models['poly']  = svm.SVC(kernel='poly',C=poly_values['C'], gamma=poly_values['gamma'])
models['sigmoid'] = svm.SVC(kernel='sigmoid', C=sigmoid_values['C'], gamma=sigmoid_values['gamma'])

In [71]:
scores = svm_controller.run_models(models, X_train, Y_train, X_test, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model linear
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model poly
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model sigmoid
Training score: 0.5229357798165137. Test score: 0.4642857142857143
