# Experiment 6: TF-IDF + DOC2VEC

In [5]:
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('/home/elenaruiz/Documents/FNC')
import pandas as pd 
import numpy as np 
from src.utils import io

from src.fake_news_detector.core.encoders import tfidf_helpers as tf 
from src.fake_news_detector.core.classificators import SupportVectorMachine as svm_controller

## 1. Import `dataset_content.json`

In [6]:
articles = io.read_json_file('/home/elenaruiz/Documents/FNC/src/data/dataset_content.json')
df = pd.DataFrame(data=articles['articles']) # Put in pandas dataframe

In [7]:
def join_lists(dataset, word_lists):
    result = []
    for _, row in dataset.iterrows():
        text_join = ""
        for feature in word_lists:
            doc_list = row[feature]
            text_join += ' '.join(doc_list)
        result.append(text_join)
    return result  

In [8]:
dataset = pd.DataFrame()
dataset['text'] = join_lists(df, ['all_word'])
dataset['label'] = df['fake']*1
dataset.head()

Unnamed: 0,text,label
0,find corpse vegetarian restaurant Bangkok find...,1
1,switzerland warn authorize extradition politic...,1
2,navarre censor Songs Amaral Shakira song Madma...,1
3,woman pretend blind years greet people Now tru...,1
4,arrested ejaculate boss coffee last four years...,1


## 2. Split datasets

In [9]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

## 3. Create vocabulary

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv_values = cv.fit_transform(dataset['text'].values)
cv_values.shape

(137, 6713)

In [11]:
# 1. Split datasets in real and false
df_train_real = df_train.loc[df_train['label'] == 0]
df_train_fake = df_train.loc[df_train['label'] == 1]

In [12]:
# 2. Transform each group from text to vocabulary
cv_train_real = cv.transform(df_train_real['text'])
cv_train_fake = cv.transform(df_train_fake['text'])
print(cv_train_real.shape,cv_train_fake.shape)

(52, 6713) (57, 6713)


## 4. Create TF-IDF models

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

In [14]:
# 3. Create models for each
tfidf_model_real = TfidfTransformer(use_idf=True).fit(cv_train_real)
tfidf_model_fake = TfidfTransformer(use_idf=True).fit(cv_train_fake)

### 4.1 Fake news relevant words
We want to get the top relevant words of Fake News documents by the TF-IDF create, so we compute from fake news text to tfidf weights.

In [15]:
topn = 600

In [16]:
tf_words_fake_train = tfidf_model_fake.transform(cv_train_fake)
results_fake, top_fake_words = tf.get_topn_relevant_words(cv, tf_words_fake_train, topn)

In [17]:
results_fake

{'rice': 0.689,
 'cheese': 0.637,
 'dog': 0.624,
 'restaurant': 0.5,
 'ikea': 0.5,
 'switzerland': 0.481,
 'crocodiles': 0.48,
 'sexual': 0.458,
 'foreign': 0.442,
 'songs': 0.438,
 'cor': 0.435,
 'wax': 0.428,
 'guitar': 0.424,
 'day': 0.155,
 'water': 0.152,
 'beach': 0.41,
 'echenique': 0.41,
 'libya': 0.409,
 'semen': 0.405,
 'store': 0.4,
 'prisoners': 0.379,
 'alejandro': 0.395,
 'blind': 0.39,
 'museum': 0.385,
 'pastor': 0.384,
 'cold': 0.383,
 'women': 0.202,
 'drivers': 0.192,
 'use': 0.149,
 'military': 0.373,
 'crush': 0.371,
 'feminist': 0.37,
 'ugly': 0.368,
 'pp': 0.366,
 'guindos': 0.36,
 'melilla': 0.359,
 'police': 0.153,
 'forest': 0.355,
 'minimum': 0.354,
 'control': 0.206,
 'vallecas': 0.346,
 'office': 0.344,
 'abedi': 0.341,
 'children': 0.161,
 'feminism': 0.34,
 'cents': 0.337,
 'airline': 0.337,
 'muslim': 0.324,
 'que': 0.163,
 'wage': 0.322,
 'madrid': 0.14,
 'families': 0.321,
 'extradition': 0.321,
 'flag': 0.314,
 'vagina': 0.314,
 'parent': 0.314,
 'ger

### 4.2 Real news relevant words
It needs to do the same process but with real news articles.

In [18]:
tf_words_real_train = tfidf_model_real.transform(cv_train_real)
results_real, top_real_words = tf.get_topn_relevant_words(cv, tf_words_real_train, topn)

In [37]:
top_real_words

['bitcoin',
 'mw',
 'columbus',
 'attack',
 'purchase',
 'bbva',
 'maroto',
 'vox',
 'education',
 'bettong',
 'degree',
 'valeria',
 'frb',
 'burst',
 'passengers',
 'bbva',
 'calviño',
 'manicure',
 'listen',
 'operations',
 'sesame',
 'netflix',
 'price',
 'catalonia',
 'borrell',
 'waistcoats',
 'liceu',
 'conservatory',
 'statue',
 'bet',
 'liter',
 'abascal',
 'choke',
 'congress',
 'death',
 'survivors',
 'political',
 'motion',
 'washington',
 'panda',
 'giant',
 'syria',
 'ambulances',
 'cuba',
 'movistar',
 'workers',
 'yellow',
 'authority',
 'january',
 'unemployment',
 'artists',
 'disinformation',
 'hathaway',
 'moreno',
 'xiaomi',
 'ikea',
 'introduce',
 'plaque',
 'quer',
 'wilson',
 'abuse',
 'sport',
 'coward',
 'payton',
 'fox',
 'refugees',
 'foreigners',
 'ryanair',
 'aviation',
 'radio',
 'fine',
 'private',
 'patrol',
 'gibraltar',
 'wind',
 'offshore',
 'zarzuela',
 'employment',
 'euros',
 'elections',
 'hanukkah',
 'law',
 'health',
 'ot',
 'pp',
 'crime',
 'f

## 5 Create Doc2Vec model and compute similarity

In [71]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy import spatial

def tag_docs(docs, col):
    tagged = docs.apply(lambda r: TaggedDocument(words=r[col], tags=[r.label]), axis=1)
    return tagged

In [72]:
total_docs = tag_docs(dataset, 'text')

In [110]:
model = Doc2Vec(total_docs, dm = 0, min_count=3, window=10, size=50, sample=1e-4, negative=10)

In [111]:
vec_real = model.infer_vector(top_real_words)
vec_fake = model.infer_vector(top_fake_words)

### 5.1 Calculate similarity between documents

In [114]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [115]:
df_train_tagged, df_test_tagged = train_test_split(total_docs, test_size=0.2, random_state=42) # Using the same seed

In [121]:
Y_train, train_tagged = get_vectors(model, df_train_tagged)
Y_test, test_tagged = get_vectors(model, df_test_tagged)

In [117]:
train_similarity = []
for element in train_tagged:
    similairty_real = spatial.distance.cosine(element, vec_real)
    similairty_fake = spatial.distance.cosine(element, vec_fake)
    train_similarity.append([similairty_real,similairty_fake])

In [118]:
test_similarity = []
for element in test_tagged:
    similairty_real = spatial.distance.cosine(element, vec_real)
    similairty_fake = spatial.distance.cosine(element, vec_fake)
    test_similarity.append([similairty_real,similairty_fake])

## 5 Create Doc2Vec model and compute similarity

In [125]:
from src.fake_news_detector.core.classificators import SupportVectorMachine as svm_controller
from sklearn import svm

In [123]:
X_train = train_similarity
X_test = test_similarity
rbf_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'rbf')
linear_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'linear')
poly_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'poly')
sigmoid_values = svm_controller.svc_param_selection(X_train, Y_train, 2, 'sigmoid')

In [126]:
models = {}
models['rbf'] = svm.SVC(kernel='rbf', C= rbf_values['C'], gamma=rbf_values['gamma'])
models['linear']  = svm.SVC(kernel='linear', C= linear_values['C'], gamma=linear_values['gamma'])
models['poly']  = svm.SVC(kernel='poly',C=poly_values['C'], gamma=poly_values['gamma'])
models['sigmoid'] = svm.SVC(kernel='sigmoid', C=sigmoid_values['C'], gamma=sigmoid_values['gamma'])

In [127]:
scores = svm_controller.run_models(models, X_train, Y_train, X_test, Y_test)
for model in scores:
    print('For model', model)
    print('Training score: {}. Test score: {}'.format(scores[model]['train'],scores[model]['test']))

For model rbf
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model linear
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model poly
Training score: 0.5229357798165137. Test score: 0.4642857142857143
For model sigmoid
Training score: 0.5229357798165137. Test score: 0.4642857142857143


In [131]:
X_train[:10]

[[1.0438207276165485, 0.7886397689580917],
 [1.0439171642065048, 0.79187972843647],
 [1.0436461009085178, 0.7890965044498444],
 [1.0415050648152828, 0.7904908806085587],
 [1.0426413342356682, 0.7959313094615936],
 [1.0471894592046738, 0.7938645333051682],
 [1.0498887673020363, 0.790956661105156],
 [1.0422907620668411, 0.7843248248100281],
 [1.0486450716853142, 0.7905933409929276],
 [1.0463269129395485, 0.787962332367897]]