In [1]:
import os
from os.path import join, dirname

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from lib.processor import preprocess_text


In [2]:
# Notebook cells have no real __file__, so the *string* '__file__' is resolved
# against the current working directory; taking its parent yields the
# (symlink-resolved) directory the kernel was started in.
BASE_DIR = dirname(os.path.realpath('__file__'))

In [11]:
# Read the comma-separated dataset that sits in BASE_DIR and preview the raw rows.
data = pd.read_csv(os.path.join(BASE_DIR, 'dataset.csv'), sep=',')
print(data.head())
# Overwrite the text column with its preprocessed version.
# NOTE(review): the five positional flags are forwarded to lib.processor.preprocess_text
# as-is; their meaning is not visible from this notebook — confirm against that module.
data['text'] = data['text'].map(
    lambda raw_text: preprocess_text(raw_text, True, True, False, True, True)
)


                                                text  label
0  In an interview with congressional investigato...      0
1  Getty - John Shearer / Staff \r\nComedian Patt...      1
2  By Jameson Parker Election 2016 , Politics Nov...      1
3   Last updated at 16:53 GMT A helmet for cyclis...      0
4  13 Herbal Teas With Highest Antioxidants http:...      1


In [12]:
# Preview the frame again, now that the text column has been preprocessed.
print(data.head())

                                                text  label
0  interview congressional investigator diplomat ...      0
1  Getty John Shearer Staff Comedian Patton Oswal...      1
2  Jameson Parker Election 2016 Politics November...      1
3  update 1653 GMT helmet cyclist spot imminent c...      0
4  13 Herbal Teas Highest Antioxidants httpblogsn...      1


In [13]:
# Write the preprocessed frame to data/wj_processed.csv (semicolon-separated,
# UTF-8, without the index column).
# to_csv does not create missing folders, so make sure the target exists first.
processed_dir = os.path.join(BASE_DIR, 'data')
os.makedirs(processed_dir, exist_ok=True)
data.to_csv(os.path.join(processed_dir, 'wj_processed.csv'), sep=';', encoding='utf-8', index=False)

In [14]:
def w2v_tokenize_text(text):
    """Tokenize `text` into a flat list of word tokens.

    The text is first split into sentences with ``nltk.sent_tokenize`` and
    each sentence into words with ``nltk.word_tokenize`` (both with
    ``language='english'``). Tokens shorter than two characters are dropped.

    Returns the remaining tokens as a list, in their original order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [15]:
def word_averaging(wv, words):
    """Return the unit-length mean embedding of `words` as a float32 vector.

    Parameters
    ----------
    wv : object exposing ``vocab``, ``vectors_norm`` and ``vector_size``
        (the gensim 3.x ``KeyedVectors`` attributes this notebook relies on).
    words : iterable of str or np.ndarray
        Strings are looked up in ``wv.vocab`` and silently skipped when
        absent; ndarray items are used directly as vectors.

    Returns
    -------
    np.ndarray
        The mean of the collected vectors passed through
        ``gensim.matutils.unitvec``, cast to float32. When no vector could
        be collected, a float32 zero vector of length ``wv.vector_size``.
    """
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.vectors_norm[wv.vocab[word].index])

    if not vectors:
        # Same dtype as the normal path, so stacking many results with
        # np.vstack does not upcast the whole matrix to float64.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged embedding of every document into one 2-D array.

    Each element of `text_list` is passed to ``word_averaging`` together with
    `wv`; the resulting vectors become the rows of the returned array, in the
    same order as `text_list`.
    """
    rows = []
    for text in text_list:
        rows.append(word_averaging(wv, text))
    return np.vstack(rows)

In [16]:
# Tokenize the dataset for w2v. The per-split variant is kept below,
# commented out; only the full dataset is tokenized here.
#train, test = train_test_split(data, test_size=0.3, random_state = 42)

#test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
#train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
# Map over the text column directly instead of a row-wise DataFrame.apply:
# it avoids building a Series per row and always yields one token list per
# document (row-wise apply can expand list results into columns on older pandas).
data_tokenized = data['text'].map(w2v_tokenize_text).values

In [17]:
# Load the pretrained word2vec vectors stored in binary word2vec format
# under BASE_DIR/models.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    os.path.join(BASE_DIR, "models", "GoogleNews-vectors-negative300.bin.gz"),
    binary=True,
)
# word_averaging reads wv.vectors_norm, so the normalised vectors must be
# initialised before it is called (gensim 3.x API).
# NOTE(review): replace=True presumably discards the un-normalised vectors to
# save memory — confirm against the gensim documentation.
wv.init_sims(replace=True)

In [18]:
# Build the feature matrix: one averaged embedding row per tokenized document.
data_word_average = word_averaging_list(wv, data_tokenized)

In [22]:
# Show the first five rows of the feature matrix.
print(data_word_average[:5])

[[ 0.0288958   0.04923387  0.03451601 ... -0.05211262  0.04874931
  -0.01187791]
 [ 0.03234296  0.0256858  -0.03558049 ... -0.05346137  0.0661214
  -0.01541387]
 [ 0.07398563  0.01908754 -0.00030894 ... -0.06885757  0.00210524
   0.00119012]
 [-0.00864953 -0.02571224 -0.00632877 ...  0.00580416  0.05334501
  -0.09583674]
 [-0.04196492  0.04734785 -0.02476608 ...  0.06400798  0.06183368
   0.06039234]]


In [24]:
# Dimensions of the feature matrix (displayed as the cell's output).
data_word_average.shape

(15679, 300)

In [26]:
# Wrap the feature matrix in a DataFrame; no labels are passed, so rows and
# columns get pandas' default integer labels.
df = pd.DataFrame(data_word_average)

        0         1         2         3         4         5         6    \
0  0.028896  0.049234  0.034516  0.055213 -0.105407 -0.046933  0.025233   
1  0.032343  0.025686 -0.035580  0.104289 -0.042332 -0.003351  0.064108   
2  0.073986  0.019088 -0.000309  0.055446 -0.021151  0.014929  0.045064   
3 -0.008650 -0.025712 -0.006329 -0.010441 -0.097033 -0.047436  0.067023   
4 -0.041965  0.047348 -0.024766  0.090478 -0.041287  0.065566  0.045625   

        7         8         9      ...          290       291       292  \
0 -0.109444  0.141332  0.047805    ...    -0.037357 -0.022533 -0.076386   
1 -0.085843  0.125483  0.033254    ...    -0.032302  0.009351 -0.131091   
2 -0.007039  0.130749  0.065265    ...    -0.000070 -0.008963 -0.106510   
3 -0.086308  0.152578  0.020291    ...     0.006758 -0.013189 -0.129570   
4 -0.098067  0.058216  0.062641    ...     0.001415 -0.025904 -0.015243   

        293       294       295       296       297       298       299  
0  0.008189 -0.036483 -0

In [28]:
# Write the feature frame to data/w2v_300_dim.csv (comma-separated, UTF-8,
# without the index column).
# to_csv does not create missing folders, so make sure the target exists first.
features_dir = os.path.join(BASE_DIR, 'data')
os.makedirs(features_dir, exist_ok=True)
df.to_csv(os.path.join(features_dir, 'w2v_300_dim.csv'), sep=',', encoding='utf-8', index=False)

In [56]:
#X_train_word_average = word_averaging_list(wv,train_tokenized)
#X_test_word_average = word_averaging_list(wv,test_tokenized)


In [57]:
# Rebuild the 70/30 split here so this cell runs on a fresh kernel: the code
# that used to create `train`, `test` and the two feature matrices is only
# present as commented-out lines earlier in the notebook. Splitting the frame
# and the feature matrix in a single call keeps their rows aligned.
train, test, X_train_word_average, X_test_word_average = train_test_split(
    data, data_word_average, test_size=0.3, random_state=42
)

# Logistic regression on the averaged word vectors; C=1e5 means very weak
# regularisation (C is the inverse regularisation strength in scikit-learn).
logreg = LogisticRegression(n_jobs=2, solver='lbfgs', C=1e5)
logreg = logreg.fit(X_train_word_average, train['label'])
y_pred = logreg.predict(X_test_word_average)

# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(test.label, y_pred))
print(classification_report(test.label, y_pred, target_names=['real', 'fake']))

accuracy 0.8548044217687075
              precision    recall  f1-score   support

        real       0.87      0.90      0.89      3005
        fake       0.82      0.77      0.79      1699

   micro avg       0.85      0.85      0.85      4704
   macro avg       0.85      0.84      0.84      4704
weighted avg       0.85      0.85      0.85      4704

