In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from os.path import join, dirname
from lib.processor import preprocess_text
import gensim
from gensim.models.word2vec import Word2Vec
import nltk

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.model_selection import train_test_split


In [7]:
# The *string* '__file__' is intentional: it is resolved as a relative path
# against the current working directory, so its parent directory is the
# working directory itself (with symlinks resolved).
# NOTE(review): presumably used because a notebook kernel has no module-level
# __file__ -- confirm the kernel is always started from the project root.
BASE_DIR = dirname(os.path.realpath('__file__'))

C:\Users\admind\Documents\workspace


In [51]:
# Read the ';'-separated corpus, run every article through preprocess_text,
# then drop rows that contain any missing value after preprocessing.
# NOTE(review): the five positional booleans are opaque from here -- check
# lib.processor.preprocess_text for what each flag switches on or off.
data = (
    pd.read_csv(os.path.join(BASE_DIR, 'data', 'processed.csv'), sep=';')
    .assign(
        text=lambda frame: frame['text'].map(
            lambda raw: preprocess_text(raw, False, False, False, False, True)
        )
    )
    .dropna()
)

In [52]:
# Print the first five rows as a quick sanity check of the preprocessing.
preview = data.head(n=5)
print(preview)

                                                text  label
0  imag copyright getti imag sunday morn donald t...      1
1  london reuter flag fli comedydrama vietnam war...      1
2  feud break public view week Mr corker Mr trump...      1
3  mexico citi reuter egypt cheiron hold limit wi...      1
4  2012 kansa lawmak lead gov sam brownback repub...      1


In [53]:
def w2v_tokenize_text(text):
    """Return the word tokens of ``text``, skipping tokens shorter than two characters.

    The text is split into sentences with ``nltk.sent_tokenize`` and every
    sentence is split into words with ``nltk.word_tokenize`` (both called with
    ``language='english'``). The surviving tokens are returned as one flat
    list in document order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [54]:
def word_averaging(wv, words):
    """Average the normalised word vectors of ``words`` into one unit-length vector.

    Parameters
    ----------
    wv : gensim KeyedVectors-like object
        Must expose ``vector_size`` and either the gensim >= 4 interface
        (``key_to_index`` and ``get_vector(key, norm=True)``) or the
        gensim < 4 interface (``vocab`` and ``vectors_norm``).
    words : iterable
        Tokens to look up. An item that is already an ``np.ndarray`` is used
        as a vector directly; tokens missing from the vocabulary are skipped.

    Returns
    -------
    np.ndarray
        float32 vector: the mean of the collected vectors passed through
        ``gensim.matutils.unitvec``. When nothing could be collected, a
        float32 zero vector of length ``wv.vector_size`` is returned, so
        stacking many results never silently upcasts the matrix to float64.
    """
    if hasattr(wv, 'key_to_index'):
        # gensim >= 4: ``vocab`` / ``vectors_norm`` were removed.
        known = wv.key_to_index

        def lookup(token):
            return wv.get_vector(token, norm=True)
    else:
        # gensim < 4: requires init_sims() to have filled ``vectors_norm``.
        known = wv.vocab

        def lookup(token):
            return wv.vectors_norm[known[token].index]

    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            # Caller supplied a ready-made vector instead of a token.
            vectors.append(word)
        elif word in known:
            vectors.append(lookup(word))

    if not vectors:
        return np.zeros(wv.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack one averaged word vector per document into a 2-D feature matrix.

    Parameters
    ----------
    wv : gensim KeyedVectors-like object
        Passed through to ``word_averaging``; ``wv.vector_size`` gives the
        number of columns.
    text_list : iterable of token lists
        One tokenized document per item.

    Returns
    -------
    np.ndarray
        Matrix with one row per document. An empty ``text_list`` yields an
        empty ``(0, wv.vector_size)`` float32 matrix rather than the
        ``ValueError`` that ``np.vstack`` raises for an empty sequence.
    """
    rows = [word_averaging(wv, text) for text in text_list]
    if not rows:
        return np.empty((0, wv.vector_size), dtype=np.float32)
    return np.vstack(rows)

In [55]:
# Split the dataset 70/30 (fixed random_state so the split is reproducible)
# and tokenize each part for w2v.
train, test = train_test_split(data, test_size=0.3, random_state=42)

# Series.map runs the tokenizer on the text column directly; this avoids
# building a row Series per record as DataFrame.apply(axis=1) does, and always
# yields one token list per document.
train_tokenized = train['text'].map(w2v_tokenize_text).values
test_tokenized = test['text'].map(w2v_tokenize_text).values

In [24]:
# Load the pretrained GoogleNews word2vec vectors (binary word2vec format).
pretrained_path = os.path.join(BASE_DIR, "models", "GoogleNews-vectors-negative300.bin.gz")
wv = gensim.models.KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
# word_averaging reads the normalised vectors, so they must be initialised here.
# NOTE(review): per the gensim docs, replace=True overwrites the raw vectors
# with their L2-normalised form to save memory -- confirm for the installed version.
wv.init_sims(replace=True)

In [56]:
# Turn every tokenized document into one averaged word vector
# (the training split is processed first, then the test split).
X_train_word_average, X_test_word_average = (
    word_averaging_list(wv, tokenized_docs)
    for tokenized_docs in (train_tokenized, test_tokenized)
)


In [57]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the averaged word vectors.
# C is the inverse regularisation strength, so C=1e5 means the penalty is
# almost switched off.
logreg = LogisticRegression(n_jobs=2, solver='lbfgs', C=1e5)
logreg.fit(X_train_word_average, train['label'])
y_pred = logreg.predict(X_test_word_average)

# scikit-learn metrics take (y_true, y_pred) in that order; both calls below
# now use the same order.
print('accuracy %s' % accuracy_score(test['label'], y_pred))
# NOTE(review): target_names are matched to the sorted label values, i.e.
# 0 -> 'real' and 1 -> 'fake' -- confirm this matches the dataset's encoding.
print(classification_report(test['label'], y_pred, target_names=['real', 'fake']))

accuracy 0.8548044217687075
              precision    recall  f1-score   support

        real       0.87      0.90      0.89      3005
        fake       0.82      0.77      0.79      1699

   micro avg       0.85      0.85      0.85      4704
   macro avg       0.85      0.84      0.84      4704
weighted avg       0.85      0.85      0.85      4704

