# Importing Data

In [1]:
import pandas as pd

# Load the fake/real news dataset, reading at most the first 100,000 rows.
df = pd.read_csv('fake_or_real_news.csv', nrows=100000)

# Pull the three columns used downstream out as NumPy arrays.
title, text, label = (df[column].values for column in ('title', 'text', 'label'))


In [2]:
# Display the raw label array (values are the strings 'FAKE' / 'REAL').
label

array(['FAKE', 'FAKE', 'REAL', ..., 'FAKE', 'REAL', 'REAL'], dtype=object)

# Cleaning Data

In [3]:
# Check whether a value is just a number.
def is_number(s):
    """Return True when ``s`` can be read as a number, False otherwise.

    Two interpretations are tried in order:

    1. ``float(s)`` -- covers ordinary literals such as "12", "-3.5", "1e6"
       (and also "nan" / "inf", which ``float`` accepts).
    2. ``unicodedata.numeric(s)`` -- covers a single Unicode numeric
       character such as "½".

    Inputs that are neither strings nor numbers (``None``, lists, ...) yield
    False instead of raising.
    """
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # TypeError: ``s`` is not a str/number at all (e.g. None or a list).
        # ValueError: ``s`` is a string that is not a float literal.
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: not a single-character string; ValueError: the
        # character has no numeric value.
        pass
    return False


# Check whether a string contains English letters or only meaningless symbols.
def containenglish(str0):
    """Return True if ``str0`` contains at least one ASCII letter (a-z or A-Z).

    Upper-case letters count, so an all-caps headline is still recognised as
    containing English letters. Anything that is not a ``str`` (for example
    the float NaN that pandas uses for a missing cell) returns False rather
    than raising inside ``re.search``.
    """
    import re

    if not isinstance(str0, str):
        return False
    return bool(re.search('[A-Za-z]', str0))

# Clean non-English news, useless here

In [44]:
from langdetect import detect
import types
# Keep only the rows whose text AND title are detected as English, collecting
# text, title and the matching labels in parallel lists.
from langdetect import DetectorFactory

# langdetect is randomised by default; a fixed seed keeps the filter (and so
# every downstream row count) identical between runs.
DetectorFactory.seed = 0

Eng_list = []
Text_label = []

Title_list = []
Title_label = []

for body, headline, tag in zip(text, title, label):
    # Cheap pre-filter: skip rows with no letters at all before running the
    # (much slower) language detector on them.
    if not (containenglish(body) and containenglish(headline)):
        continue
    if detect(body) == 'en' and detect(headline) == 'en':
        Eng_list.append(body)
        Text_label.append(tag)
        Title_list.append(headline)
        # A title carries the same label as its article; this list was
        # previously declared but never filled.
        Title_label.append(tag)


In [45]:
import nltk
# Split every kept article body and every kept title into a list of token
# strings with NLTK's word tokenizer.
tokenized = list(map(nltk.word_tokenize, Eng_list))
tokenized_title = list(map(nltk.word_tokenize, Title_list))

In [46]:
from nltk.corpus import stopwords
from string import punctuation
from itertools import chain

def clean_text(tokenized_list, sw, punct, lemmatize=False):
    """Lower-case tokens and drop stopwords and punctuation.

    Parameters
    ----------
    tokenized_list : iterable of iterables of str
        One token sequence per document.
    sw : iterable of str
        Stopwords to remove (compared against the lower-cased token).
    punct : iterable of str
        Punctuation entries to remove; a plain string is treated as a
        collection of single characters.
    lemmatize : bool, optional
        Accepted for interface compatibility; the function does not use it.

    Returns
    -------
    list of list of str
        For each input document, its lower-cased tokens that are neither in
        ``sw`` nor in ``punct``, in their original order.
    """
    # Build the exclusion set once: rebuilding and scanning chain(punct, sw)
    # for every single token made each lookup linear in the stopword list.
    excluded = set(chain(punct, sw))

    cleaned_docs = []
    for doc in tokenized_list:
        lowered = (token.lower() for token in doc)
        cleaned_docs.append([token for token in lowered if token not in excluded])
    return cleaned_docs

# Lower-case every token and discard English stopwords and punctuation, for
# the article bodies and the titles alike.
punct = punctuation
sw = stopwords.words('english')
cleaned, cleaned_title = (
    clean_text(docs, sw, punct) for docs in (tokenized, tokenized_title)
)

In [47]:
import re
# Noise patterns removed from every document, compiled once and shared by the
# article bodies and the titles.
_MENTION_RE = re.compile(r'@([A-Za-z0-9_]+)')
_URL_RE = re.compile(r"(https|http)\S+")
_TYPOGRAPHIC_RE = re.compile(r"”|’|“|–|—")


def _strip_noise(doc):
    """Return ``str(doc)`` with @mentions, URLs and curly quotes/dashes removed.

    NOTE(review): ``doc`` is a token list at this point, so ``str(doc)`` is the
    list's repr (brackets, quotes and commas included). That is what the
    original cell produced and what the later spaCy step consumes, so it is
    kept as-is here.
    """
    flat = str(doc)
    for pattern in (_MENTION_RE, _URL_RE, _TYPOGRAPHIC_RE):
        flat = pattern.sub("", flat)
    return flat


# Slice assignment keeps the original in-place update of both lists.
cleaned[:] = [_strip_noise(doc) for doc in cleaned]
cleaned_title[:] = [_strip_noise(doc) for doc in cleaned_title]

In [48]:
import numpy as np
# Encode the labels as integers: 'FAKE' -> 0, anything else -> 1.
# The label lists are converted to arrays first: comparing a plain Python list
# with a string evaluates to a single False, which made np.where return one
# lone 1 instead of one value per document.
num_label = np.where(np.asarray(Text_label, dtype=object) == 'FAKE', 0, 1)
# Stored under its own name so the title labels no longer overwrite num_label.
num_label_title = np.where(np.asarray(Title_label, dtype=object) == 'FAKE', 0, 1)

In [49]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Extend spaCy's punctuation flags with symbols that should be filtered out.
for symbol in ("$", "|", "+", "<", ">", "=", "^", "`"):
    nlp.vocab[symbol].is_punct = True

# Extra entries flagged as stopwords. The entry "t" carried the original
# author's note "change to not" -- intent unclear, left unchanged.
for word in ("~", "s", "t"):
    nlp.vocab[word].is_stop = True

In [50]:
# Predicate used to drop uninformative tokens; extend it to change what the
# lemmatization step filters out.
def process_txt(token):
    """Tell whether ``token`` should be discarded.

    A token is discarded when any of its ``is_punct``, ``is_space``,
    ``is_stop`` or ``like_num`` attributes is truthy. The attributes are
    checked in that order and checking stops at the first truthy one.
    """
    structural_noise = token.is_punct or token.is_space
    return structural_noise or token.is_stop or token.like_num

In [51]:
# Turn an iterable of article strings into a nested list of lemma tokens.
def lemmatize_txt(array, batch_size=50):
    """Lemmatize every document in ``array`` with the module-level ``nlp``.

    Parameters
    ----------
    array : iterable of str
        Documents to process.
    batch_size : int, optional
        Number of documents spaCy buffers per batch (default 50, the value
        that used to be hard-coded).

    Returns
    -------
    list
        One entry per input document, in order: the list of ``lemma_`` strings
        of the tokens for which ``process_txt`` is falsy, or ``None`` when the
        document carries no dependency parse.
    """
    lemma = []

    # ``n_threads`` is no longer passed: spaCy deprecated it in v2.1 and the
    # v3 ``pipe`` signature does not accept it.
    for doc in nlp.pipe(array, batch_size=batch_size):
        # spaCy 3 exposes Doc.has_annotation("DEP"); older releases only have
        # the Doc.is_parsed flag, so fall back to it when needed.
        has_annotation = getattr(doc, "has_annotation", None)
        parsed = has_annotation("DEP") if has_annotation is not None else doc.is_parsed

        if parsed:
            lemma.append([n.lemma_ for n in doc if not process_txt(n)])
        else:
            # Keep a placeholder so results stay aligned with the inputs.
            lemma.append(None)

    return lemma

In [52]:
# Lemmatize the article bodies first, then the titles; each result is a nested
# list with one token list per document.
lem, lem_title = map(lemmatize_txt, (cleaned, cleaned_title))

In [53]:
# Flatten each document's lemma list into a single ', '-separated string.
# lemmatize_txt() stores None for documents without a parse; those become
# empty strings here instead of crashing str.join with a TypeError, and the
# output stays aligned with the label lists.
newLem = [', '.join(doc or []) for doc in lem]
newLem_title = [', '.join(doc or []) for doc in lem_title]

In [54]:
# Spot-check one flattened, lemmatized title.
newLem_title[1]

'watch, exact, moment, paul, ryan, commit, political, suicide, trump, rally, video'

# Preprocessing the text using TF-IDF

In [55]:
# Number of documents that survived cleaning.
len(newLem)

6008

In [56]:
#split data into training set and test set
from sklearn.model_selection import train_test_split

# Features are the flattened lemma strings; targets are their FAKE/REAL labels.
X = newLem
Y = Text_label
# Stratified 67/33 split. A fixed random_state makes the split, and therefore
# the scores reported below, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, stratify=Y, random_state=42
)

In [57]:
# Build TF-IDF features: the vectorizer is fitted on the training split only
# and then reused, unchanged, to transform the test split.
from sklearn.feature_extraction.text import TfidfVectorizer
tdf = TfidfVectorizer()
x_tfidf_train = tdf.fit_transform(x_train)
x_tfidf_test = tdf.transform(x_test)

In [58]:
#assess this model,using naive bayes
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features,
# then print its score on the training split followed by the test split.
tdf_model = MultinomialNB()
tdf_model.fit(x_tfidf_train, y_train)
train_score, test_score = (
    tdf_model.score(features, targets)
    for features, targets in ((x_tfidf_train, y_train), (x_tfidf_test, y_test))
)
print(train_score, test_score, sep="\n")

0.8877018633540372
0.8204740292486132


# Preprocessing the text using doc2vec

In [49]:
#build the dictionary and corpus
from gensim import corpora

# gensim expects every document as a list of tokens. `cleaned` holds one flat
# string per article after the regex clean-up, and wrapping it in another list
# made the whole corpus look like a single document whose "tokens" were entire
# articles. Use the per-article lemma lists instead (None -> empty document).
doc_tokens = [tokens or [] for tokens in lem]
dictionary = corpora.Dictionary(doc_tokens)
corpus = [dictionary.doc2bow(tokens) for tokens in doc_tokens]

In [50]:
#Build a vocabulary
from gensim.models.doc2vec import TaggedDocument
# One TaggedDocument per article, tagged with the article's position. The
# words must be a list of tokens; `cleaned` holds plain strings at this point,
# so the lemma lists are used (None -> empty document).
vocabulary = [TaggedDocument(tokens or [], tags=[idx]) for idx, tokens in enumerate(lem)]

In [51]:
from gensim.models import Doc2Vec
doc2vec = Doc2Vec(vector_size=300, window=5, min_count=5, dm = 1, epochs=10)
doc2vec.build_vocab(vocabulary)

# Train the d2v model, reusing the epoch count configured on the model.
doc2vec.train(vocabulary, epochs=doc2vec.epochs, total_examples=doc2vec.corpus_count)

# Build an (n_documents, vector_size) matrix that a classifier can consume.
# infer_vector() takes the document's token list; the previous call passed a
# one-element list containing the whole article string, i.e. a single
# out-of-vocabulary "word".
doc2vec_list = np.zeros((len(lem), doc2vec.vector_size))
for i, tokens in enumerate(lem):
    doc2vec_list[i] = doc2vec.infer_vector(tokens or [])


In [52]:
X = doc2vec_list
# doc2vec_list has one row per article kept by the English filter, so the
# targets must be the filtered labels. The raw `label` column still contains
# the dropped rows and does not line up with X.
Y = Text_label
# Fixed random_state so the stratified split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, stratify=Y, random_state=42
)