In [1]:
import pandas as pd
import sklearn
import numpy as np
import os
import random
import csv
import sys

# Largest native integer on this platform; defined here but not used by the limit below.
maxInt = sys.maxsize
# Raise the maximum field size the csv parser will accept.
# NOTE(review): presumably needed because the corpus is parsed with engine='python'
# and individual article bodies can be very long — confirm.
csv.field_size_limit(100000000)

# Seed every global random number generator the notebook has imported. Seeding only
# Python's `random` leaves NumPy's global generator (which numpy/scikit-learn code
# falls back on when no explicit random_state is given) unseeded.
seed = 2
random.seed(seed)
np.random.seed(seed)

In [3]:
# Abandoned first attempt: loading the CSV with dask ran into parsing problems, so it is left commented out for reference.
# import dask
# import dask.dataframe as dd
# ddf = dd.read_csv('./work/Datasets/FakeNewsCorpus/news_cleaned_2018_02_13.csv', engine='python', encoding='utf-8', error_bad_lines=False, header=0, dtype={'type': str, 'content': str,'authors': str}, usecols=['type', 'content', 'authors'])

# ddf = ddf[(ddf['type'] == 'fake') | (ddf['type'] == 'reliable')]

# ddf.to_parquet('./work/cleaned_data_parquet')
# print("DONE")

In [7]:
# Extract out only fake and reliable labelled data.
# The source CSV is read in chunks of 50,000 rows; only the 'type', 'content' and
# 'authors' columns are loaded, and malformed lines are skipped rather than raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines='skip'` — switch once the environment's pandas version allows it.

csv_iter = pd.read_csv('./work/Datasets/FakeNewsCorpus/news_cleaned_2018_02_13.csv', engine='python', iterator=True, chunksize=50000, encoding='utf-8', header=0, error_bad_lines=False, dtype={'type': str, 'content': str,'authors': str}, usecols=['type', 'content', 'authors'])

# Open the destination once, in write mode, so that re-running this cell replaces the
# file instead of appending a second copy of every row. The path is the one the
# following cell reads back. newline='' is what pandas asks for when it is handed a
# text-mode file object, and the explicit encoding avoids depending on the locale.
with open('./work/Datasets/FakeNewsCorpus/cleaned_data2.csv', 'w', encoding='utf-8', newline='') as f:
    for single_df in csv_iter:
        # Keep only rows labelled 'fake' or 'reliable'; rows with a missing label are dropped too.
        single_df = single_df[single_df['type'].isin(['fake', 'reliable'])]
        # No header row is written; the DataFrame index is emitted as the first column.
        single_df.to_csv(f, header=False, columns=['type', 'content', 'authors'])

print("DONE")

DONE


In [2]:
# Load the filtered rows; column names are supplied explicitly through `names`.
temp_df = pd.read_csv(
    './work/Datasets/FakeNewsCorpus/cleaned_data2.csv',
    names=['id', 'type', 'content', 'authors'],
    engine='python',
)
# Show the first ten rows as a sanity check.
temp_df.head(n=10)

Unnamed: 0,id,type,content,authors
0,27,fake,Headline: Bitcoin & Blockchain Searches Exceed...,The Pirate'S Cove
1,28,fake,Water Cooler 1/25/18 Open Thread; Fake News ? ...,
2,29,fake,Veteran Commentator Calls Out the Growing “Eth...,
3,30,fake,"Lost Words, Hidden Words, Otters, Banks and Bo...",Jackie Morris Artist
4,31,fake,Red Alert: Bond Yields Are SCREAMING “Inflatio...,Phoenix Capital Research
5,32,fake,Scientists move Doomsday Clock ahead by 30 sec...,Desdemona Despair
6,33,fake,Why Sandwiches Must Be Banned\n\n% of readers ...,
7,34,fake,Poll: Calls for War From Israelis and Palestin...,
8,42,reliable,"Facebook/DestinyTheGame Promo image for ""Desti...",
9,58,fake,Every college basketball fan knows all too wel...,"Avid Sports Fan, Writer. Huge Fan Of The Oriol..."


In [9]:
# Discard every row that has a missing value in any column, then summarise what remains.
# NOTE(review): this also removes rows whose only gap is `authors`, while the models
# later in this file train on `content` and `type` only — confirm that is intended.
temp_df = temp_df.dropna(axis=0, how='any')
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1735772 entries, 0 to 2807967
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   id       int64 
 1   type     object
 2   content  object
 3   authors  object
dtypes: int64(1), object(3)
memory usage: 66.2+ MB


In [10]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    temp_df,
    test_size=0.3,
    random_state=42,
)

# Persist both partitions (index column included) alongside the source corpus.
for split_frame, split_path in (
    (train_df, './work/Datasets/FakeNewsCorpus/train_data.csv'),
    (test_df, './work/Datasets/FakeNewsCorpus/test_data.csv'),
):
    split_frame.to_csv(split_path)
print("Finished saving training and testing data")

Finished saving training and testing data


In [11]:
# Preview the first ten rows of the training partition.
train_df.head(n=10)

Unnamed: 0,id,type,content,authors
1969462,7690584,reliable,"“On this issue, Democrats in the Senate start ...","Jeff Zeleny, Carl Hulse"
2790161,8511283,reliable,"Photo\n\nFor nearly a decade, Lindsey Vonn has...",Bill Pennington
945251,6666361,reliable,Chris Paul says his right shoulder is fine; he...,Broderick Turner
1984902,7706024,reliable,Elephantiasis causes one’s legs to become grot...,Nicholas Kristof
2564582,8285704,reliable,"What they get, he went on, is “depth of flavor...",Jeff Gordinier
198168,2208918,fake,Athens: Fights Erupt Between Hoodies and Stude...,Wired Greek
2301661,8022783,reliable,Ronni Favors and Bill Colavito were married Sa...,Rosalie R. Radomsky
2666162,8387284,reliable,"These incidents, and too many other serious in...",Ethan Bregman
1073174,6794286,reliable,"ABC News' ""The Final Sprint"" features Tom Llam...",Abc News
2739691,8460813,reliable,But her journey doesn’t only reflect the advan...,Frank Bruni


In [12]:
# Preview the first ten rows of the test partition.
test_df.head(n=10)

Unnamed: 0,id,type,content,authors
1833036,7554158,reliable,"WASHINGTON, March 2 - Leaders of the House Int...",Eric Lichtblau
483060,3883047,fake,(Before It's News)\n\nThe Turkish government s...,Reason Magazine
2331574,8052696,reliable,Extra miles are also part of her nonwriting li...,Laurel Graeber
2100108,7821230,reliable,The gymnastics chickens still may come home to...,"George Vecsey, Sports Of The Times"
79038,536725,fake,\n\nIt is unclear how many undocumented immigr...,Retro Housewife
41116,80534,fake,Is There Something Else Going-On Many of Us ha...,Colonel Sixx
164924,1668017,fake,This Congressional Hearing Will Blow You Away!...,"Patriarchate Of Moscow, All Russia"
345226,3202369,fake,Incredible! Here's Why the Defecting NK Soldie...,Wels - Through My Bible On Streams
513393,4004240,fake,This Week in Guns 104 – Handgun Rights Victory...,Firearms Radio Network
82518,654243,fake,Stirling’s Formula!\n\nHeadline: Bitcoin & Blo...,Bradley J Roth


# NAIVE BAYES

In [13]:
# Features are the article text ('content'); labels are the 'type' column.
X_train, Y_train = train_df['content'], train_df['type']
X_test, Y_test = test_df['content'], test_df['type']

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [15]:
# Two-stage text classifier: TF-IDF features feeding a multinomial naive Bayes model.
text_classification_model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('bayes', MultinomialNB()),
    ]
)

In [16]:
# Fit the vectoriser and the classifier on the training split; the result is assigned
# back to the same name, as before.
text_classification_model = text_classification_model.fit(X=X_train, y=Y_train)

In [17]:
# Predict labels for the held-out articles.
predicted = text_classification_model.predict(X=X_test)
# Accuracy: the fraction of test rows whose predicted label equals the true label.
(predicted == Y_test).mean()

0.8802589431799851

# SVM

In [18]:
from sklearn.linear_model import SGDClassifier

# Same TF-IDF front end, followed by an SGD-trained linear classifier with hinge loss
# and an L2 penalty; random_state is fixed so training is repeatable.
tc_model_svm = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
    ]
)
tc_model_svm = tc_model_svm.fit(X=X_train, y=Y_train)

In [19]:
# Predict labels for the held-out articles with the SVM pipeline.
predicted_svm = tc_model_svm.predict(X=X_test)
# Accuracy: the fraction of test rows whose predicted label equals the true label.
(predicted_svm == Y_test).mean()

0.9114515720178518