# About

dataset : 
https://www.kaggle.com/anthonyc1/gathering-real-news-for-oct-dec-2016
https://www.kaggle.com/mrisdal/fake-news

After Day 6 and 7, testing extracted features and tuning parameters.

In [82]:
# In this project, we are going to classify the news into fake and real
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# tfidf for feature extraction from text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import ML models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

#  Metrices for model evalaution 
from sklearn import metrics
from sklearn.metrics import classification_report , accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

# save model and intermedicate data
from sklearn.externals import joblib

In [83]:
df = pd.read_csv(r'dataset/news_dataset.csv')

In [84]:
print(df.shape)
print("\n\n")
print(df.columns)
print("\n\n")
df.head(5)

(28711, 5)



Index(['Unnamed: 0', 'title', 'content', 'publication', 'label'], dtype='object')





Unnamed: 0.1,Unnamed: 0,title,content,publication,label
0,0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,100percentfedup,fake
1,1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,100percentfedup,fake
2,2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,100percentfedup,fake
3,3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,100percentfedup,fake
4,4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,100percentfedup,fake


In [85]:
df.drop(columns="Unnamed: 0", axis=1, inplace=True)

In [86]:
df.columns

Index(['title', 'content', 'publication', 'label'], dtype='object')

In [87]:
df.isnull().sum().tolist()

[680, 46, 0, 0]

In [88]:
df = df.dropna(how= "any")

In [89]:
# merge title and content for further text preprocessing
df.title = df.title.astype(str)
df.content = df.content.astype(str)

df['newstext'] = df[ ['title' , 'content'] ].apply( lambda t : ' '.join( t ) , axis = 1)

In [9]:
df.newstext.head(15)

0     Muslims BUSTED: They Stole Millions In Gov’t B...
1     Re: Why Did Attorney General Loretta Lynch Ple...
2     BREAKING: Weiner Cooperating With FBI On Hilla...
3     PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...
4     FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...
5     Hillary Goes Absolutely Berserk On Protester A...
6     BREAKING! NYPD Ready To Make Arrests In Weiner...
7     WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...
8     BREAKING: CLINTON CLEARED...Was This A Coordin...
9     EVIL HILLARY SUPPORTERS Yell "F*ck Trump"…Burn...
10    YIKES! HILLARY GOES OFF THE RAILS…Pulls A Howa...
11    SAY GOODBYE! These 23 Hollywood Celebs Threate...
12    NOT KIDDING: Colleges Give Students “Safe Spac...
13    BOOM! MATH SHOWS Trump Would Have Beaten Obama...
14    BOOM! This Is How President Reagan Handled Pro...
Name: newstext, dtype: object

In [90]:
# drop content and title
df.drop(columns= "title", axis=1, inplace=True)
df.drop(columns="content", axis=1, inplace=True)

In [91]:
df['newstext'] = df['newstext'].apply( lambda t : " ".join( t.split() ) )
df.head(15)

Unnamed: 0,publication,label,newstext
0,100percentfedup,fake,Muslims BUSTED: They Stole Millions In Gov’t B...
1,100percentfedup,fake,Re: Why Did Attorney General Loretta Lynch Ple...
2,100percentfedup,fake,BREAKING: Weiner Cooperating With FBI On Hilla...
3,100percentfedup,fake,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...
4,100percentfedup,fake,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...
5,100percentfedup,fake,Hillary Goes Absolutely Berserk On Protester A...
6,100percentfedup,fake,BREAKING! NYPD Ready To Make Arrests In Weiner...
7,100percentfedup,fake,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...
8,100percentfedup,fake,BREAKING: CLINTON CLEARED...Was This A Coordin...
9,100percentfedup,fake,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn..."


In [92]:
# feature extraction: number of UPPERCASE Letters
def uppercase_count(t):
    upper_list = []
    for word in t.split():
        if word.isupper():
            upper_list.append(word)
    return len(upper_list)

df['uppercase_count'] = df.newstext.apply(lambda t : uppercase_count(t) )
df.head(15)

Unnamed: 0,publication,label,newstext,uppercase_count
0,100percentfedup,fake,Muslims BUSTED: They Stole Millions In Gov’t B...,2
1,100percentfedup,fake,Re: Why Did Attorney General Loretta Lynch Ple...,2
2,100percentfedup,fake,BREAKING: Weiner Cooperating With FBI On Hilla...,4
3,100percentfedup,fake,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,12
4,100percentfedup,fake,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,14
5,100percentfedup,fake,Hillary Goes Absolutely Berserk On Protester A...,3
6,100percentfedup,fake,BREAKING! NYPD Ready To Make Arrests In Weiner...,32
7,100percentfedup,fake,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,14
8,100percentfedup,fake,BREAKING: CLINTON CLEARED...Was This A Coordin...,12
9,100percentfedup,fake,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",11


In [93]:
# Feature extraction : number of words from
df['word_count'] = df.newstext.apply( lambda t : len(str(t).split(" ") ) )
df.head(15)

Unnamed: 0,publication,label,newstext,uppercase_count,word_count
0,100percentfedup,fake,Muslims BUSTED: They Stole Millions In Gov’t B...,2,95
1,100percentfedup,fake,Re: Why Did Attorney General Loretta Lynch Ple...,2,287
2,100percentfedup,fake,BREAKING: Weiner Cooperating With FBI On Hilla...,4,233
3,100percentfedup,fake,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,12,69
4,100percentfedup,fake,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,14,346
5,100percentfedup,fake,Hillary Goes Absolutely Berserk On Protester A...,3,192
6,100percentfedup,fake,BREAKING! NYPD Ready To Make Arrests In Weiner...,32,989
7,100percentfedup,fake,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,14,400
8,100percentfedup,fake,BREAKING: CLINTON CLEARED...Was This A Coordin...,12,228
9,100percentfedup,fake,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",11,152


In [94]:
#  Feature : Average word length for content
def avg_word_length(t):
    words = t.split()
    return ( sum( len(word) for word in words ) / len(words))

df['avg_word_len'] = df.newstext.apply( lambda t : avg_word_length(t) )
df.head(15)

Unnamed: 0,publication,label,newstext,uppercase_count,word_count,avg_word_len
0,100percentfedup,fake,Muslims BUSTED: They Stole Millions In Gov’t B...,2,95,4.873684
1,100percentfedup,fake,Re: Why Did Attorney General Loretta Lynch Ple...,2,287,5.432056
2,100percentfedup,fake,BREAKING: Weiner Cooperating With FBI On Hilla...,4,233,5.128755
3,100percentfedup,fake,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,12,69,4.753623
4,100percentfedup,fake,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,14,346,5.32948
5,100percentfedup,fake,Hillary Goes Absolutely Berserk On Protester A...,3,192,5.84375
6,100percentfedup,fake,BREAKING! NYPD Ready To Make Arrests In Weiner...,32,989,5.140546
7,100percentfedup,fake,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,14,400,5.705
8,100percentfedup,fake,BREAKING: CLINTON CLEARED...Was This A Coordin...,12,228,5.162281
9,100percentfedup,fake,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",11,152,4.697368


In [95]:
# text preprocessing : Lowercasing
df.newstext = df.newstext.apply( lambda t : ' '.join( word.lower() for word in t.split() ) )
df.newstext.head(5)

0    muslims busted: they stole millions in gov’t b...
1    re: why did attorney general loretta lynch ple...
2    breaking: weiner cooperating with fbi on hilla...
3    pin drop speech by father of daughter kidnappe...
4    fantastic! trump's 7 point plan to reform heal...
Name: newstext, dtype: object

In [96]:
# text preprocessing : punctuation removal
# keep only words and spaces
df.newstext = df.newstext.str.replace( "[^\w\s]" , "" )
df.newstext.head(5)

0    muslims busted they stole millions in govt ben...
1    re why did attorney general loretta lynch plea...
2    breaking weiner cooperating with fbi on hillar...
3    pin drop speech by father of daughter kidnappe...
4    fantastic trumps 7 point plan to reform health...
Name: newstext, dtype: object

In [97]:
# Remove stop words
# fetch stop words from NLTK corpus
from nltk.corpus import stopwords
stop = stopwords.words('english')

df.newstext = df.newstext.apply( lambda t: " ".join( word for word in t.split() if word not in stop ) )
df.newstext.head(5)

0    muslims busted stole millions govt benefits pr...
1    attorney general loretta lynch plead fifth att...
2    breaking weiner cooperating fbi hillary email ...
3    pin drop speech father daughter kidnapped kill...
4    fantastic trumps 7 point plan reform healthcar...
Name: newstext, dtype: object

In [98]:
# Remove frequent words
all_words = ' '.join(df.newstext ).split()

# let's keep the threshold of 20 K which almost equal to number of data instances in the dataset
freq_words = pd.Series(all_words).value_counts()[:30]

In [99]:
freq_words

said          92872
trump         88244
one           57036
would         57035
people        55999
new           44186
us            42986
clinton       42412
like          39191
also          37771
could         32478
time          32317
president     30375
even          29477
many          28690
years         28603
state         28529
election      28262
first         26434
two           24654
states        24639
hillary       23358
government    22313
world         22256
campaign      22087
get           21994
trumps        21089
way           20879
donald        20810
last          20518
dtype: int64

In [100]:
# remove freq_words
df.newstext = df.newstext.apply( lambda t: " ".join( word for word in t.split() if word not in freq_words)) 
df.newstext.head(5)

0    muslims busted stole millions govt benefits pr...
1    attorney general loretta lynch plead fifth att...
2    breaking weiner cooperating fbi email investig...
3    pin drop speech father daughter kidnapped kill...
4    fantastic 7 point plan reform healthcare begin...
Name: newstext, dtype: object

In [101]:
# remove rare words
all_words = " ".join( df.newstext ).split()
rare_words = pd.Series( all_words ).value_counts()[ -230000 : ]
rare_words.sort_values

<bound method Series.sort_values of rhino                   61
libyas                  61
sweater                 61
hotter                  61
erect                   61
fashioned               61
retaking                61
admirers                61
roaring                 61
workplaces              61
nauru                   61
lends                   61
vigilantes              61
charade                 61
prism                   61
tan                     61
memorably               61
jabin                   61
launchers               61
devaluation             61
232                     61
unrelenting             61
semblance               61
bikes                   61
coronation              61
albright                61
fluent                  61
unknowns                61
elevating               61
dicaprio                61
                        ..
17524                    1
skepsis                  1
effia                    1
witted                   1
abscessed          

In [102]:
# remove rare_words
df.newstext = df.newstext.apply( lambda t: " ".join( word for word in t.split() if word not in rare_words))
df.newstext.head(5)

0    muslims busted stole millions govt benefits pr...
1    attorney general loretta lynch plead fifth att...
2    breaking weiner cooperating fbi email investig...
3    pin drop speech father daughter kidnapped kill...
4    fantastic 7 point plan reform healthcare begin...
Name: newstext, dtype: object

In [103]:
# remove numbers
df.newstext = df.newstext.apply(lambda t: " ".join(word for word in t.split() if not word.isnumeric() ) )

In [104]:
# lemmatization
import nltk
nltk.download('wordnet')
from textblob import Word

df.newstext = df.newstext.apply( lambda t : " ".join([Word(word).lemmatize() for word in t.split()]))
df.newstext.head(5)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    muslim busted stole million govt benefit print...
1    attorney general loretta lynch plead fifth att...
2    breaking weiner cooperating fbi email investig...
3    pin drop speech father daughter kidnapped kill...
4    fantastic point plan reform healthcare begin b...
Name: newstext, dtype: object

In [105]:
df.publication.value_counts()

Reuters                       2307
NPR                           2014
Washington Post               1886
Guardian                      1880
CNN                           1624
Atlantic                      1558
New York Times                1428
Fox News                      1132
National Review                973
Vox                            910
presstv                        121
dcclothesline                  100
fellowshipoftheminds           100
thesaker                       100
awdnews                        100
godlikeproductions             100
abovetopsecret                 100
davidstockmanscontracorner     100
truthdig                       100
thedailymash                   100
disclose                       100
elmundotoday                   100
washingtonsblog                100
occupydemocrats                100
counterpunch                   100
madworldnews                   100
veteransnewsnow                100
mintpressnews                  100
thepoke             

In [106]:
# Frequency distribution to convert publication into numeric feature
publication_keys = df['publication'].drop_duplicates().values.tolist()
publication_keys.sort()
print("Actual values of publication \n")
#print(publication_keys)

# put count of each value of publication in a list
publication_values = df.publication.value_counts().sort_index().tolist()
print("\n publication value counts:-")
#print(publication_values)

# calculate frequency distribution
publication_values = [x / sum(publication_values) for x in publication_values]
print("\nFreq dist:-")
#print(publication_values)

publication_dict = dict(zip(publication_keys, publication_values))
print("\n publication's dictionary with actual values and their corresponding freq. dist.:-")
print(publication_dict)

# now map these freq dist valeus to the dataframe 
# replace values of publication with their freq dist
df['publication_num'] = df['publication'].map(publication_dict)
df.head(5)

Actual values of publication 


 publication value counts:-

Freq dist:-

 publication's dictionary with actual values and their corresponding freq. dist.:-
{'100percentfedup': 0.0011792031445417189, '21stcenturywire': 0.0008576022869394318, 'Atlantic': 0.05567268179381812, 'CNN': 0.058031088082901555, 'Fox News': 0.0404502412006432, 'Guardian': 0.06717884581025549, 'NPR': 0.07196712524566733, 'National Review': 0.03476862604966947, 'New York Times': 0.051027336072896196, 'Reuters': 0.08243701983205289, 'Vox': 0.03251742004645346, 'Washington Post': 0.06739324638199035, 'abcnews': 7.146685724495266e-05, 'abeldanger': 0.0007146685724495265, 'abovetopsecret': 0.003573342862247633, 'activistpost': 0.003573342862247633, 'addictinginfo': 0.0007146685724495265, 'adobochronicles': 0.00046453457209219226, 'ahtribune': 0.002394139717705914, 'allnewspipeline': 3.573342862247633e-05, 'americanlookout': 0.00028586742897981064, 'americannews': 7.146685724495266e-05, 'americasfreedomfighters': 0.000

Unnamed: 0,publication,label,newstext,uppercase_count,word_count,avg_word_len,publication_num
0,100percentfedup,fake,muslim busted stole million govt benefit print...,2,95,4.873684,0.001179
1,100percentfedup,fake,attorney general loretta lynch plead fifth att...,2,287,5.432056,0.001179
2,100percentfedup,fake,breaking weiner cooperating fbi email investig...,4,233,5.128755,0.001179
3,100percentfedup,fake,pin drop speech father daughter kidnapped kill...,12,69,4.753623,0.001179
4,100percentfedup,fake,fantastic point plan reform healthcare begin b...,14,346,5.32948,0.001179


In [107]:
# Convert our target -- label (categorical) into numeric format
df['label'] = df['label'].factorize()[0]
df['label'].value_counts()

# real    15712
# fake    12273

1    15712
0    12273
Name: label, dtype: int64

In [108]:
df.isnull().sum().tolist()

[0, 0, 0, 0, 0, 0, 0]

In [109]:
df.head(5)

Unnamed: 0,publication,label,newstext,uppercase_count,word_count,avg_word_len,publication_num
0,100percentfedup,0,muslim busted stole million govt benefit print...,2,95,4.873684,0.001179
1,100percentfedup,0,attorney general loretta lynch plead fifth att...,2,287,5.432056,0.001179
2,100percentfedup,0,breaking weiner cooperating fbi email investig...,4,233,5.128755,0.001179
3,100percentfedup,0,pin drop speech father daughter kidnapped kill...,12,69,4.753623,0.001179
4,100percentfedup,0,fantastic point plan reform healthcare begin b...,14,346,5.32948,0.001179


In [110]:
tfidf = TfidfVectorizer()
tfidf_features = tfidf.fit_transform( df.newstext ).toarray()
#tfidf_df = pd.DataFrame(features)

In [111]:
tfidf_features.shape

(27985, 14297)

In [112]:
extracted_features = df[[ 'publication_num', 'uppercase_count', 'word_count' , 'avg_word_len']]

In [113]:
extracted_features.shape

(27985, 4)

In [114]:
#final_features = pd.concat([features_df, other_features], axis=1)
#final_features.shape
#final_features.head(15)

In [115]:
#ind = final_features.index.tolist()

In [116]:
#ind = ind[27985:]
#print(len(ind))
#print(ind)

In [117]:
#final_features.drop(final_features.index[ind])

In [118]:
#final_features.isnull().sum().tolist()

In [119]:
#final_features.dropna(how="any")

In [120]:
#final_features.isnull().sum().tolist()

In [121]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split( tfidf_features , 
                                                                            df.label, test_size = 0.25,
                                                                            random_state = 0)

In [122]:
X_train_ex, X_test_ex, y_train_ex, y_test_ex = train_test_split( extracted_features , 
                                                                            df.label, test_size = 0.25,
                                                                            random_state = 0)

In [124]:
# training on tfidf

lr_tfidf = LogisticRegression(random_state=0)
lr_tfidf.fit(X_train_tfidf, y_train_tfidf)
print("Accuracy of LogisticRegression on TF-IDF = ")
print(lr_tfidf.score(X_test_tfidf, y_test_tfidf))
print("\n")


svm_tfidf = LinearSVC()
svm_tfidf.fit( X_train_tfidf, y_train_tfidf )
print("Accuracy of LinearSVC on TF-IDF = ")
print(svm_tfidf.score(X_test_tfidf, y_test_tfidf))




Accuracy of LogisticRegression on TF-IDF = 
0.9161069029584108


Accuracy of LinearSVC on TF-IDF = 
0.9328283550092897


In [125]:
# training on Extracted features
rf_ex = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
rf_ex.fit( X_train_ex, y_train_ex )
print("Accuracy of RandomForestClassifier on other extracted features = ")
print( rf_ex.score(X_test_ex, y_test_ex) )
print("\n")


lr_ex = LogisticRegression(random_state=0)
lr_ex.fit(X_train_ex, y_train_ex)
print("Accuracy of LogisticRegression on other extracted features = ")
print( lr_ex.score(X_test_ex, y_test_ex) )


Accuracy of RandomForestClassifier on other extracted features = 
1.0


Accuracy of LogisticRegression on other extracted features = 
0.9941403458625125


