# About

dataset : 
https://www.kaggle.com/anthonyc1/gathering-real-news-for-oct-dec-2016
https://www.kaggle.com/mrisdal/fake-news

This project we will classify news into one of the two categories -fake or real. It consist of following steps:

1) Feature Extraction from Text

2) Text Data Preprocessing

3) TF-IDF calculation

4) Cross validation for model evaluation

5) Model selection, saving the model for future predictions



In [185]:
# In this project, we are going to classify the news into fake and real
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# tfidf for feature extraction from text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import ML models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

#  Metrices for model evalaution 
from sklearn import metrics
from sklearn.metrics import classification_report , accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

# save model and intermedicate data
from sklearn.externals import joblib

In [192]:
df = pd.read_csv(r'dataset/news_dataset.csv')

In [193]:
print(df.shape)
print("\n\n")
print(df.columns)
print("\n\n")
print(df.head(5))

(28711, 5)



Index(['Unnamed: 0', 'title', 'content', 'publication', 'label'], dtype='object')



   Unnamed: 0                                              title  \
0           0  Muslims BUSTED: They Stole Millions In Gov’t B...   
1           1  Re: Why Did Attorney General Loretta Lynch Ple...   
2           2  BREAKING: Weiner Cooperating With FBI On Hilla...   
3           3  PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...   
4           4  FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...   

                                             content      publication label  
0  Print They should pay all the back all the mon...  100percentfedup  fake  
1  Why Did Attorney General Loretta Lynch Plead T...  100percentfedup  fake  
2  Red State : \nFox News Sunday reported this mo...  100percentfedup  fake  
3  Email Kayla Mueller was a prisoner and torture...  100percentfedup  fake  
4  Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...  100percentfedup  fake  


In [194]:
df.drop(columns="Unnamed: 0", axis=1, inplace=True)

In [195]:
df.columns

Index(['title', 'content', 'publication', 'label'], dtype='object')

In [196]:
df.isnull().sum().tolist()

[680, 46, 0, 0]

In [197]:
df = df.dropna(how= "any")

In [198]:
# merge title and content for further text preprocessing
df.title = df.title.astype(str)
df.content = df.content.astype(str)

df['newstext'] = df[ ['title' , 'content'] ].apply( lambda t : ' '.join( t ) , axis = 1)

In [114]:
df.newstext.head(15)

0     Muslims BUSTED: They Stole Millions In Gov’t B...
1     Re: Why Did Attorney General Loretta Lynch Ple...
2     BREAKING: Weiner Cooperating With FBI On Hilla...
3     PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...
4     FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...
5     Hillary Goes Absolutely Berserk On Protester A...
6     BREAKING! NYPD Ready To Make Arrests In Weiner...
7     WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...
8     BREAKING: CLINTON CLEARED...Was This A Coordin...
9     EVIL HILLARY SUPPORTERS Yell "F*ck Trump"…Burn...
10    YIKES! HILLARY GOES OFF THE RAILS…Pulls A Howa...
11    SAY GOODBYE! These 23 Hollywood Celebs Threate...
12    NOT KIDDING: Colleges Give Students “Safe Spac...
13    BOOM! MATH SHOWS Trump Would Have Beaten Obama...
14    BOOM! This Is How President Reagan Handled Pro...
Name: newstext, dtype: object

In [199]:
# drop content and title
df.drop(columns= "title", axis=1, inplace=True)
df.drop(columns="content", axis=1, inplace=True)

In [200]:
df['newstext'] = df['newstext'].apply( lambda t : " ".join( t.split() ) )
df.head(15)

Unnamed: 0,publication,label,newstext
0,100percentfedup,fake,Muslims BUSTED: They Stole Millions In Gov’t B...
1,100percentfedup,fake,Re: Why Did Attorney General Loretta Lynch Ple...
2,100percentfedup,fake,BREAKING: Weiner Cooperating With FBI On Hilla...
3,100percentfedup,fake,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...
4,100percentfedup,fake,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...
5,100percentfedup,fake,Hillary Goes Absolutely Berserk On Protester A...
6,100percentfedup,fake,BREAKING! NYPD Ready To Make Arrests In Weiner...
7,100percentfedup,fake,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...
8,100percentfedup,fake,BREAKING: CLINTON CLEARED...Was This A Coordin...
9,100percentfedup,fake,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn..."


In [201]:
# feature extraction: number of UPPERCASE Letters

def uppercase_count(t):
    upper_list = []
    for word in t.split():
        if word.isupper():
            upper_list.append(word)
    return len(upper_list)


df['uppercase_count'] = df.newstext.apply(lambda t : uppercase_count(t) )
df.head(15)


Unnamed: 0,publication,label,newstext,uppercase_count
0,100percentfedup,fake,Muslims BUSTED: They Stole Millions In Gov’t B...,2
1,100percentfedup,fake,Re: Why Did Attorney General Loretta Lynch Ple...,2
2,100percentfedup,fake,BREAKING: Weiner Cooperating With FBI On Hilla...,4
3,100percentfedup,fake,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,12
4,100percentfedup,fake,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,14
5,100percentfedup,fake,Hillary Goes Absolutely Berserk On Protester A...,3
6,100percentfedup,fake,BREAKING! NYPD Ready To Make Arrests In Weiner...,32
7,100percentfedup,fake,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,14
8,100percentfedup,fake,BREAKING: CLINTON CLEARED...Was This A Coordin...,12
9,100percentfedup,fake,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",11


In [202]:
# Feature extraction : number of words from
df['word_count'] = df.newstext.apply( lambda t : len(str(t).split(" ") ) )
df.head(15)


Unnamed: 0,publication,label,newstext,uppercase_count,word_count
0,100percentfedup,fake,Muslims BUSTED: They Stole Millions In Gov’t B...,2,95
1,100percentfedup,fake,Re: Why Did Attorney General Loretta Lynch Ple...,2,287
2,100percentfedup,fake,BREAKING: Weiner Cooperating With FBI On Hilla...,4,233
3,100percentfedup,fake,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,12,69
4,100percentfedup,fake,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,14,346
5,100percentfedup,fake,Hillary Goes Absolutely Berserk On Protester A...,3,192
6,100percentfedup,fake,BREAKING! NYPD Ready To Make Arrests In Weiner...,32,989
7,100percentfedup,fake,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,14,400
8,100percentfedup,fake,BREAKING: CLINTON CLEARED...Was This A Coordin...,12,228
9,100percentfedup,fake,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",11,152


In [203]:
#  Feature : Average word length for content
def avg_word_length(t):
    words = t.split()
    return ( sum( len(word) for word in words ) / len(words))

df['avg_word_len'] = df.newstext.apply( lambda t : avg_word_length(t) )
df.head(15)

Unnamed: 0,publication,label,newstext,uppercase_count,word_count,avg_word_len
0,100percentfedup,fake,Muslims BUSTED: They Stole Millions In Gov’t B...,2,95,4.873684
1,100percentfedup,fake,Re: Why Did Attorney General Loretta Lynch Ple...,2,287,5.432056
2,100percentfedup,fake,BREAKING: Weiner Cooperating With FBI On Hilla...,4,233,5.128755
3,100percentfedup,fake,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,12,69,4.753623
4,100percentfedup,fake,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,14,346,5.32948
5,100percentfedup,fake,Hillary Goes Absolutely Berserk On Protester A...,3,192,5.84375
6,100percentfedup,fake,BREAKING! NYPD Ready To Make Arrests In Weiner...,32,989,5.140546
7,100percentfedup,fake,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,14,400,5.705
8,100percentfedup,fake,BREAKING: CLINTON CLEARED...Was This A Coordin...,12,228,5.162281
9,100percentfedup,fake,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",11,152,4.697368


In [204]:
# text preprocessing : Lowercasing
df.newstext = df.newstext.apply( lambda t : ' '.join( word.lower() for word in t.split() ) )
df.newstext.head(5)

0    muslims busted: they stole millions in gov’t b...
1    re: why did attorney general loretta lynch ple...
2    breaking: weiner cooperating with fbi on hilla...
3    pin drop speech by father of daughter kidnappe...
4    fantastic! trump's 7 point plan to reform heal...
Name: newstext, dtype: object

In [205]:
# text preprocessing : punctuation removal
# keep only words and spaces
df.newstext = df.newstext.str.replace( "[^\w\s]" , "" )
df.newstext.head(5)

0    muslims busted they stole millions in govt ben...
1    re why did attorney general loretta lynch plea...
2    breaking weiner cooperating with fbi on hillar...
3    pin drop speech by father of daughter kidnappe...
4    fantastic trumps 7 point plan to reform health...
Name: newstext, dtype: object

In [206]:
# Remove stop words
# fetch stop words from NLTK corpus
from nltk.corpus import stopwords
stop = stopwords.words('english')

df.newstext = df.newstext.apply( lambda t: " ".join( word for word in t.split() if word not in stop ) )
df.newstext.head(5)

0    muslims busted stole millions govt benefits pr...
1    attorney general loretta lynch plead fifth att...
2    breaking weiner cooperating fbi hillary email ...
3    pin drop speech father daughter kidnapped kill...
4    fantastic trumps 7 point plan reform healthcar...
Name: newstext, dtype: object

In [207]:
# Remove frequent words
all_words = ' '.join(df.newstext ).split()

# let's keep the threshold of 28 K which almost equal to number of data instances in the dataset
freq_words = pd.Series(all_words).value_counts()[:20]

In [208]:
freq_words

said         92872
trump        88244
one          57036
would        57035
people       55999
new          44186
us           42986
clinton      42412
like         39191
also         37771
could        32478
time         32317
president    30375
even         29477
many         28690
years        28603
state        28529
election     28262
first        26434
two          24654
dtype: int64

In [209]:
# remove freq_words
df.newstext = df.newstext.apply( lambda t: " ".join( word for word in t.split() if word not in freq_words)) 
df.newstext.head(5)

0    muslims busted stole millions govt benefits pr...
1    attorney general loretta lynch plead fifth att...
2    breaking weiner cooperating fbi hillary email ...
3    pin drop speech father daughter kidnapped kill...
4    fantastic trumps 7 point plan reform healthcar...
Name: newstext, dtype: object

In [210]:
# remove rare words
all_words = " ".join( df.newstext ).split()
rare_words = pd.Series( all_words ).value_counts()[ -200000 : ]
rare_words.sort_values

<bound method Series.sort_values of lucys                                            10
brusatte                                         10
equivocal                                        10
crivella                                         10
disgustingly                                     10
objetivos                                        10
benzema                                          10
gonidakis                                        10
depinto                                          10
inessa                                           10
maup                                             10
unifies                                          10
estadio                                          10
empirically                                      10
subfreezing                                      10
glowingly                                        10
ukbased                                          10
26000                                            10
relinquishing               

In [211]:
# remove rare_words
df.newstext = df.newstext.apply( lambda t: " ".join( word for word in t.split() if word not in rare_words))
df.newstext.head(5)

0    muslims busted stole millions govt benefits pr...
1    attorney general loretta lynch plead fifth att...
2    breaking weiner cooperating fbi hillary email ...
3    pin drop speech father daughter kidnapped kill...
4    fantastic trumps 7 point plan reform healthcar...
Name: newstext, dtype: object

In [212]:
# remove numbers
df.newstext = df.newstext.apply(lambda t: " ".join(word for word in t.split() if not word.isnumeric() ) )

In [213]:
# lemmatization
import nltk
nltk.download('wordnet')
from textblob import Word

df.newstext = df.newstext.apply( lambda t : " ".join([Word(word).lemmatize() for word in t.split()]))
df.newstext.head(5)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    muslim busted stole million govt benefit print...
1    attorney general loretta lynch plead fifth att...
2    breaking weiner cooperating fbi hillary email ...
3    pin drop speech father daughter kidnapped kill...
4    fantastic trump point plan reform healthcare b...
Name: newstext, dtype: object

In [214]:
df.publication.value_counts()

Reuters                    2307
NPR                        2014
Washington Post            1886
Guardian                   1880
CNN                        1624
Atlantic                   1558
New York Times             1428
Fox News                   1132
National Review             973
Vox                         910
presstv                     121
trueactivist                100
dcclothesline               100
awdnews                     100
thesaker                    100
ifyouonlynews               100
libertyunyielding           100
veteranstoday               100
jewsnews                    100
pravda                      100
trunews                     100
wnd                         100
theeventchronicle           100
shiftfrequency              100
thetruthseeker              100
amren                       100
opednews                    100
pakalertpress               100
abovetopsecret              100
pravdareport                100
                           ... 
reductre

In [215]:
# Frequency distribution to convert publication into numeric feature
publication_keys = df['publication'].drop_duplicates().values.tolist()
publication_keys.sort()
print("Actual values of publication \n")
print(publication_keys)

# put count of each value of publication in a list
publication_values = df.publication.value_counts().sort_index().tolist()
print("\n publication value counts:-")
print(publication_values)

# calculate frequency distribution
publication_values = [x / sum(publication_values) for x in publication_values]
print("\nFreq dist:-")
print(publication_values)

publication_dict = dict(zip(publication_keys, publication_values))
print("\n publication's dictionary with actual values and their corresponding freq. dist.:-")
print(publication_dict)

# now map these freq dist valeus to the dataframe 
# replace values of publication with their freq dist
df['publication_num'] = df['publication'].map(publication_dict)
df.head(5)

Actual values of publication 

['100percentfedup', '21stcenturywire', 'Atlantic', 'CNN', 'Fox News', 'Guardian', 'NPR', 'National Review', 'New York Times', 'Reuters', 'Vox', 'Washington Post', 'abcnews', 'abeldanger', 'abovetopsecret', 'activistpost', 'addictinginfo', 'adobochronicles', 'ahtribune', 'allnewspipeline', 'americanlookout', 'americannews', 'americasfreedomfighters', 'amren', 'amtvmedia', 'anonews', 'anonhq', 'antiwar', 'awdnews', 'barenakedislam', 'beforeitsnews', 'betootaadvocate', 'bigbluevision', 'bignuggetnews', 'bipartisanreport', 'blackagendareport', 'blacklistednews', 'breitbart', 'christiantimesnewspaper', 'chronicle', 'clickhole', 'coasttocoastam', 'collective-evolution', 'consciouslifenews', 'conservativedailypost', 'conservativetribune', 'consortiumnews', 'corbettreport', 'countercurrents', 'counterpunch', 'dailysquib', 'dailystormer', 'dailywire', 'darkmoon', 'davidduke', 'davidstockmanscontracorner', 'davidwolfe', 'dcclothesline', 'defenddemocracy', 'dennismi

Unnamed: 0,publication,label,newstext,uppercase_count,word_count,avg_word_len,publication_num
0,100percentfedup,fake,muslim busted stole million govt benefit print...,2,95,4.873684,0.001179
1,100percentfedup,fake,attorney general loretta lynch plead fifth att...,2,287,5.432056,0.001179
2,100percentfedup,fake,breaking weiner cooperating fbi hillary email ...,4,233,5.128755,0.001179
3,100percentfedup,fake,pin drop speech father daughter kidnapped kill...,12,69,4.753623,0.001179
4,100percentfedup,fake,fantastic trump point plan reform healthcare b...,14,346,5.32948,0.001179


In [216]:
# Convert our target -- label (categorical) into numeric format
df['label'] = df['label'].factorize()[0]
df['label'].value_counts()

# real    15712
# fake    12999

1    15712
0    12273
Name: label, dtype: int64

In [217]:
df.head(5)

Unnamed: 0,publication,label,newstext,uppercase_count,word_count,avg_word_len,publication_num
0,100percentfedup,0,muslim busted stole million govt benefit print...,2,95,4.873684,0.001179
1,100percentfedup,0,attorney general loretta lynch plead fifth att...,2,287,5.432056,0.001179
2,100percentfedup,0,breaking weiner cooperating fbi hillary email ...,4,233,5.128755,0.001179
3,100percentfedup,0,pin drop speech father daughter kidnapped kill...,12,69,4.753623,0.001179
4,100percentfedup,0,fantastic trump point plan reform healthcare b...,14,346,5.32948,0.001179


In [218]:
tfidf = TfidfVectorizer()
features = tfidf.fit_transform( df.newstext ).toarray()
features_df = pd.DataFrame(features)

In [None]:
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# define cross validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, df.label, scoring='accuracy', cv=CV)
    
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df.groupby('model_name').accuracy.mean()


In [177]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.670954
1,RandomForestClassifier,1,0.672026
2,RandomForestClassifier,2,0.687869
3,RandomForestClassifier,3,0.674768
4,RandomForestClassifier,4,0.646355
5,LinearSVC,0,0.901572
6,LinearSVC,1,0.878528
7,LinearSVC,2,0.880293
8,LinearSVC,3,0.922445
9,LinearSVC,4,0.901358


In [178]:
# save 5 - cross val score of all the models for model evaluation
joblib.dump( cv_df, r'Day 5-6-7/5-cross-val-score' )

['Day 5-6-7/5-cross-val-score']

In [165]:
# As two models are performing way better than others:
# LinearSVC -89.7 %, LogisticRegression - 88.6%
# let's dig deeper and see the Precision, Recall, F1 Score
X_train, X_test, y_train, y_test = train_test_split( features, df.label, test_size = 0.25, random_state = 0)

In [169]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

print("Accuracy of LogisticRegression = ")
print(lr.score( X_test, y_test ))
print("\n")

y_pred_lr = lr.predict(X_test)
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred_lr))
print("\n")
print(classification_report(y_test, y_pred_lr))



Accuracy of LogisticRegression = 
0.9175360868943833


Confusion Matrix
[[2704  337]
 [ 240 3716]]


              precision    recall  f1-score   support

           0       0.92      0.89      0.90      3041
           1       0.92      0.94      0.93      3956

   micro avg       0.92      0.92      0.92      6997
   macro avg       0.92      0.91      0.92      6997
weighted avg       0.92      0.92      0.92      6997



In [170]:
svm = LinearSVC()
svm.fit(X_train, y_train)

print("Accuracy of LinearSVC = ")
print(svm.score( X_test, y_test ))
print("\n")

y_pred_svm = svm.predict(X_test)
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred_svm))
print("\n")
print(classification_report(y_test, y_pred_svm))

Accuracy of LinearSVC = 
0.9371159068172074


Confusion Matrix
[[2816  225]
 [ 215 3741]]


              precision    recall  f1-score   support

           0       0.93      0.93      0.93      3041
           1       0.94      0.95      0.94      3956

   micro avg       0.94      0.94      0.94      6997
   macro avg       0.94      0.94      0.94      6997
weighted avg       0.94      0.94      0.94      6997



In [171]:
# Save our models for model evaluation for next day!
joblib.dump(lr, r"Day 5-6-7\LogisticRegression" )
joblib.dump(svm, r"Day 5-6-7\LinearSVC" )

['Day 5-6-7\\LinearSVC']

In [173]:
# Save y_test, y_pred
joblib.dump(y_test, r"Day 5-6-7\y_test" )
joblib.dump(y_pred_lr, r"Day 5-6-7\y_pred_lr" )
joblib.dump(y_pred_svm, r"Day 5-6-7\y_pred_svm" )

['Day 5-6-7\\y_pred_svm']