### Objective : Classify the news article as fake or real based on the information like headline, date, news body and subject of the news.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
news_true = pd.read_csv(r'C:\Users\AnitaM\Downloads\Fake News Detection\True.csv')
news_fake = pd.read_csv(r'C:\Users\AnitaM\Downloads\Fake News Detection\Fake.csv')

news_true['label'] = 'True'
news_fake['label'] = 'Fake'

news_all = pd.concat([news_true, news_fake], 0)
news_all.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [3]:
news_all.shape

(44898, 5)

In [4]:
# Split the whole dataset into train and test after shuffling the data:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

news_all = shuffle(news_all, random_state=2)
news_train, news_test = train_test_split(news_all, test_size=0.25, random_state=2)

news_train.reset_index(drop=True, inplace=True)
news_test.reset_index(drop=True, inplace=True)

print('Shape of train dataset: ', news_train.shape)
print('Shape of test dataset: ', news_test.shape)

Shape of train dataset:  (33673, 5)
Shape of test dataset:  (11225, 5)


In [5]:
# Grouping some of the catregories together:
for i in range(len(news_train)):
    if news_train['subject'][i] in ['politics', 'politicsNews', 'Government News']:
        news_train.loc[i, 'subject'] = 'politics'
    if news_train['subject'][i] in ['worldnews', 'News']:
        news_train.loc[i, 'subject'] = 'news' 
        
for i in range(len(news_test)):
    if news_test['subject'][i] in ['politics', 'politicsNews', 'Government News']:
        news_test.loc[i, 'subject'] = 'politics'
    if news_test['subject'][i] in ['worldnews', 'News']:
        news_test.loc[i, 'subject'] = 'news' 

In [6]:
# Creating dummies out of the categories in Subject variable:

temp_dummies = pd.get_dummies(news_train['subject'], drop_first=True, prefix='Subject')
news_train = pd.concat([temp_dummies, news_train], 1)
del news_train['subject']

temp_dummies = pd.get_dummies(news_test['subject'], drop_first=True, prefix='Subject')
news_test = pd.concat([temp_dummies, news_test], 1)
del news_test['subject']

In [7]:
# Checking unknown string in date in train dataset:
print(news_train[news_train['date'].str.len() > 19])

       Subject_US_News  Subject_left-news  Subject_news  Subject_politics  \
4040                 0                  1             0                 0   
4517                 0                  0             0                 1   
5244                 0                  0             0                 1   
15473                0                  0             0                 1   
16597                0                  0             0                 1   
17273                0                  1             0                 0   
18466                0                  0             0                 1   
24330                0                  0             0                 1   
25435                0                  0             0                 1   

                                                   title  \
4040                                            Homepage   
4517   https://fedup.wpengine.com/wp-content/uploads/...   
5244   https://100percentfedup.com/video-hillary-

In [8]:
# Checking unknown string in date in test dataset:
print(news_test[news_test['date'].str.len() > 19])

       Subject_US_News  Subject_left-news  Subject_news  Subject_politics  \
10695                0                  1             0                 0   

                                                   title  \
10695  https://fedup.wpengine.com/wp-content/uploads/...   

                                                    text  \
10695  https://fedup.wpengine.com/wp-content/uploads/...   

                                                    date label  
10695  https://fedup.wpengine.com/wp-content/uploads/...  Fake  


In [9]:
# Removing these indexed rows from train and test:

train_labels = news_train[news_train['date'].str.len() > 19].index
print(news_train.shape)
news_train.drop(train_labels, axis=0, inplace=True)
print(news_train.shape)

test_labels = news_test[news_test['date'].str.len() > 19].index
print(news_test.shape)
news_test.drop(test_labels, axis=0, inplace=True)
print(news_test.shape)

(33673, 8)
(33664, 8)
(11225, 8)
(11224, 8)


In [10]:
# Converting date data to datetime and extracting month and weekday information:
news_train['date'] = pd.to_datetime(news_train['date'], infer_datetime_format=True)
news_train['date_month'] = pd.DatetimeIndex(news_train['date']).month
news_train['date_weekday'] = pd.DatetimeIndex(news_train['date']).weekday
del news_train['date']

news_test['date'] = pd.to_datetime(news_test['date'], infer_datetime_format=True)
news_test['date_month'] = pd.DatetimeIndex(news_test['date']).month
news_test['date_weekday'] = pd.DatetimeIndex(news_test['date']).weekday
del news_test['date']

In [11]:
# Encoding labels:
news_train['label'] = np.where(news_train['label']=='True',1,0)
news_test['label'] = np.where(news_test['label']=='True',1,0)

In [12]:
news_train.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,title,text,label,date_month,date_weekday
0,0,0,0,1,Mattis sees opportunity for talks if North Kor...,"COLORADO SPRINGS, Colo. (Reuters) - U.S. Defen...",1,11,3
1,0,1,0,0,WHAT IS GOING ON WITH HILLARY’S EYES…Could It ...,Hillary spoke to a very small group of student...,0,9,2
2,0,0,0,1,"U.S. State Department names Russian defense, i...",WASHINGTON (Reuters) - The U.S. State Departme...,1,10,4
3,0,0,1,0,Roy Moore Supporting Republican LOSES It Afte...,If the scandal around the stories of Alabama S...,0,11,1
4,0,0,1,0,Explainer: Myanmar wields colonial-era law aga...,(Reuters) - Myanmar has accused Reuters report...,1,12,6


In [13]:
news_test.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,title,text,label,date_month,date_weekday
0,0,0,0,1,STUNNING: OBAMA PREACHES Value Of Communism To...,STUNNING! Former US Pres. Obama was at the Int...,0,5,5
1,0,0,0,0,CIA Operative Admits Deep State Globalist Cont...,Jay Dyer 21st Century WireIn the famous treati...,0,9,5
2,0,0,1,0,Parents Ban Black History Month Video From Sc...,Thanks to a bunch of parents who cannot handle...,0,2,3
3,0,0,1,0,Former CIA Chief Blasts Trump’s Speech To Boy...,When Donald Trump addressed the Boy Scouts Na...,0,7,2
4,0,0,1,0,Americans Demand Obama Pick Next Justice: GOP...,Ever since the passing of Supreme Court Justi...,0,2,1


In [14]:
news_train['label'].value_counts()

0    17646
1    16018
Name: label, dtype: int64

In [15]:
news_test['label'].value_counts()

0    5825
1    5399
Name: label, dtype: int64

In [16]:
news_train.reset_index(drop=True, inplace=True)
news_test.reset_index(drop=True, inplace=True)

In [17]:
# Text Summarization of the news article
import re
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import heapq

def text_summarization(news):
    formatted_news = re.sub('[^a-zA-Z]', ' ', news)
    formatted_news = re.sub(r'\s+', ' ', formatted_news)
    
    sentence_list = sent_tokenize(news)
    
    # Preparing word freq table
    stopword = stopwords.words('english')
    word_frequencies = {}
    for word in word_tokenize(formatted_news):
        if word not in stopword:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    
    # Calculating weighted frequency
    maximum_frequncy = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    
    # Preparing the sentence score table:
    sentence_scores = {}
    for sent in sentence_list:
        for word in word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
                        
    # Pick the top 2 sentence for summary
    summary_sentences = heapq.nlargest(2, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary

In [18]:
# Checking for string with only spaces and no words in train
length = []
for i in range(len(news_train)):
    if news_train['text'][i] in  ['',' ','  ','   ','    ']:
        length.append(len(news_train['text'][i]))
        
print('The max spaces in a string with no words in training dataset is ', max(length))

The max spaces in a string with no words in training dataset is  2


In [19]:
# Checking for string with only spaces and no words in test
length = []
for i in range(len(news_test)):
    if news_test['text'][i] in  ['',' ','  ','   ','    ']:
        length.append(len(news_test['text'][i]))
        
print('The max spaces in a string with no words in test dataset is ', max(length))

The max spaces in a string with no words in test dataset is  2


In [20]:
for i in range(len(news_train)):
    if news_train['text'][i] in [' ', '  ']:
        news_train.loc[i, 'text'] = 'missing'
    
    summary = text_summarization(news_train['text'][i])
    news_train.loc[i, 'text_summary'] = summary
    
news_train.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,title,text,label,date_month,date_weekday,text_summary
0,0,0,0,1,Mattis sees opportunity for talks if North Kor...,"COLORADO SPRINGS, Colo. (Reuters) - U.S. Defen...",1,11,3,"“So long as they stop testing, stop developing..."
1,0,1,0,0,WHAT IS GOING ON WITH HILLARY’S EYES…Could It ...,Hillary spoke to a very small group of student...,0,9,2,"These are motor symptoms, similar to other mot..."
2,0,0,0,1,"U.S. State Department names Russian defense, i...",WASHINGTON (Reuters) - The U.S. State Departme...,1,10,4,Friday’s action does not itself impose new san...
3,0,0,1,0,Roy Moore Supporting Republican LOSES It Afte...,If the scandal around the stories of Alabama S...,0,11,1,"Whoa, you just called me an extremist, brother..."
4,0,0,1,0,Explainer: Myanmar wields colonial-era law aga...,(Reuters) - Myanmar has accused Reuters report...,1,12,6,The case was widely seen by the domestic media...


In [21]:
for i in range(len(news_test)):
    if news_test['text'][i] in [' ', '  ']:
        news_test.loc[i, 'text'] = 'missing'
    
    summary = text_summarization(news_test['text'][i])
    news_test.loc[i, 'text_summary'] = summary
    
news_test.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,title,text,label,date_month,date_weekday,text_summary
0,0,0,0,1,STUNNING: OBAMA PREACHES Value Of Communism To...,STUNNING! Former US Pres. Obama was at the Int...,0,5,5,That s not a sustainable mechanism for democra...
1,0,0,0,0,CIA Operative Admits Deep State Globalist Cont...,Jay Dyer 21st Century WireIn the famous treati...,0,9,5,Not only does Copeland detail the various US p...
2,0,0,1,0,Parents Ban Black History Month Video From Sc...,Thanks to a bunch of parents who cannot handle...,0,2,3,Two days after saying the video was one compon...
3,0,0,1,0,Former CIA Chief Blasts Trump’s Speech To Boy...,When Donald Trump addressed the Boy Scouts Na...,0,7,2,"john mclaughlin (@jmclaughlinSAIS) July 25, 20..."
4,0,0,1,0,Americans Demand Obama Pick Next Justice: GOP...,Ever since the passing of Supreme Court Justi...,0,2,1,The voice of the people? I wonder how that s g...


In [22]:
# Extracting word counts in the news article

from nltk.stem.wordnet import WordNetLemmatizer
from string import punctuation
stop_word = set(stopwords.words('english')+list(punctuation))
lemma = WordNetLemmatizer()

def split_into_lemmas(news):
    news=news.lower()
    words = word_tokenize(news)
    words_sans_stop=[]
    for word in words :
        if word in stop_word:continue
        words_sans_stop.append(word)
    return [lemma.lemmatize(word) for word in words_sans_stop]

for i in range(len(news_train)):
    news_train_lemma = split_into_lemmas(news_train['text'][i])
    news_train.loc[i,'word_count_news'] = len(str(news_train_lemma).split())

for i in range(len(news_test)):
    news_test_lemma = split_into_lemmas(news_test['text'][i])
    news_test.loc[i,'word_count_news'] = len(str(news_test_lemma).split())

In [23]:
news_train.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,title,text,label,date_month,date_weekday,text_summary,word_count_news
0,0,0,0,1,Mattis sees opportunity for talks if North Kor...,"COLORADO SPRINGS, Colo. (Reuters) - U.S. Defen...",1,11,3,"“So long as they stop testing, stop developing...",202.0
1,0,1,0,0,WHAT IS GOING ON WITH HILLARY’S EYES…Could It ...,Hillary spoke to a very small group of student...,0,9,2,"These are motor symptoms, similar to other mot...",185.0
2,0,0,0,1,"U.S. State Department names Russian defense, i...",WASHINGTON (Reuters) - The U.S. State Departme...,1,10,4,Friday’s action does not itself impose new san...,252.0
3,0,0,1,0,Roy Moore Supporting Republican LOSES It Afte...,If the scandal around the stories of Alabama S...,0,11,1,"Whoa, you just called me an extremist, brother...",246.0
4,0,0,1,0,Explainer: Myanmar wields colonial-era law aga...,(Reuters) - Myanmar has accused Reuters report...,1,12,6,The case was widely seen by the domestic media...,607.0


In [24]:
news_test.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,title,text,label,date_month,date_weekday,text_summary,word_count_news
0,0,0,0,1,STUNNING: OBAMA PREACHES Value Of Communism To...,STUNNING! Former US Pres. Obama was at the Int...,0,5,5,That s not a sustainable mechanism for democra...,164.0
1,0,0,0,0,CIA Operative Admits Deep State Globalist Cont...,Jay Dyer 21st Century WireIn the famous treati...,0,9,5,Not only does Copeland detail the various US p...,112.0
2,0,0,1,0,Parents Ban Black History Month Video From Sc...,Thanks to a bunch of parents who cannot handle...,0,2,3,Two days after saying the video was one compon...,243.0
3,0,0,1,0,Former CIA Chief Blasts Trump’s Speech To Boy...,When Donald Trump addressed the Boy Scouts Na...,0,7,2,"john mclaughlin (@jmclaughlinSAIS) July 25, 20...",200.0
4,0,0,1,0,Americans Demand Obama Pick Next Justice: GOP...,Ever since the passing of Supreme Court Justi...,0,2,1,The voice of the people? I wonder how that s g...,305.0


In [25]:
# Extracting Sentiments of the news articles:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer=SentimentIntensityAnalyzer()

# TRAIN:
Sentiment=[]
for news in news_train['text']:
    vs = analyzer.polarity_scores(news) 
    
    if vs['pos'] > vs['neg']:
        pos_neg = 'Positive'
    if vs['neg'] > vs['pos']:
        pos_neg = 'Negative'
    Sentiment.append(pos_neg)
    
news_train['news_sentiment'] = pd.Series(Sentiment)

#TEST:
Sentiment=[]
for news in news_test['text']:
    vs = analyzer.polarity_scores(news) 
    
    if vs['pos'] > vs['neg']:
        pos_neg = 'Positive'
    if vs['neg'] > vs['pos']:
        pos_neg = 'Negative'
    Sentiment.append(pos_neg)
    
news_test['news_sentiment'] = pd.Series(Sentiment)

In [26]:
news_train.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,title,text,label,date_month,date_weekday,text_summary,word_count_news,news_sentiment
0,0,0,0,1,Mattis sees opportunity for talks if North Kor...,"COLORADO SPRINGS, Colo. (Reuters) - U.S. Defen...",1,11,3,"“So long as they stop testing, stop developing...",202.0,Negative
1,0,1,0,0,WHAT IS GOING ON WITH HILLARY’S EYES…Could It ...,Hillary spoke to a very small group of student...,0,9,2,"These are motor symptoms, similar to other mot...",185.0,Positive
2,0,0,0,1,"U.S. State Department names Russian defense, i...",WASHINGTON (Reuters) - The U.S. State Departme...,1,10,4,Friday’s action does not itself impose new san...,252.0,Positive
3,0,0,1,0,Roy Moore Supporting Republican LOSES It Afte...,If the scandal around the stories of Alabama S...,0,11,1,"Whoa, you just called me an extremist, brother...",246.0,Positive
4,0,0,1,0,Explainer: Myanmar wields colonial-era law aga...,(Reuters) - Myanmar has accused Reuters report...,1,12,6,The case was widely seen by the domestic media...,607.0,Negative


In [27]:
news_test.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,title,text,label,date_month,date_weekday,text_summary,word_count_news,news_sentiment
0,0,0,0,1,STUNNING: OBAMA PREACHES Value Of Communism To...,STUNNING! Former US Pres. Obama was at the Int...,0,5,5,That s not a sustainable mechanism for democra...,164.0,Positive
1,0,0,0,0,CIA Operative Admits Deep State Globalist Cont...,Jay Dyer 21st Century WireIn the famous treati...,0,9,5,Not only does Copeland detail the various US p...,112.0,Positive
2,0,0,1,0,Parents Ban Black History Month Video From Sc...,Thanks to a bunch of parents who cannot handle...,0,2,3,Two days after saying the video was one compon...,243.0,Negative
3,0,0,1,0,Former CIA Chief Blasts Trump’s Speech To Boy...,When Donald Trump addressed the Boy Scouts Na...,0,7,2,"john mclaughlin (@jmclaughlinSAIS) July 25, 20...",200.0,Positive
4,0,0,1,0,Americans Demand Obama Pick Next Justice: GOP...,Ever since the passing of Supreme Court Justi...,0,2,1,The voice of the people? I wonder how that s g...,305.0,Positive


In [28]:
del news_train['text']
del news_test['text']

In [29]:
news_train['news_sentiment'] = np.where(news_train['news_sentiment']=='Positive',1,0)
news_test['news_sentiment'] = np.where(news_test['news_sentiment']=='Positive',1,0)

In [30]:
news_train['title'][0]

'Mattis sees opportunity for talks if North Korea halts missile tests, development'

In [31]:
news_train['text_summary'][0]

'“So long as they stop testing, stop developing, they don’t export their weapons, there would be opportunity for talks,” Mattis told reporters on a military plane. Mattis said the U.S. military was closely watching why there had been no missile launches recently, but declined to give potential reasons for it.'

In [32]:
news_train['title'][1]

'WHAT IS GOING ON WITH HILLARY’S EYES…Could It Be Related To Parkinson’s Disease? [VIDEO]'

In [33]:
news_train['text_summary'][1]

'These are motor symptoms, similar to other motor symptoms caused by loss of dopamine neurons. She doesn t have enough faith in these young adults to consider they may care about our national security or about getting a good paying job out of college.'

In [34]:
news_train['title'][2]

'U.S. State Department names Russian defense, intel firms under sanctions law'

In [35]:
news_train['text_summary'][2]

'Friday’s action does not itself impose new sanctions, and determinations will be made on a case by case basis, State Department officials said. The official added that individuals throughout the department work on sanctions and that coordination of such work would continue.'

In [36]:
# The title column can be dropped as most of the information is conatined in text_summary

In [37]:
del news_train['title']
del news_test['title']

In [38]:
# TFIDF vectorizer from text_summary:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer(analyzer=split_into_lemmas,min_df=50,max_df=1000)

In [39]:
tfidf.fit(news_train['text_summary'])

news_train_text_tf = tfidf.transform(news_train['text_summary'])
news_train_text = pd.DataFrame(news_train_text_tf.toarray(),columns=tfidf.get_feature_names())
#news_train = pd.concat([news_train, news_train_text],1)

news_test_text_tf = tfidf.transform(news_test['text_summary'])
news_test_text = pd.DataFrame(news_test_text_tf.toarray(),columns=tfidf.get_feature_names())
#news_test = pd.concat([news_test, news_test_text],1)

In [40]:
news_train_text.head()

Unnamed: 0,'','s,...,//www.youtube.com/watch,1,10,100,11,12,13,....1,yes,yesterday,yet,york,young,youth,youtube,zero,zone,‘
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.248137,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
news_test_text.head()

Unnamed: 0,'','s,...,//www.youtube.com/watch,1,10,100,11,12,13,....1,yes,yesterday,yet,york,young,youth,youtube,zero,zone,‘
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
news_train['weightd_freq_text'] = news_train_text.sum(axis=1, skipna=True)
news_test['weightd_freq_text'] = news_test_text.sum(axis=1, skipna=True)

In [43]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)

X_20d_train = pca.fit_transform(news_train_text)
news_train_text_transformed_pca = pd.DataFrame(X_20d_train,
                                                columns=['text_x1','text_x2', 'text_x3', 'text_x4', 'text_x5', 'text_x6',
                                                          'text_x7', 'text_x8', 'text_x9', 'text_x10', 'text_x11', 'text_x12',
                                                          'text_x13', 'text_x14', 'text_x15', 'text_x16', 'text_x17', 'text_x18',
                                                          'text_x19', 'text_x20'])

X_20d_test = pca.transform(news_test_text)
news_test_text_transformed_pca = pd.DataFrame(X_20d_test,
                                                columns=['text_x1','text_x2', 'text_x3', 'text_x4', 'text_x5', 'text_x6',
                                                          'text_x7', 'text_x8', 'text_x9', 'text_x10', 'text_x11', 'text_x12',
                                                          'text_x13', 'text_x14', 'text_x15', 'text_x16', 'text_x17', 'text_x18',
                                                          'text_x19', 'text_x20'])

In [55]:
news_train_text_transformed_pca.head()

Unnamed: 0,text_x1,text_x2,text_x3,text_x4,text_x5,text_x6,text_x7,text_x8,text_x9,text_x10,text_x11,text_x12,text_x13,text_x14,text_x15,text_x16,text_x17,text_x18,text_x19,text_x20
0,-0.016321,-0.044955,-0.073709,-0.036729,0.079235,-0.033372,-0.032506,-0.038867,-0.007244,-0.048434,0.049756,0.050488,0.019743,-0.022538,-0.019611,-0.025134,-0.042812,-0.002161,-0.015516,0.004897
1,-0.014871,-0.000699,0.03282,0.012852,-0.026225,-0.040055,0.022538,0.01715,-0.018531,0.023637,-0.017559,-0.016894,0.02898,-0.000931,0.032529,-0.038507,0.008917,-0.013231,-0.001981,0.060931
2,-0.018107,-0.027917,-0.093422,-0.01067,0.040288,0.031074,0.009666,0.042675,0.005131,-0.013896,-0.043322,-0.14368,-0.038998,-0.020815,0.029749,0.001824,-0.021175,-0.031822,0.006282,-0.038479
3,-0.015097,-0.034418,-0.034557,-0.026493,0.007258,-0.000927,-0.093397,-0.040584,-0.019567,-0.028006,0.066473,0.068785,0.000812,-0.022197,-0.008021,-0.025564,-0.020232,0.013315,0.009968,-0.014436
4,-0.0157,-0.021944,-0.045663,-0.027192,0.00405,0.025788,-0.099261,-0.009229,-0.000624,-0.033457,0.051098,0.01613,-0.005625,-0.051747,-0.011942,-0.070104,-0.060138,0.017566,-0.030539,-0.011306


In [62]:
# Use MinMaxScaler to range the values between 0-1 for all the text_ features:

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(news_train_text_transformed_pca)
scaled_train = scaler.transform(news_train_text_transformed_pca)

for i,col in enumerate(news_train_text_transformed_pca.columns):
    news_train_text_transformed_pca[col] = scaled_train[:,i]
    
scaled_test = scaler.transform(news_test_text_transformed_pca)
for i,col in enumerate(news_test_text_transformed_pca.columns):
    news_test_text_transformed_pca[col] = scaled_test[:,i]
    
news_train_text_transformed_pca.head()

Unnamed: 0,text_x1,text_x2,text_x3,text_x4,text_x5,text_x6,text_x7,text_x8,text_x9,text_x10,text_x11,text_x12,text_x13,text_x14,text_x15,text_x16,text_x17,text_x18,text_x19,text_x20
0,0.0078,0.089789,0.311761,0.387005,0.300588,0.419643,0.429229,0.380468,0.148686,0.348638,0.533994,0.504144,0.524542,0.293069,0.431075,0.317689,0.176894,0.490581,0.443889,0.489226
1,0.00923,0.143865,0.446547,0.436969,0.114594,0.407513,0.523858,0.458273,0.137124,0.47366,0.436453,0.389872,0.542215,0.318944,0.511205,0.292578,0.253118,0.473702,0.468868,0.589922
2,0.006038,0.110608,0.286819,0.413265,0.2319,0.536601,0.501729,0.493726,0.161364,0.408551,0.399121,0.174857,0.412159,0.295132,0.506933,0.368308,0.208775,0.445357,0.484118,0.411279
3,0.009007,0.102664,0.361299,0.39732,0.173647,0.478524,0.324548,0.378083,0.136062,0.384076,0.558217,0.535173,0.488323,0.293477,0.448887,0.316882,0.210165,0.514178,0.490919,0.454485
4,0.008412,0.117906,0.347246,0.396615,0.167989,0.527008,0.314466,0.421634,0.155468,0.374619,0.535937,0.445877,0.47601,0.258089,0.442862,0.23325,0.151363,0.520659,0.416165,0.46011


In [63]:
news_train = pd.concat([news_train, news_train_text_transformed_pca], 1)
news_test = pd.concat([news_test, news_test_text_transformed_pca], 1)

In [64]:
news_train.head()

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,label,date_month,date_weekday,word_count_news,news_sentiment,weightd_freq_text,...,text_x11,text_x12,text_x13,text_x14,text_x15,text_x16,text_x17,text_x18,text_x19,text_x20
0,0,0,0,1,1,11,3,202.0,0,4.144833,...,0.533994,0.504144,0.524542,0.293069,0.431075,0.317689,0.176894,0.490581,0.443889,0.489226
1,0,1,0,0,0,9,2,185.0,1,3.844061,...,0.436453,0.389872,0.542215,0.318944,0.511205,0.292578,0.253118,0.473702,0.468868,0.589922
2,0,0,0,1,1,10,4,252.0,1,3.333541,...,0.399121,0.174857,0.412159,0.295132,0.506933,0.368308,0.208775,0.445357,0.484118,0.411279
3,0,0,1,0,0,11,1,246.0,1,3.639238,...,0.558217,0.535173,0.488323,0.293477,0.448887,0.316882,0.210165,0.514178,0.490919,0.454485
4,0,0,1,0,1,12,6,607.0,0,4.422877,...,0.535937,0.445877,0.47601,0.258089,0.442862,0.23325,0.151363,0.520659,0.416165,0.46011


In [47]:
del news_train['text_summary']
del news_test['text_summary']

In [65]:
news_train.describe(include='all')

Unnamed: 0,Subject_US_News,Subject_left-news,Subject_news,Subject_politics,label,date_month,date_weekday,word_count_news,news_sentiment,weightd_freq_text,...,text_x11,text_x12,text_x13,text_x14,text_x15,text_x16,text_x17,text_x18,text_x19,text_x20
count,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,...,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0,33664.0
mean,0.017526,0.099364,0.427608,0.437025,0.47582,7.204135,2.719077,237.87194,0.502079,3.230951,...,0.461896,0.418522,0.48677,0.320059,0.461214,0.364883,0.239978,0.493876,0.472524,0.480427
std,0.131223,0.299155,0.494739,0.496026,0.499422,3.529057,1.854256,206.361039,0.500003,1.021219,...,0.065415,0.075215,0.083258,0.05116,0.064877,0.078642,0.060686,0.06261,0.073991,0.071241
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,4.0,1.0,120.0,0.0,2.944475,...,0.436758,0.381843,0.447186,0.296675,0.430106,0.325717,0.20959,0.468921,0.434297,0.443957
50%,0.0,0.0,0.0,0.0,0.0,8.0,3.0,208.0,1.0,3.440293,...,0.462597,0.415117,0.481857,0.314329,0.455469,0.364657,0.229249,0.495693,0.472033,0.472706
75%,0.0,0.0,1.0,1.0,1.0,10.0,4.0,297.0,1.0,3.854031,...,0.486887,0.446722,0.525193,0.336681,0.487899,0.394861,0.258536,0.517929,0.505312,0.512833
max,1.0,1.0,1.0,1.0,1.0,12.0,6.0,4932.0,1.0,5.423293,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [66]:
news_train.isnull().sum()

Subject_US_News      0
Subject_left-news    0
Subject_news         0
Subject_politics     0
label                0
date_month           0
date_weekday         0
word_count_news      0
news_sentiment       0
weightd_freq_text    0
text_x1              0
text_x2              0
text_x3              0
text_x4              0
text_x5              0
text_x6              0
text_x7              0
text_x8              0
text_x9              0
text_x10             0
text_x11             0
text_x12             0
text_x13             0
text_x14             0
text_x15             0
text_x16             0
text_x17             0
text_x18             0
text_x19             0
text_x20             0
dtype: int64

In [67]:
news_test.isnull().sum()

Subject_US_News      0
Subject_left-news    0
Subject_news         0
Subject_politics     0
label                0
date_month           0
date_weekday         0
word_count_news      0
news_sentiment       0
weightd_freq_text    0
text_x1              0
text_x2              0
text_x3              0
text_x4              0
text_x5              0
text_x6              0
text_x7              0
text_x8              0
text_x9              0
text_x10             0
text_x11             0
text_x12             0
text_x13             0
text_x14             0
text_x15             0
text_x16             0
text_x17             0
text_x18             0
text_x19             0
text_x20             0
dtype: int64

In [68]:
x_train = news_train.drop('label', 1)
y_train = news_train['label']

x_test = news_test.drop('label', 1)
y_test = news_test['label']

In [70]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(33664, 29)
(33664,)
(11224, 29)
(11224,)


In [69]:
# Training diff models:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# Performance of some models when all the features were taken together:
#Logistic Regression
#Accuray :  0.8296104001986354
#Decision Tree
#Accuray :  0.8528398910751853
#Random Forest
#Accuray :  0.9197655903538257
#ExtraTrees
#Accuray :  0.9241324123677066

In [71]:
models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(), XGBClassifier(), 
          SVC(), KNeighborsClassifier(), MultinomialNB()]

names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'ExtraTrees', 'XGB', 'SVM', 'KNN', 'NB']
for model, name in zip(models, names):
    print(name)
    print('Accuray : ', cross_val_score(model, x_train, y_train, scoring='accuracy', cv=10).mean())

Logistic Regression
Accuray :  0.7804770657711834
Decision Tree
Accuray :  0.8657908510849689
Random Forest
Accuray :  0.9121014738661797
ExtraTrees
Accuray :  0.9092797063385298
XGB
Accuray :  0.9131711867005985
SVM
Accuray :  0.6665874930580813
KNN
Accuray :  0.7727841139605844
NB
Accuray :  0.646418093476917


In [72]:
# The preformance score for the models when some features were used was almost same as when all the features were used.

In [73]:
# XGB and RF models performed best among others. Checking their performance on test data:

print('XGB Model----------------------------------------')
model = XGBClassifier().fit(x_train, y_train)
print('Train accuracy : ', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, model.predict(x_test)), '\n')

print('RF Model----------------------------------------')
model = RandomForestClassifier().fit(x_train, y_train)
print('Train accuracy : ', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, model.predict(x_test)))

XGB Model----------------------------------------
Train accuracy :  0.966254752851711
Test accuracy :  0.9089451176051319 

RF Model----------------------------------------
Train accuracy :  0.9997920627376425
Test accuracy :  0.9098360655737705


In [74]:
# Checkng other model's performance on test data:

print('LR Model----------------------------------------')
model = LogisticRegression().fit(x_train, y_train)
print('Train accuracy : ', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, model.predict(x_test)), '\n')

print('DT Model----------------------------------------')
model = DecisionTreeClassifier().fit(x_train, y_train)
print('Train accuracy : ', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, model.predict(x_test)), '\n')

print('ExtraTree Model----------------------------------------')
model = ExtraTreesClassifier().fit(x_train, y_train)
print('Train accuracy : ', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, model.predict(x_test)), '\n')

print('SVM Model----------------------------------------')
model = SVC().fit(x_train, y_train)
print('Train accuracy : ', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, model.predict(x_test)), '\n')

print('KNN Model----------------------------------------')
model = KNeighborsClassifier().fit(x_train, y_train)
print('Train accuracy : ', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, model.predict(x_test)), '\n')

print('NB Model----------------------------------------')
model = MultinomialNB().fit(x_train, y_train)
print('Train accuracy : ', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, model.predict(x_test)), '\n')

LR Model----------------------------------------
Train accuracy :  0.801509030418251
Test accuracy :  0.8011404133998574 

DT Model----------------------------------------
Train accuracy :  0.9997920627376425
Test accuracy :  0.862615823235923 

ExtraTree Model----------------------------------------
Train accuracy :  0.9997920627376425
Test accuracy :  0.9092124019957234 

SVM Model----------------------------------------
Train accuracy :  0.6707164923954373
Test accuracy :  0.6712401995723449 

KNN Model----------------------------------------
Train accuracy :  0.8569094581749049
Test accuracy :  0.7757483962936564 

NB Model----------------------------------------
Train accuracy :  0.6463878326996197
Test accuracy :  0.6479864575908767 



In [76]:
# Hyper tune XGB and RF model
from sklearn.model_selection import RandomizedSearchCV
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [77]:
xgb_params = {'learning_rate' : [0.01,0.05,0.1,0.2,0.3],
              'gamma' : [i/10.0 for i in range(0,5)],
              'max_depth' : [2,3,4,5,6,7,8],
              'min_child_weight' : [1,2,5],
              'subsample' : [i/10.0 for i in range(5,10)],
              'colsample_bytree' : [i/10.0 for i in range(5,10)],
              'colsample_bylevel' : [i/10.0 for i in range(5,10)],
              'reg_lambda' : [1e-5, 1e-2, 0.1, 1, 100], 
              'reg_alpha' : [1e-5, 1e-2, 0.1, 1, 100],
              'n_estimator' : [100,500,700]
             }

In [78]:
random_search = RandomizedSearchCV(XGBClassifier(objective='binary:logistic'),
                                   cv=10,
                                   scoring='accuracy',
                                   random_state=2,
                                   param_distributions=xgb_params)

In [79]:
random_search.fit(x_train, y_train)

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  

RandomizedSearchCV(cv=10,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100...
                   param_distributions={'colsample_bylevel': [0.5, 0.6, 0.7,
                                                              0.8, 0.9],
                               

In [80]:
xgb_model = random_search.best_estimator_

In [81]:
print('Train accuracy : ', accuracy_score(y_train, xgb_model.predict(x_train)), '\n')
print('Test accuracy : ', accuracy_score(y_test, xgb_model.predict(x_test)))

Train accuracy :  0.9304895437262357 

Test accuracy :  0.9038667141838916


In [82]:
rf_params = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', range(1,20)),
    'n_estimators': hp.choice('n_estimators', [100,300,500,700]),
    'criterion': hp.choice('criterion', ['gini', 'entropy'])}

In [83]:
def acc_model(params):
    clf = RandomForestClassifier(**params)
    return cross_val_score(clf, x_train, y_train).mean()

best = 0
def f(params):
    global best
    acc = acc_model(params)
    if acc > best:
        best = acc
    print ('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, rf_params, algo=tpe.suggest, max_evals=10, trials=trials)
print ('best:')
print (best)

new best:                                             
0.882961012965556                                     
{'criterion': 'gini', 'max_depth': 8, 'max_features': 4, 'n_estimators': 500}
new best:                                                                        
0.8981997705715828                                                               
{'criterion': 'entropy', 'max_depth': 11, 'max_features': 6, 'n_estimators': 300}
new best:                                                                           
0.8981997705715828                                                                  
{'criterion': 'entropy', 'max_depth': 10, 'max_features': 3, 'n_estimators': 500}   
new best:                                                                           
0.9107948550072942                                                                
{'criterion': 'gini', 'max_depth': 16, 'max_features': 15, 'n_estimators': 700}   
new best:                                                   

In [84]:
rf_model = RandomForestClassifier(**{'criterion': 'entropy', 'max_depth': 17, 'max_features': 14, 'n_estimators': 300})
rf_model.fit(x_train, y_train)

print('Train accuracy : ', accuracy_score(y_train, rf_model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, rf_model.predict(x_test)))

Train accuracy :  0.9945639258555133
Test accuracy :  0.9081432644333571


In [85]:
rf_sample = RandomForestClassifier(**{'criterion': 'entropy', 'max_depth': 17, 'max_features': 14, 'n_estimators': 3})
rf_sample.fit(x_train, y_train)

print('Train accuracy : ', accuracy_score(y_train, rf_model.predict(x_train)))
print('Test accuracy : ', accuracy_score(y_test, rf_model.predict(x_test)))

Train accuracy :  0.9945639258555133
Test accuracy :  0.9081432644333571


In [87]:
# Saving xgb as the final model as a pickle file:
import sklearn.externals 
import joblib

joblib.dump(xgb_model,'FakeNews_Pred.pkl')

['FakeNews_Pred.pkl']

In [None]:
# from sklearn.externals import joblib would have worked for sklearn versions below 0.23
# for versions above 0.23 use: 
# import sklearn.externals 
# import joblib