In [31]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
!pip install wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import string

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
!pip install xgboost
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

!pip install keras
!pip install tensorflow
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import load_model



seed = 4353




In [9]:
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [10]:
#############
true.sample(5)
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [11]:
# Introducing new column in both dataframes

true['impression']=1
fake['impression']=0


In [12]:
# Concatenating them using pandas concatenate to form a single dataframe

data_raw = pd.concat([true, fake], axis=0)
data_raw.sample(10)

Unnamed: 0,title,text,subject,date,impression
22160,"Trump, Liberal Hypocrisy & Humanity’s Future",21st Century Wire says Here s an epic discussi...,US_News,"March 19, 2017",0
13804,Queensland result leaves Australian PM closer ...,SYDNEY (Reuters) - The loss of a state electio...,worldnews,"November 27, 2017",1
1039,Manhattan U.S. attorney adds to probes of ex-T...,WASHINGTON (Reuters) - The U.S. attorney’s off...,politicsNews,"October 25, 2017",1
2501,Prominent Psychiatrist Gives One DAMNING Reas...,Given Donald Trump s disastrous presidential c...,News,"February 16, 2017",0
16039,HEATED! MARIA BARTIROMO Goes At It With John P...,John Podesta is the guardian of the Clintons j...,Government News,"Jun 29, 2017",0
10384,Group ends effort to draft House Speaker Ryan ...,WASHINGTON (Reuters) - A group that wanted to ...,politicsNews,"March 11, 2016",1
13696,Indonesian 'Trump' says has no plans to run fo...,SINGAPORE (Reuters) - Indonesian business tyco...,worldnews,"November 28, 2017",1
1549,There’s Something Really Creepy About Melania...,Handwriting experts have noticed something ver...,News,"May 9, 2017",0
20544,RELIGION OF PROGRESSIVISM: Meet Obama’s NEW Tr...,The religion of Progressivism is working overt...,left-news,"May 21, 2016",0
12145,GERMANY’S DEFENSE MINISTER Refuses To Wear Hij...,Germany s defense minister refused to wear a t...,politics,"Dec 14, 2016",0


In [13]:
# Combining title and text to obtain a single string
# dropping title and

data_raw['fulltext'] = data_raw.title + ' ' + data_raw.text
data_raw.drop(['title','text'], axis=1, inplace=True)

In [14]:
# Extracting a new dataframe using features fulltext and impression
data = data_raw[['fulltext', 'impression']]
data = data.reset_index()
data.drop(['index'], axis=1, inplace=True)

In [15]:
# Check for missing values

data.isnull().sum()

fulltext      0
impression    0
dtype: int64

In [16]:
print('The dataset contans {} rows and {} columns'.format(data.shape[0], data.shape[1]))


The dataset contans 44898 rows and 2 columns


In [17]:
# Word extraction from true and fake texts

true_text = data[data.impression==1]['fulltext']
fake_text = data[data.impression==0]['fulltext']
fake_text = fake_text.reset_index().drop(['index'], axis=1)


In [18]:
# Function to extract major words from true and fake news

def wordcloud_words(X_data_full):
    
    # function for removing punctuations
    def remove_punct(X_data_func):
        string1 = X_data_func.lower()
        translation_table = dict.fromkeys(map(ord, string.punctuation),' ')
        string2 = string1.translate(translation_table)
        return string2
  
    X_data_full_clear_punct = []
    for i in range(len(X_data_full)):
        test_data = remove_punct(X_data_full[i])
        X_data_full_clear_punct.append(test_data)
        
        # function to remove stopwords
    def remove_stopwords(X_data_func):
        pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
        string2 = pattern.sub(' ', X_data_func)
        return string2
    
    X_data_full_clear_stopwords = []
    for i in range(len(X_data_full)):
        test_data = remove_stopwords(X_data_full[i])
        X_data_full_clear_stopwords.append(test_data)
        
    # function for tokenizing
    def tokenize_words(X_data_func):
        words = nltk.word_tokenize(X_data_func)
        return words
    
    X_data_full_tokenized_words = []
    for i in range(len(X_data_full)):
        test_data = tokenize_words(X_data_full[i])
        X_data_full_tokenized_words.append(test_data)
        
    # function for lemmatizing
    lemmatizer = WordNetLemmatizer()
    def lemmatize_words(X_data_func):
        words = lemmatizer.lemmatize(X_data_func)
        return words
    
    X_data_full_lemmatized_words = []
    for i in range(len(X_data_full)):
        test_data = lemmatize_words(X_data_full[i])
        X_data_full_lemmatized_words.append(test_data)
        
    return X_data_full_lemmatized_words

In [35]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

true_words = wordcloud_words(true_text)
fake_words = wordcloud_words(fake_text.fulltext)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [24]:
###### ML Models
# Data preparation

X_data = data['fulltext']
y_data = data.impression
X_data = X_data.astype(str)

In [25]:
# Function to retrieve processed words

def final(X_data_full):
    
    # function for removing punctuations
    def remove_punct(X_data_func):
        string1 = X_data_func.lower()
        translation_table = dict.fromkeys(map(ord, string.punctuation),' ')
        string2 = string1.translate(translation_table)
        return string2
    
    X_data_full_clear_punct = []
    for i in range(len(X_data_full)):
        test_data = remove_punct(X_data_full[i])
        X_data_full_clear_punct.append(test_data)
        
    # function to remove stopwords
    def remove_stopwords(X_data_func):
        pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
        string2 = pattern.sub(' ', X_data_func)
        return string2
    
    X_data_full_clear_stopwords = []
    for i in range(len(X_data_full)):
        test_data = remove_stopwords(X_data_full[i])
        X_data_full_clear_stopwords.append(test_data)
        
    # function for tokenizing
    def tokenize_words(X_data_func):
        words = nltk.word_tokenize(X_data_func)
        return words
    
    X_data_full_tokenized_words = []
    for i in range(len(X_data_full)):
        test_data = tokenize_words(X_data_full[i])
        X_data_full_tokenized_words.append(test_data)
        
    # function for lemmatizing
    lemmatizer = WordNetLemmatizer()
    def lemmatize_words(X_data_func):
        words = lemmatizer.lemmatize(X_data_func)
        return words
    
    X_data_full_lemmatized_words = []
    for i in range(len(X_data_full)):
        test_data = lemmatize_words(X_data_full[i])
        X_data_full_lemmatized_words.append(test_data)
        
    # creating the bag of words model
    cv = CountVectorizer(max_features=1000)
    X_data_full_vector = cv.fit_transform(X_data_full_lemmatized_words).toarray()
    
    
    tfidf = TfidfTransformer()
    X_data_full_tfidf = tfidf.fit_transform(X_data_full_vector).toarray()
    
    return X_data_full_tfidf

In [36]:
# Setting the function with parameters


data_X = final(X_data)

In [37]:
# Preparing training and testing data using train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_X, y_data, test_size=0.25, random_state= seed)

In [38]:
##Random Forest
# Instatiation, fitting and prediction

rfc=RandomForestClassifier(n_estimators= 10, random_state= seed)
rfc.fit(X_train, y_train)
predictions = rfc.predict(X_test)

In [40]:
# Model evaluation

print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5849
           1       1.00      0.99      0.99      5376

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225

