In [21]:
# Let's use machine learning to build a model that flags articles that might be fake news.
# We will use logistic regression.
# The data comes from the Kaggle Fake News competition page (https://www.kaggle.com/c/fake-news#).

In [22]:
# Let's import the libraries we are going to use.
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

In [23]:
# Collect the English stop words from NLTK's stopwords corpus into a set and
# display them together with their count.
stops = set(stopwords.words('english'))
print('Words:', stops, sep='\n')
print('Total number of words: ', len(stops))
# Stop-word lists exist for other languages as well; change the argument to
# switch, e.g. stopwords.words('german'), stopwords.words('indonesian'),
# stopwords.words('portuguese') or stopwords.words('spanish').

Words:
{'further', 'few', 'you', 'until', 'whom', 'what', "needn't", 'than', 'up', 'being', 'some', 'can', 'hers', 's', 'ourselves', 'because', 'off', 'down', 'of', 'which', 'were', 'isn', 'and', 'about', 'on', 're', 'itself', 'when', 'other', 'shouldn', 'here', 'their', 'too', "haven't", 'does', "wasn't", 'he', 'in', 'haven', 'didn', 'no', 'did', 'between', "she's", 'her', 'more', 'it', 'ma', 'yourselves', 'weren', 'me', 'if', "didn't", 'an', 'its', 'having', 'him', 'doing', 'both', "wouldn't", 'i', 'there', 'once', 'as', 'for', "won't", "weren't", 'just', "hasn't", 'a', "couldn't", 'only', "aren't", 'from', 'won', 'mightn', 'very', 'ours', "mustn't", 'my', 'over', 'after', 'd', 'any', 'm', "doesn't", 'yours', 'yourself', 'such', 'with', 'himself', 'out', 'during', 'they', 'why', 'or', 'll', 'all', 'now', "you'd", 'do', 'y', "you'll", 'been', 'under', 'she', "should've", 'not', 'again', 'these', 'have', 'those', 'against', "shan't", 'above', "that'll", 'o', 'shan', 'was', "you've", 'm

In [24]:
# Load the training data into a pandas DataFrame.
# 'train.csv' is a relative path, so the file must be in the notebook's working directory.
df = pd.read_csv('train.csv')

In [25]:
# Quick overview of the dataset: first rows, last rows and overall dimensions.
print('df', 'Head', df.head(), 'Tail', df.tail(), sep='\n')
print('Shape:', df.shape)

df
Head
   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  
Tail
          id                                              title  \
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   


In [26]:
# Number of missing values in each column
print("Missing data", df.isna().sum(), sep="\n")
# The same counts expressed as a percentage of all rows (two decimals)
print("Missing data in percentage", df.isna().sum().div(len(df)).mul(100).round(2), sep="\n")

Missing data
id           0
title      558
author    1957
text        39
label        0
dtype: int64
Missing data in percentage
id        0.00
title     2.68
author    9.41
text      0.19
label     0.00
dtype: float64


In [27]:
# Replace every missing value with an empty string, then repeat the
# missing-data report to confirm that nothing is left unfilled.
df = df.fillna(value='')
print("Missing data", df.isna().sum(), sep="\n")
# The same counts expressed as a percentage of all rows (two decimals)
print("Missing data in percentage", df.isna().sum().div(len(df)).mul(100).round(2), sep="\n")

Missing data
id        0
title     0
author    0
text      0
label     0
dtype: int64
Missing data in percentage
id        0.0
title     0.0
author    0.0
text      0.0
label     0.0
dtype: float64


In [28]:
# Only the author name and the title are used for prediction; the text column
# could be added as well, but processing it would take much longer.
# The two fields are joined with a single space into a new "total" column.
df['total'] = df['author'].str.cat(df['title'], sep=' ')
print(df)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [29]:
# Porter stemmer instance used to reduce each word to its root (stem) form
port_stem = PorterStemmer()

In [30]:
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        # Build the set on first use and keep it on the function object so
        # later calls skip the corpus lookup entirely.
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def stemming(content):
    """Clean a piece of text and return its stemmed, stop-word-free form.

    Steps:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words (checked before stemming),
      4. stem the remaining words with ``port_stem``,
      5. join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text to process.

    Returns
    -------
    str
        Space-separated stems; an empty string when no word survives.
    """
    # The stop words are fetched once as a set instead of re-reading the
    # corpus list for every single token (the original did a list lookup per
    # word, which dominated the runtime of the whole preprocessing step).
    stop_words = _english_stop_words()
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(port_stem.stem(token) for token in tokens if token not in stop_words)

In [31]:
# Run the stemming function over every entry of the "total" column and store
# the result back in that column (this step takes a while on the full dataset).
df['total'] = df['total'].map(stemming)

In [32]:
# Split the frame into the model input (preprocessed text) and the target (label)
X, Y = df['total'].values, df['label'].values

print(X.shape, Y.shape, sep='\n')

(20800,)
(20800,)


In [33]:
# Convert the textual data to numerical TF-IDF features
# (term frequency - inverse document frequency).
# The text is read from df['total'] (the same values X was built from) instead
# of from X itself, so re-running this cell never tries to vectorise the
# already-transformed sparse matrix.
# NOTE(review): the vocabulary and IDF weights are learned from every row,
# including rows that later land in the test split; fitting on the training
# split only would avoid leaking test-set statistics - confirm whether that
# matters for this analysis.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['total'].values)

In [34]:
# Dimensions of the vectorised data: (number of rows, number of TF-IDF features)
X.shape

(20800, 17128)

In [35]:
# Show the sparse matrix; each printed entry is "(row, column)  value"
print(X)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

In [36]:
# Split the data into a training part used to fit the model and a test part
# (20% of the rows) used to evaluate it. Stratifying on Y keeps the label
# proportions the same in both parts; the fixed random_state makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [37]:
# Fit a logistic-regression classifier on the training split and predict the
# labels of the test split.
Logistic = LogisticRegression()
Logistic.fit(X_train, Y_train)
Y_pred = Logistic.predict(X_test)

# Evaluate the test-set predictions (label 1 is treated as the positive class).
matrix = confusion_matrix(Y_test, Y_pred)  # layout: [[TN, FP], [FN, TP]]
accuracy = accuracy_score(Y_test, Y_pred)  # (TP + TN) / (TP + TN + FP + FN)
precision = precision_score(Y_test, Y_pred)  # TP / (TP + FP)
recall = recall_score(Y_test, Y_pred)  # TP / (TP + FN)
f1 = f1_score(Y_test, Y_pred)  # harmonic mean of precision and recall: 2TP / (2TP + FP + FN)
classificationRep = classification_report(Y_test, Y_pred)  # per-class summary table

# Report: mean accuracy on both splits first, then the individual metrics.
print('Logistic Regression:')
print('Train set', round(Logistic.score(X_train, Y_train), 4))
print('Test set', round(Logistic.score(X_test, Y_test), 4))
print('Confusion Matrix:', matrix, sep='\n')
print('Accuracy:', round(accuracy, 4))
print('Precision:', round(precision, 4))
print('Recall:', round(recall, 4))
print('F1 Score:', round(f1, 4))
print('Classification Report:', classificationRep, sep='\n')


Logistic Regression:
Train set 0.9866
Test set 0.9791
Confusion Matrix:
[[2004   73]
 [  14 2069]]
Accuracy: 0.9791
Precision: 0.9659
Recall: 0.9933
F1 Score: 0.9794
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      2077
           1       0.97      0.99      0.98      2083

    accuracy                           0.98      4160
   macro avg       0.98      0.98      0.98      4160
weighted avg       0.98      0.98      0.98      4160

