In [14]:
#let's import important libraries 
import pandas as pd
import numpy as np
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [15]:
#we'll download stopwords from the library
import nltk 
nltk.download ('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
#let's print stopwords in English 
print(stopwords.words ('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

### Data Preprocessing

In [17]:
#let's load the dataset into pandas dataframe 
news_df=pd.read_csv('train.csv')

In [18]:
#we'll view the first 5 rows on the dataframe 
news_df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


Details about the dataset:

- id: unique id for a news article
- title: the title of a news article
- author: author of the news article
- text: the text of the articles, this could be incomplete
- label: a label makes whether the news article is real or fake --> 1 - fake news, while 0-real news

In [19]:
#let's view the number of rows and columns we have in the dataset
news_df.shape

(20800, 5)

In [20]:
#let's check for missing values in the dataset
news_df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

- we'll drop the missing values, since we have large dataset ( if the numbers of missing values are less than 5% of the dataset, we'll drop the missing values)

In [21]:
#we'll replace the null values with empty string
news_df=news_df.fillna('')

In [22]:
#let's check if changes is effected
news_df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

- we'll need just the title and author columns, because the text column are so large ( this will take a lot of time to process)


In [23]:
#merging the authors' name and news title 
news_df['content']=news_df['author']+ '' +news_df['title']

In [24]:
#let's print our dataset
print(news_df['content'])

0        Darrell LucusHouse Dem Aide: We Didn’t Even Se...
1        Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom...
2        Consortiumnews.comWhy the Truth Might Get You ...
3        Jessica Purkiss15 Civilians Killed In Single U...
4        Howard PortnoyIranian woman jailed for fiction...
                               ...                        
20795    Jerome HudsonRapper T.I.: Trump a ’Poster Chil...
20796    Benjamin HoffmanN.F.L. Playoffs: Schedule, Mat...
20797    Michael J. de la Merced and Rachel AbramsMacy’...
20798    Alex AnsaryNATO, Russia To Hold Parallel Exerc...
20799               David SwansonWhat Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [27]:
#we'll separate the content and label from the dataset
X=news_df['content']
y=news_df['label']

print(X)
print(y)

0        Darrell LucusHouse Dem Aide: We Didn’t Even Se...
1        Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom...
2        Consortiumnews.comWhy the Truth Might Get You ...
3        Jessica Purkiss15 Civilians Killed In Single U...
4        Howard PortnoyIranian woman jailed for fiction...
                               ...                        
20795    Jerome HudsonRapper T.I.: Trump a ’Poster Chil...
20796    Benjamin HoffmanN.F.L. Playoffs: Schedule, Mat...
20797    Michael J. de la Merced and Rachel AbramsMacy’...
20798    Alex AnsaryNATO, Russia To Hold Parallel Exerc...
20799               David SwansonWhat Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object
0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64


- Stemming procedure: this is the process the words to its root word example: actor, actions, acting --> act

In [28]:
port_stem=PorterStemmer()

def stemming(text):
    stemmed_text = re.sub('[^a-zA-Z]',' ',text)
    stemmed_text = stemmed_text.lower()
    stemmed_text = stemmed_text.split()
    stemmed_text = [port_stem.stem(word) for word in stemmed_text if word not in stopwords.words('english')]
    stemmed_text = ' '.join(stemmed_text)
    return stemmed_text

In [29]:
#let's apply the function to the content column
news_df['content']=news_df['content'].apply(stemming)

In [30]:
#let's print the content dataset after cleaning 
print(news_df['content'])

0        darrel lucushous dem aid even see comey letter...
1        daniel j flynnflynn hillari clinton big woman ...
2                consortiumnew comwhi truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoyiranian woman jail fiction unpub...
                               ...                        
20795    jerom hudsonrapp trump poster child white supr...
20796    benjamin hoffmann f l playoff schedul matchup ...
20797    michael j de la merc rachel abramsmaci said re...
20798    alex ansarynato russia hold parallel exercis b...
20799                        david swansonwhat keep f aliv
Name: content, Length: 20800, dtype: object


In [31]:
#splitting the dataset to training and test dataset
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [32]:
#we'll convert the textual data to numerical data
vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_vec=vectorizer.fit_transform(X_train)
X_test_vec=vectorizer.transform(X_test)

print(X_train_vec)
print(X_test_vec)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 157278 stored elements and shape (16640, 31546)>
  Coords	Values
  (0, 30487)	0.2809082866026443
  (0, 1125)	0.37747235539485413
  (0, 18426)	0.3622075859450017
  (0, 4775)	0.2947522250263501
  (0, 11754)	0.37747235539485413
  (0, 13490)	0.37747235539485413
  (0, 27722)	0.37747235539485413
  (0, 11370)	0.3361122902106021
  (0, 18982)	0.08043236574424176
  (0, 30574)	0.08271341282869449
  (0, 27970)	0.0824224662511298
  (1, 18982)	0.09593275008811623
  (1, 30574)	0.09865338521885272
  (1, 27970)	0.09830636937446915
  (1, 9113)	0.36283959629451457
  (1, 8495)	0.45021628793580015
  (1, 11349)	0.31920039613778195
  (1, 29175)	0.41909208238745277
  (1, 4932)	0.37366843270522815
  (1, 485)	0.3180029188529017
  (1, 7930)	0.3446331031839939
  (2, 9674)	0.48766149105194617
  (2, 13698)	0.40938687238354826
  (2, 6324)	0.5697580925545817
  (2, 19799)	0.5195809608074315
  :	:
  (16637, 26988)	0.26391277360649096
  (16637, 7251)	0.300873

### Training the model

In [33]:
#we'll train our model using LoLogisticRegression, which is best for binary classification
model=LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

In [34]:
#evaluation (using accuracy score)
#accuracy_score on the training data 
X_train_prediction=model.predict(X_train_vec)
training_data_accuracy=accuracy_score(X_train_prediction, y_train)

print('Accuracy score of training data: ', training_data_accuracy)

Accuracy score of training data:  0.9731370192307692


In [35]:
#accuracy_score on the test data 
X_test_prediction=model.predict(X_test_vec)
test_data_accuracy=accuracy_score(X_test_prediction, y_test)

print('Accuracy score of test data: ', test_data_accuracy)

Accuracy score of test data:  0.9528846153846153


In [39]:
#we'll save the vectorizer and model with pickle
import pickle
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('fake_news_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!
