In [1]:
import numpy as np                           ## For numerical operations
import pandas as pd                          ## For data manipulation
import matplotlib.pyplot as plt              ## For plotting
import nltk
import re                                      ## For regular expressions
from  nltk.corpus import stopwords             ## Stopwords: words that are not important in the text
from nltk.stem.porter import PorterStemmer     ## Stemming: returns root word
from sklearn.feature_extraction.text import TfidfVectorizer  ## For converting text to vector
from sklearn.model_selection import train_test_split         ## For splitting the data into training and testing
from sklearn.linear_model import LogisticRegression          ## For logistic regression
from sklearn.metrics import accuracy_score                   ## For accuracy score
from ydata_profiling import ProfileReport                    ## For data profiling
import pickle                                                ## For saving the model


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
data.duplicated().sum()

0

In [5]:
data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
data.dropna(inplace=True)
data.shape

(18285, 5)

In [7]:
data.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [8]:
df1_profile = ProfileReport(data, title="fake news")
df1_profile.to_file(output_file='D:\PROJECTS\Fake New prediction\\fakeNews.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset: 100%|██████████| 15/15 [00:21<00:00,  1.42s/it, Completed]                
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 124.67it/s]


In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aniket\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
x_train = data.drop(['label'], axis=1) 

In [12]:
y_train = data['label']

In [13]:
x_train.shape, y_train.shape

((18285, 4), (18285,))

In [14]:
## Now, to enable the NLP model to understand the news 
## we need to convert the news into vectors.
## at first, we will collapse the names of authors as single entity
## then other columns into one, to put the tfidf vectorizer on it.

In [15]:
def merge_names(name):
    return ''.join(name.replace(" ", ""))


In [16]:
x_train['author'] = x_train['author'].apply(merge_names)

In [17]:
x_train['content'] = x_train['title'] + ' '+ x_train['author'] + ' ' + x_train['text']

In [18]:
main_x_train = x_train.drop(['title', 'author', 'text'], axis=1)

In [19]:
main_x_train.head()

Unnamed: 0,id,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...


In [20]:
## Now, since the content is in one place, we can perform preprocessing on it

In [21]:
port_stem = PorterStemmer()

In [22]:

def preProcessing(content):
    x = 1
    
    print(x)
    x+=1
    # print("Starting Preprocessing")
    content = re.sub('^a-zA-Z', ' ', content)
    # print("re done, starting lower")
    content = content.lower()
    # print("lower done, starting split")
    content = content.split()
    # print("split done, starting stemming")
    for word in content:
        if word not in stopwords.words('english'):
            port_stem.stem(word)

    # print("stemming done, joining")
    content = ' '.join(content)

    return content

In [24]:
main_x_train['content'] = main_x_train['content'].apply(preProcessing)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [25]:
main_x_train.head()

Unnamed: 0,id,content
0,0,house dem aide: we didn’t even see comey’s let...
1,1,"flynn: hillary clinton, big woman on campus - ..."
2,2,why the truth might get you fired consortiumne...
3,3,15 civilians killed in single us airstrike hav...
4,4,iranian woman jailed for fictional unpublished...


In [26]:
X = main_x_train['content']
Y = y_train

In [27]:
## These X and Y are the final data that we will use for training the model
print(X.shape
      , Y.shape)

(18285,) (18285,)


In [28]:
# Now converting the textual data into numerical vectors  using TfidfVectorizer
vectorizer = TfidfVectorizer() ## initializing the tfidf vectorizer object

vectorizer.fit(X)              ## fitting the textural data into it 

X  = vectorizer.transform(X)  ## transforming the text data into numerical data



In [29]:
print(X)

  (0, 149550)	0.008403866291843146
  (0, 149223)	0.04318242427054861
  (0, 149181)	0.00984902532093399
  (0, 149171)	0.010328579162828927
  (0, 148136)	0.03884895045485145
  (0, 148130)	0.08347080374108272
  (0, 148062)	0.03301058331328077
  (0, 148007)	0.012083015159181974
  (0, 147539)	0.024363056245235302
  (0, 147375)	0.022768185645394953
  (0, 147108)	0.007984285592388628
  (0, 146756)	0.03559470466692678
  (0, 146550)	0.007773665744580245
  (0, 146518)	0.024412582903718404
  (0, 146450)	0.024038766606068587
  (0, 146200)	0.014927201927831757
  (0, 146089)	0.011072922719971273
  (0, 145980)	0.026188920106662704
  (0, 145882)	0.012450392982683741
  (0, 145720)	0.06162772563675389
  (0, 145651)	0.010730705466744215
  (0, 145426)	0.052852427950342805
  (0, 144571)	0.02496466265056367
  (0, 144546)	0.019201141597865283
  (0, 144522)	0.015543784906163607
  :	:
  (18284, 6489)	0.014400463178891735
  (18284, 6415)	0.03557936794800139
  (18284, 6055)	0.011181441381066394
  (18284, 5256)	0

In [30]:
# NOw, the ML model can do the predictions on the data
## splitting the data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify = Y, random_state=2)

# stratify ensures that the distribution of the data is same in both training and testing data, 
# In this case, the splitting containsm the equal distribution of fake and real news in both training and testing data



In [31]:
# Trainign the model: logistic regression, classification model based on features, 
# Signmoid function is used to classify the data into 0 and 1, above the predicted value is more than threshold value, then label is 1, and the news is fake
# else the news is real as the label is 0
model = LogisticRegression()

model.fit(x_train, y_train)


In [32]:
# Now, the model is trained, we can predict the values on the testing data
y_pred = model.predict(x_test)
print("accuracy score: ", accuracy_score(y_test, y_pred)* 100)


accuracy score:  96.00765654908395


In [33]:
X_new = x_test[0] # sample news from testing data

prediction = model.predict(X_new)

if(prediction[0] == 0):
    print("Real news")
else:
    print("Fake news")



Fake news


In [34]:
print(y_test.iloc[0])
# 1 means fake news, 0 means real news

1


In [35]:

pickle.dump(model, open('fake_news_predictor_model.pkl', 'wb'))

In [36]:
pickle.dump(vectorizer, open('fakeNews_tfidf_vectorizer.pkl', 'wb'))