In [2]:
import nltk
import pandas as pd
import re              #package for importing regular expression
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stemmer =WordNetLemmatizer()


In [3]:
dataset=pd.read_csv("fakerealnews.csv")

In [4]:
num_of_rows,y=dataset.shape
dataset.head()



Unnamed: 0,news,label,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0,,,
1,HOUSTON — Venezuela had a plan. It was a ta...,0,,,
2,"Sunday on ABC’s “This Week,” while discussing ...",0,,,
3,"AUGUSTA, Me. — The beleaguered Republican g...",0,,,
4,Finian Cunningham has written extensively on...,1,,,


In [5]:
corpus=[]
nltk.download('wordnet')

for i in range(0,num_of_rows):

  #Removing words which are special character
  document=re.sub(r'\W',' ',dataset['news'][i])

  #Removing single characters from the document
  document=re.sub(r'\s+[a-zA-Z]\s+',' ',document)

  #Removing single character from start
  document=re.sub(r'\^[a-zA-Z]\s+',' ',document)

  #Removing one or more spaces and replacing by one space
  document=re.sub(r'\s+',' ',document,flags=re.I)

  document=document.lower()

  document=document.split()
  document=[stemmer.lemmatize(w) for w in document]
  document=' '.join(document)

  #Now adding it to our corpus
  corpus.append(document)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
#Now we will divide the data for train and test our dataset
x=corpus
y=dataset.label

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [8]:
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer

# tokenizing the text data and counting the occurrences of each token using CountVectorizer
count_vectorizer=CountVectorizer(max_features=1500,min_df=5,max_df=0.7,stop_words=stopwords.words('english'))

#fits the vectorizer to the documents (learns the vocabulary) and transforms the documents into matrix
count_train=count_vectorizer.fit_transform(x_train)

#only transform the document into matrix
count_test=count_vectorizer.transform(x_test)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
#printing tokenised feature names
print(count_vectorizer.get_feature_names_out())

['000' '10' '100' ... 'young' 'youtube' 'zone']


In [10]:
#inserting in a dataframe
pd.DataFrame(count_train.toarray(),columns=count_vectorizer.get_feature_names_out())

Unnamed: 0,000,10,100,11,12,13,15,16,17,18,...,wrong,wrote,yard,year,yes,yet,york,young,youtube,zone
0,2,2,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,0
3,4,0,1,0,0,0,0,0,0,0,...,0,0,0,4,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,0,0,0,0,0,0,0,0,0,0,...,0,0,0,6,0,1,0,0,0,0
218,4,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,1,0,0
219,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
220,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

#getting importance of a term (token) in a document
tfidfvectorizer=TfidfVectorizer(max_features=1500,min_df=5,max_df=0.7,stop_words=stopwords.words('english'))

tfidf_train=tfidfvectorizer.fit_transform(x_train)

tfidf_test=tfidfvectorizer.transform(x_test)

In [12]:
print(tfidfvectorizer.get_feature_names_out())

['000' '10' '100' ... 'young' 'youtube' 'zone']


In [13]:
#getting importance of a term (token) in a document
pd.DataFrame(tfidf_train.toarray(),columns=tfidfvectorizer.get_feature_names_out())

Unnamed: 0,000,10,100,11,12,13,15,16,17,18,...,wrong,wrote,yard,year,yes,yet,york,young,youtube,zone
0,0.044471,0.047089,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.044681,0.0,0.000000,0.000000,0.000000,0.0,0.0
1,0.000000,0.068460,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.036807,0.0,0.000000,0.0,0.033658,0.033941,0.000000,0.0,0.0
2,0.031788,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.036193,0.0,0.021292,0.0,0.033096,0.000000,0.000000,0.0,0.0
3,0.063500,0.000000,0.021148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.042533,0.0,0.000000,0.016667,0.000000,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.067977,0.0,0.017611,0.000000,0.000000,0.0,0.0
218,0.162065,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.081414,0.0,0.000000,0.000000,0.052598,0.0,0.0
219,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.031471,0.0,0.000000,0.000000,0.000000,0.0,0.0
220,0.000000,0.035857,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0


In [29]:
# Now with the help of Naive Bayes theorm we will predict the label
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nbclassifier=MultinomialNB()


In [30]:
nbclassifier.fit(count_train,y_train)

In [31]:
y_pred=nbclassifier.predict(count_test)

In [37]:
score=metrics.accuracy_score(y_test,y_pred)

cm=metrics.confusion_matrix(y_pred,y_test,labels=[0,1])
print(cm)
print(score)  #Accuracy score when using count vectoriser class

[[20  5]
 [ 4 27]]
0.8392857142857143


In [36]:
nbclassifier=MultinomialNB()
nbclassifier.fit(tfidf_train,y_train)
y_predict=nbclassifier.predict(tfidf_test)
score=metrics.accuracy_score(y_test,y_predict)
print(score)    #Accuracy score when using tfidf vectoriser class ie weightage of a word
cm=metrics.confusion_matrix(y_predict,y_test,labels=[0,1])
print(cm)

0.8214285714285714
[[20  6]
 [ 4 26]]
