In [1]:
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pickle
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [2]:
# Read the consolidated dataset and build two parallel lists:
#   statements[i] - the text found in column 0 of a data row
#   labels[i]     - 1 when column 11 of that row is 'FALSE', 0 when it is 'TRUE'
statements=[]
labels=[]
main_path = '/content/drive/My Drive/Fake news detection/Pakistani Media Dataset/'
# Column-11 value -> integer class id.
label_ids = {'FALSE': 1, 'TRUE': 0}
# Rows dropped because they were too short or carried an unrecognised label;
# inspect this after the cell runs to judge data quality.
skipped_rows = 0
# NOTE(review): the file has an .xlsx extension but is parsed as CSV text here;
# presumably it is a CSV export - confirm, otherwise it needs an Excel reader.
# newline='' lets the csv module handle line endings itself, which it needs in
# order to parse quoted fields that contain line breaks.
with open(main_path+'pakistani_dataset_consolidated_features.xlsx', 'r', newline='') as csv_file:
  csv_reader = csv.reader(csv_file)
  # Skip the header row; the default avoids StopIteration on an empty file.
  next(csv_reader, None)
  for line in csv_reader:
    # Blank or truncated rows have no column 11 to read a label from.
    if len(line) <= 11:
      skipped_rows += 1
      continue
    label = label_ids.get(line[11])
    # Append the text only together with its label, so the two lists can
    # never drift out of alignment when a row has an unexpected label value.
    if label is None:
      skipped_rows += 1
      continue
    statements.append(line[0])
    labels.append(label)

In [3]:
# calculate bigrams using TF/IDF
# Only bigrams are extracted (ngram_range=(2,2)), using the vectorizer's
# built-in 'english' stop-word list. A bigram is kept when it appears in at
# least 2 statements (min_df=2) and in at most half of them (max_df=0.5).
# NOTE(review): the vectorizer is fitted on every statement, including the rows
# that a later cell slices off as the test set - confirm this is intended.
tf_idf = TfidfVectorizer(min_df = 2, max_df = 0.5, ngram_range = (2,2), stop_words = 'english')
features = tf_idf.fit_transform(statements)
# Densify exactly once. No intermediate DataFrame is built: a mid-cell
# expression is neither displayed nor stored, so it would only cost a second
# full dense copy of the matrix.
features = features.toarray()

In [5]:
# Sequential (unshuffled) hold-out split: the first `train_rows` rows are used
# for training and every remaining row is kept for testing.
train_rows = 9592

x_train, x_test = features[:train_rows], features[train_rows:]
y_train, y_test = labels[:train_rows], labels[train_rows:]

In [6]:
# creating model
NB_classifier = MultinomialNB()

# training the model
NB_classifier.fit(x_train, y_train)

# saving the model file
# The context manager closes the handle even if pickling raises, so the file
# is flushed to disk and the descriptor is not left open in the kernel.
with open('/content/drive/My Drive/Fake news detection/Traditional methods/Naive_Bayes/Custom Dataset/NB_Bigram.sav', 'wb') as model_file:
  pickle.dump(NB_classifier, model_file)

In [7]:
# Run the fitted Naive Bayes classifier over the held-out rows; `pred` holds
# one predicted class id per row of x_test.
pred = NB_classifier.predict(X=x_test)

In [8]:
# Evaluation report: each entry is (heading, metric function, extra keyword
# arguments). Precision, recall and F1 use average="weighted"; accuracy and
# the confusion matrix take no extra arguments.
report = [
  ("Accuracy: ", accuracy_score, {}),
  ("Precision: ", precision_score, {"average": "weighted"}),
  ("Recall: ", recall_score, {"average": "weighted"}),
  ("F1 Score ", f1_score, {"average": "weighted"}),
  ("Confusion Matrix ", confusion_matrix, {}),
]

# Print the heading first and the value on the next line, in the listed order.
for heading, scorer, options in report:
  print (heading)
  print (scorer(y_test, pred, **options))

Accuracy: 
0.658882402001668
Precision: 
0.6263095569280978
Recall: 
0.658882402001668
F1 Score 
0.5626281784162811
Confusion Matrix 
[[1514   53]
 [ 765   66]]
