**Author:**  M. Adil Fayyaz

**Email:** adilfayyaz6@gmail.com


In [None]:
# Import Libraries
import os
import copy
import math
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# File Reading

## **Read the Training Set**

This cell reads the training documents and creates a dictionary named Train that stores all the data in the provided training dataset. The keys of the dictionary are 'Real' and 'Fake', which separate the real news from the fake news.

The data is preprocessed by removing the punctuation marks from the documents

In [None]:
# Mount the Drive 
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/NLP Spring 2021/Assignments/Assignment 5/Train/Real
# Train Set Dictionary
Train = {}
RealData = [[]]
FakeData = [[]]

punctuation1 = ['‘','؟','،','!',"'",' ؔ','َ','ِ','ُ','“','"','”','%','٪','۔',':','(',')','/','|','\\','\n','.','*','&','^','$','=','+','@','{','}','[',']']

for doc in os.listdir(os.getcwd()):
    with open(os.path.join(os.getcwd(), doc), 'r') as filereading: 
      lines = filereading.readlines()
    for i, val in enumerate(lines):
      for x in punctuation1:
        lines[i] = lines[i].replace(x," ")
    RealData.append(lines)

%cd /content/drive/My Drive/NLP Spring 2021/Assignments/Assignment 5/Train/Fake
for doc in os.listdir(os.getcwd()):
    with open(os.path.join(os.getcwd(), doc), 'r') as filereading: 
      lines = filereading.readlines()
    for i, val in enumerate(lines):
      for x in punctuation1:
        lines[i] = lines[i].replace(x," ")
    
    FakeData.append(lines)

Train['Real'] = RealData
Train['Fake'] = FakeData
Train['Real'].pop(0)
Train['Fake'].pop(0)
Train

## **Read the Testing Set**

This cell reads the testing documents and creates a dictionary named Test that stores all the data in the provided testing dataset. The keys of the dictionary are 'Real' and 'Fake', which separate the real news from the fake news; this lets us use them as the true labels when evaluating the Naive Bayes classifier.

In [None]:
# Read the Testing Files
%cd /content/drive/My Drive/NLP Spring 2021/Assignments/Assignment 5/Test/Real
# Test Set Dictionary
Test = {}
RealData2 = [[]]
FakeData2 = [[]]
for doc in os.listdir(os.getcwd()):
    with open(os.path.join(os.getcwd(), doc), 'r') as f: 
      lines = f.readlines()
    for i, val in enumerate(lines):
      for x in punctuation1:
        lines[i] = lines[i].replace(x," ")
      
    RealData2.append(lines)

%cd /content/drive/My Drive/NLP Spring 2021/Assignments/Assignment 5/Test/Fake
for doc in os.listdir(os.getcwd()):
    with open(os.path.join(os.getcwd(), doc), 'r') as f: 
      lines = f.readlines()
    for i, val in enumerate(lines):
      for x in punctuation1:
        lines[i] = lines[i].replace(x," ")
    FakeData2.append(lines)
    
Test['Real'] = RealData2
Test['Fake'] = FakeData2
Test['Real'].pop(0)
Test['Fake'].pop(0)
Test

## **Read the Stopwords File**

Read the stop words file and store all the stopping words into a list called stopWords.

Also remove the stop words from the testing set


In [None]:
%cd /content/drive/My Drive/NLP Spring 2021/Assignments/Assignment 5
stopWords = []
with open('stopwords-ur.txt','r') as f:
  lines = f.readlines()

for i, value in enumerate(lines):
  val = value.replace('\n',"")
  stopWords.append(val)

TestRemovedStop = copy.deepcopy(Test)
# Remove the stopping words before testing
Classes = {'Real','Fake'}
for c in Classes:
  for i, val in enumerate(TestRemovedStop[c]):
    for j, sentence_list in enumerate(TestRemovedStop[c][i]):
      words = sentence_list.split()
      new_words = []
      for w in words:
        if w not in stopWords:
          new_words.append(w)
      sent = ""
      for w in new_words:
        sent += w
        sent += " "
      TestRemovedStop[c][i][j] = sent


/content/drive/My Drive/NLP Spring 2021/Assignments/Assignment 5


removeStopWords Function takes as parameter the Training set and the classes and removes the stop words from the training set

In [None]:
def removeStopWords(TrainS1, Classes):
  """Remove the stop words from every sentence of ``TrainS1``, in place.

  ``TrainS1[c]`` is iterated as a list of documents, each document being a
  list of sentence strings. Each sentence is split on whitespace, the words
  found in the module-level ``stopWords`` are dropped, and the sentence is
  rebuilt with one space after every remaining word.

  Returns the same ``TrainS1`` object that was passed in.
  """
  # Build the lookup once; testing against the list would scan it per word
  stop_lookup = set(stopWords)
  for c in Classes:
    for document in TrainS1[c]:
      for j, sentence in enumerate(document):
        kept = [w for w in sentence.split() if w not in stop_lookup]
        document[j] = "".join(w + " " for w in kept)

  return TrainS1

# Multinomial Naive Bayes Training

### Helping Functions

Function countTexts takes as parameters the training set and the classes, i.e. Real or Fake. It iterates over the classes and returns the total number of documents read in the training set.

In [None]:
# Return the total number of documents/texts read
def countTexts(Train, Classes):
  """Return the number of documents in ``Train`` summed over ``Classes``."""
  return sum(len(Train[label]) for label in Classes)

Function extractVocabulary takes as parameter the Training set and the Classes. It returns a list of words - Vocab, which contains the unique words existing in the Training set

In [None]:
def extractVocabulary(Train, Classes):
  """Return the distinct words of the training set as a list.

  ``Train[c]`` is iterated as a list of documents, each document being a list
  of sentence strings that are split on whitespace. Words appear in the
  returned list in the order in which they are first encountered.
  """
  Vocab = []
  # Companion set: an O(1) "already seen?" test instead of scanning the list
  seen = set()
  for label in Classes:
    for document in Train[label]:
      for sentence in document:
        for word in sentence.split():
          if word not in seen:
            seen.add(word)
            Vocab.append(word)

  return Vocab

Function countTextsInClass returns the number of documents in a particular class

In [None]:
def countTextsInClass(Train, c):
  """Return how many documents ``Train`` holds for class ``c``."""
  documents_of_class = Train[c]
  return len(documents_of_class)

Function countWordsInAllTextsOfClass takes as parameters the training set and a class. It iterates over the documents of the training dictionary that belong to the class passed as a parameter. It returns the total number of words in all of those documents.

In [None]:
# Returns the number of words in a class over all the texts
def countWordsInAllTextsOfClass(Train, c):
  """Return the total number of whitespace-separated words in class ``c``.

  Every sentence of every document in ``Train[c]`` is split on whitespace and
  the resulting word counts are added up.
  """
  return sum(
      len(sentence.split())
      for document in Train[c]
      for sentence in document
  )

Function concatenateTextsInClass takes as parameter the training set and a class. It returns a single document which contains all the words in the documents combined into a single document

In [None]:
# get all the texts of a particular class in a single document
def concatenateTextsInClass(Train, c):
  """Return all words of class ``c`` as one flat list, in reading order.

  Documents, their sentences and the words of each sentence are visited in
  order; repeated words are kept.
  """
  return [
      word
      for document in Train[c]
      for sentence in document
      for word in sentence.split()
  ]

Function countTokensOfWords takes as parameter a single document and a word. It returns the number of times that word occurs in the document

In [None]:
# return number of times w appears in doc
def countTokensOfWords(docC, w):
  """Return how many items of ``docC`` compare equal to ``w``."""
  return sum(1 for token in docC if token == w)


## Naive Bayes Training Algorithm

Naive Bayes Training Algorithm using the helping functions defined above. In this algorithm we first extracted the vocabulary - a set of words, from our training set. Then, we got the total count of documents in our training set. Then we iterate over our list of Classes i.e. Real and Fake, and we get the number of documents in a class and the count of words in all the documents of a class. For every word in the vocabulary we can then calculate the token of words and then eventually the conditional probability. As we iterate over all the classes we are then able to calculate the conditional probability for all the words and calculate the prior values for each class.

In [None]:
def NaiveBayesTraining(Train,Classes):
  """Train a multinomial Naive Bayes model with add-one smoothing.

  Parameters:
    Train:   mapping from class label to its documents, in the form expected
             by the helper functions this routine calls.
    Classes: iterable of the class labels to train on.

  Returns a tuple ``(Vocab, priorC, conditionalProb)``:
    Vocab:           list returned by ``extractVocabulary``.
    priorC:          ``priorC[c]`` = documents of class c / all documents.
    conditionalProb: ``conditionalProb[(w, c)]`` =
                     (count of w in class c + 1) / (words in class c + |Vocab|).

  Raises ZeroDivisionError when the training set holds no documents.
  """
  # Function-scope import keeps this cell runnable on its own
  from collections import Counter

  Vocab = extractVocabulary(Train,Classes)
  totalN_val = countTexts(Train, Classes)
  vocabSize = len(Vocab)
  conditionalProb = {}
  priorC = {}
  for c in Classes:
    Nc = countTextsInClass(Train, c)
    Nw = countWordsInAllTextsOfClass(Train, c)
    priorC[c] = float(Nc/totalN_val)
    # get all texts of a particular class in a single document
    docC = concatenateTextsInClass(Train, c)
    # Count every token in one pass instead of re-scanning docC per vocabulary word
    tokenCounts = Counter(docC)
    denominator = Nw + vocabSize

    for wi in Vocab:
      # A Counter yields 0 for words that never occur in this class
      conditionalProb[(wi,c)] = (tokenCounts[wi]+1)/denominator

  return Vocab, priorC, conditionalProb

## Boolean Naive Bayes Training Algorithm

Takes as parameters the preprocessed Boolean training set and the classes, and returns the vocabulary, the prior values and the conditional probability of each word for each class. For Boolean Naive Bayes, duplicate words are removed from each document of the training set first, and then the priors and conditional probabilities are computed by this function.

In [None]:
def BooleanNaiveBayesTraining(TrainBoolean, Classes):
  """Train on ``TrainBoolean`` by delegating to ``NaiveBayesTraining``.

  No de-duplication happens here; the arguments are forwarded unchanged and
  the result of ``NaiveBayesTraining`` is returned as is.
  """
  trained_model = NaiveBayesTraining(TrainBoolean, Classes)
  return trained_model
   

## Training both the Models
With and Without Stop Words

**Naive Bayes Training - Without Stop Words Removal**
 
 In this Training, the Naive Bayes Training Algorithm is used without removing the stop words provided in the stop words file 


In [None]:
# Class labels; read as a global by NaiveBayes below
Classes = {'Real','Fake'}
def NaiveBayes(Train):
  """Run NaiveBayesTraining on ``Train`` with the module-level ``Classes``.

  Returns whatever NaiveBayesTraining returns.
  """
  return NaiveBayesTraining(Train, Classes)
# Train on the unmodified training set (stop words kept, duplicates kept)
V, priorC, conditionalProb = NaiveBayes(Train)

**Boolean Naive Bayes Training - Without Stop Words Removal**

In this Training, the Boolean Naive Bayes Algorithm is used. The only difference is, that in this training the duplicate words in the file are removed. The stop words provided in the stop words file are not removed. 

In [None]:
def BoolNaiveBayes(Train):
  """Train Boolean Naive Bayes: each word counts at most once per document.

  A deep copy of ``Train`` is made, and every document in it is replaced by a
  one-element list. That element is a single string holding the document's
  distinct words in first-occurrence order, each followed by one space (the
  sentence-list form the training function expects). The classes are taken
  from the module-level ``Classes``.

  Returns ``(TrainC, Vbool, priorCbool, condProbBool)``: the de-duplicated
  copy followed by the three values from ``BooleanNaiveBayesTraining``.
  """
  TrainC = copy.deepcopy(Train)
  # Remove duplicate words in each document
  for c in Classes:
    for i, document in enumerate(TrainC[c]):
      # dict.fromkeys drops repeats in O(1) per word and keeps insertion order
      wordsUnique = dict.fromkeys(
          word for sentence in document for word in sentence.split()
      )
      # Converting words to sentence form, requirement of the training function
      TrainC[c][i] = ["".join(uword + " " for uword in wordsUnique)]

  Vbool, priorCbool, condProbBool = BooleanNaiveBayesTraining(TrainC, Classes)
  return TrainC, Vbool, priorCbool, condProbBool

TrainC, Vbool, priorCbool, condProbBool = BoolNaiveBayes(Train)


**Naive Bayes Training - Removing Stop Words**

In this Training, the Naive Bayes Training Algorithm is used and the stop words provided in the stop words file is also removed from the training set (and later the testing set)

In [None]:
def NaiveBayesStop(Train):
  """Train Naive Bayes on a stop-word-free deep copy of ``Train``.

  Returns ``(TrainS1, VStop, priorCStop, conditionalProbStop)``: the filtered
  copy followed by the three values from ``NaiveBayesTraining``.
  """
  Classes = {'Real','Fake'}
  # The copy is filtered, so the caller's training set keeps its stop words
  stopFreeTrain = removeStopWords(copy.deepcopy(Train), Classes)
  vocabulary, priors, condProbs = NaiveBayesTraining(stopFreeTrain, Classes)
  return stopFreeTrain, vocabulary, priors, condProbs
TrainS1, VStop, priorCStop, conditionalProbStop = NaiveBayesStop(Train)

**Boolean Naive Bayes - Removing Stop Words**

In this Training, the Boolean Naive Bayes Algorithm is used, (duplicate words removed) while also removing the stop words, provided in the stop words file,from the training set.

In [None]:
def BoolNaiveBayesStop(TrainC):
  """Train Boolean Naive Bayes on a stop-word-free deep copy of ``TrainC``.

  Returns ``(TrainS2, VStopBool, priorCStopBool, condProbStopBool)``: the
  filtered copy followed by the three values from
  ``BooleanNaiveBayesTraining``.
  """
  Classes = {'Real','Fake'}
  # The copy is filtered, so the caller's TrainC keeps its stop words
  stopFreeTrain = removeStopWords(copy.deepcopy(TrainC), Classes)
  vocabulary, priors, condProbs = BooleanNaiveBayesTraining(stopFreeTrain, Classes)
  return stopFreeTrain, vocabulary, priors, condProbs
TrainS2, VStopBool, priorCStopBool, condProbStopBool = BoolNaiveBayesStop(TrainC)

# **Application of Naive Bayes**

### Naive Bayes Test - Find Score Values

Function extractWordsFromText takes as parameters the vocabulary and the text. It returns a list of the words in the text that exist in the vocabulary.

In [None]:
def extractWordsFromText(Vocab , text):
  """Return the words of ``text`` that occur in ``Vocab``.

  ``text`` is split on whitespace. The result keeps the order of the text and
  keeps repeated words; words missing from ``Vocab`` are left out.
  """
  # Set copy of the vocabulary: O(1) membership instead of a scan per word
  known = set(Vocab)
  return [w for w in text.split() if w in known]


Function NaiveBayesTest takes as parameter the classes, the vocabulary, the prior values, the conditional probability and text from the Testing set. 

The function calculates the log score values for each of our word found in the vocabulary. The log scores are computed from the conditional probability of the words that were calculated in the Naive Bayes Algorithm above.

The function returns the maximum score class that is either Real or Fake.

In [None]:
def NaiveBayesTest(Classes, V, prior, condprob, text):
  """Return the class of ``Classes`` with the highest log score for ``text``.

  The score of a class is log(prior[c]) plus log(condprob[(word, c)]) for
  every word that ``extractWordsFromText(V, text)`` returns, repeats included.
  """
  known_words = extractWordsFromText(V, text)
  log_scores = {}
  for label in Classes:
    # Accumulate term by term, starting from the log prior
    running = math.log(prior[label])
    for token in known_words:
      running += math.log(condprob[(token, label)])
    log_scores[label] = running

  return max(log_scores, key=lambda label: log_scores[label])


### Evaluation Without Stop Words Removal

Evaluation of Naive Bayes without stop word removal

In [None]:
# Naive Bayes Only

correct_labels = []
predicted_labels = []
for c in Classes:
  for val in Test[c]:
    # Flatten the document into one string: each line followed by a space
    s = "".join(sentence_list + " " for sentence_list in val)
    recv_class = NaiveBayesTest(Classes, V, priorC, conditionalProb,s)
    predicted_labels.append(recv_class)
    # The dictionary key the document came from is its true label
    correct_labels.append(c)

print("Accuracy is: " , accuracy_score(correct_labels, predicted_labels, normalize = True))
print("Precision is: ", precision_score(correct_labels, predicted_labels, average='macro'))
print("Recall is: ", recall_score(correct_labels, predicted_labels, average='macro'))
print("F1 Score is: ", f1_score(correct_labels, predicted_labels, average="macro"))
        


Accuracy is:  0.6946564885496184
Precision is:  0.6877104377104377
Recall is:  0.6858333333333333
F1 Score is:  0.6866028708133971


Evaluation of Boolean Naive Bayes without stop word removal

In [None]:
# Boolean Naive Bayes Only

correct_labels = []
predicted_labels = []
for c in Classes:
  for val in Test[c]:
    # Flatten the document into one string: each line followed by a space
    s = "".join(sentence_list + " " for sentence_list in val)
    recv_class = NaiveBayesTest(Classes, Vbool, priorCbool, condProbBool,s)
    predicted_labels.append(recv_class)
    # The dictionary key the document came from is its true label
    correct_labels.append(c)

print("Accuracy is: " , accuracy_score(correct_labels, predicted_labels, normalize = True))
print("Precision is: ", precision_score(correct_labels, predicted_labels, average='macro'))
print("Recall is: ", recall_score(correct_labels, predicted_labels, average='macro'))
print("F1 Score is: ", f1_score(correct_labels, predicted_labels, average="macro"))

Accuracy is:  0.7137404580152672
Precision is:  0.7094250043959909
Recall is:  0.7126785714285715
F1 Score is:  0.710189814609973


Here, as the evaluations can be seen the Boolean Naive Bayes version (with duplicates removal) seems to perform better than the normal Naive Bayes Version. The reason being that the total number of words in each class become less so the conditional probability will only be considering if a certain word exists in a particular class or not. With that logic, the Boolean version performs better and shows a better result because it classifies words into classes and learns if a certain word belongs from that class or not rather than considering the number of times it occurs in a particular document. 

It is important to note here that even though removing duplicate words (in the Boolean version) shows a better accuracy result, this technique is actually rather ambiguous. In this case it showed better results; however, if we remove the duplicate words from the training set, we are actually disturbing the context of the sentences in our training data. This disturbance of the context does not lead to conclusive results or observations; in fact, it only adds to the ambiguity.

### Evaluation With Stop Words Removal

Evaluation of Naive Bayes Algorithm with stop words removal

In [None]:
# Naive Bayes with stop words removed Only
correct_labels = []
predicted_labels = []
for c in Classes:
  for val in TestRemovedStop[c]:
    # Flatten the document into one string: each line followed by a space
    s = "".join(sentence_list + " " for sentence_list in val)
    recv_class = NaiveBayesTest(Classes, VStop, priorCStop, conditionalProbStop,s)
    predicted_labels.append(recv_class)
    # The dictionary key the document came from is its true label
    correct_labels.append(c)

print("Accuracy is: " , accuracy_score(correct_labels, predicted_labels, normalize = True))
print("Precision is: ", precision_score(correct_labels, predicted_labels, average='macro'))
print("Recall is: ", recall_score(correct_labels, predicted_labels, average='macro'))
print("F1 Score is: ", f1_score(correct_labels, predicted_labels, average="macro"))


Accuracy is:  0.7022900763358778
Precision is:  0.6958928571428571
Recall is:  0.6958928571428571
F1 Score is:  0.6958928571428571


Evaluation of Boolean Naive Bayes Algorithm with stop words removal

In [None]:
# Boolean Naive Bayes with stop words removed Only
correct_labels = []
predicted_labels = []
for c in Classes:
  for val in TestRemovedStop[c]:
    # Flatten the document into one string: each line followed by a space
    s = "".join(sentence_list + " " for sentence_list in val)
    recv_class = NaiveBayesTest(Classes, VStopBool, priorCStopBool, condProbStopBool,s)
    predicted_labels.append(recv_class)
    # The dictionary key the document came from is its true label
    correct_labels.append(c)

print("Accuracy is: " , accuracy_score(correct_labels, predicted_labels, normalize = True))
print("Precision is: ", precision_score(correct_labels, predicted_labels, average='macro'))
print("Recall is: ", recall_score(correct_labels, predicted_labels, average='macro'))
print("F1 Score is: ", f1_score(correct_labels, predicted_labels, average="macro"))

Accuracy is:  0.7213740458015268
Precision is:  0.7152665347484559
Recall is:  0.7136904761904762
F1 Score is:  0.7143795827546555


Here, with the stop words removed from the training set, we can see that stop word removal resulted in better accuracy, precision, recall and F1 score compared to the normal Naive Bayes version. This is because stop words are high-frequency words in a document that place unnecessary requirements on the classifier, in terms of both space and time complexity. In other words, removing stop words helps us classify the more important root words existing in the Fake and Real news documents; removing these abundant words from the documents lets us classify those root words in their true essence. Similarly, the Boolean model also shows an increase in its evaluation metrics with the removal of stop words.

## Evaluation using Negation

Function NegationWordsTrain removes the words that follow negation words from the training set. Note that the logic employed is that every word that directly follows a negation word is removed from the list of words in the training set. I experimented with a list of negation words, some of which are commented out below, and came to the conclusion that when only the single word 'نہیں' is used as the negation word, the accuracy and precision increase compared to my previous Naive Bayes attempt. In contrast, when I enlarge the set of negation words, my accuracy does not improve. However, it is still likely that if the set of negation words is improved (by adding more suitable words), the model may show an increase in accuracy and precision.

In [None]:
# Extra Credit - Remove Negation Words
def NegationWordsTrain(TrainS2):
  negation_words = ['نہیں'] #'غیر', 'کبھی', 'اندازن', 'بمشکل', 'مشکل']
  Classes = {'Real','Fake'}
  Train_Negation = copy.deepcopy(TrainS2)
  # Remove the stopping words before training
  for c in Classes:
    for i, val in enumerate(Train_Negation[c]):
      for j, sentence_list in enumerate(Train_Negation[c][i]):
        words = sentence_list.split()
        new_words = []
        
        for w in range(len(words)):
          try:
            if words[w] in negation_words:
              # print(w+1)
              del words[w+1]
            new_words.append(words[w])
          except:
            xyz = 1
        sent = ""
        for w in new_words:
          sent += w
          sent += " "
        Train_Negation[c][i][j] = sent
  VN, priorCN, conditionalProbN = BooleanNaiveBayesTraining(Train_Negation, Classes)
  return Train_Negation, VN, priorCN, conditionalProbN
Train_Negation, VN, priorCN, conditionalProbN = NegationWordsTrain(TrainS2)



Evaluation of the Negation Training (stop words also removed) 

Adds accuracy when results compared to previous attempts

In [None]:
# Naive Bayes Negation

correct_labels = []
predicted_labels = []
for c in Classes:
  for val in TestRemovedStop[c]:
    # Flatten the document into one string: each line followed by a space
    s = "".join(sentence_list + " " for sentence_list in val)
    recv_class = NaiveBayesTest(Classes, VN, priorCN, conditionalProbN,s)
    predicted_labels.append(recv_class)
    # The dictionary key the document came from is its true label
    correct_labels.append(c)

print("Accuracy is: " , accuracy_score(correct_labels, predicted_labels, normalize = True))
print("Precision is: ", precision_score(correct_labels, predicted_labels, average='macro'))
print("Recall is: ", recall_score(correct_labels, predicted_labels, average='macro'))
print("F1 Score is: ", f1_score(correct_labels, predicted_labels, average='macro'))

Accuracy is:  0.7251908396946565
Precision is:  0.7192159692159692
Recall is:  0.7170238095238095
F1 Score is:  0.7179425837320574


The results of accuracy, precision, recall and f1 scores show an increase as compared to any of the previous results. This is because negation words are difficult to classify. By using negation words, the context of an entire sentence can change, and that is why if we remove the words following the negation words, it will be easier to classify the documents, because there will be less room left for any ambiguity. 