# Import All Packages




In [1]:
    import random
    import pandas as pd
    import numpy as np
    import string
    from string import digits
    from sklearn.model_selection import train_test_split
    import torch
    import torch.nn as nn
    from sklearn.metrics import classification_report
    import transformers
    from transformers import AutoModel, BertTokenizerFast
    from ipywidgets import IntProgress
    
    from tqdm import tqdm



# Import BERT Model, BERT Tokenizer and Torch

In [2]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# import Torch
device = torch.device("cpu")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Predefined CLass

In [3]:
class Propaganda:
    NEGATIVE = 0
    POSITIVE = 1
class Review:
    def __init__(self,sentence,SUBJprop):
        self.sentence = sentence
        self.SUBJprop = SUBJprop
        self.propaganda = SUBJprop


class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews

    def get_sentence(self):
        return [x.sentence for x  in self.reviews]

    def get_propaganda(self):
        return [int(x.propaganda) for x in self.reviews]

    def evenly_distribute(self):
        negative = list(filter(lambda x: x.propaganda == str(Propaganda.NEGATIVE), self.reviews))
        positive = list(filter(lambda x: x.propaganda == str(Propaganda.POSITIVE), self.reviews))
        negative_shrunk = negative[:len(positive)]
        self.reviews = positive + negative_shrunk
        random.shuffle(self.reviews)

# Split train dataset into train, validation and test sets

In [4]:
# step 2.1: Load Data
reviews = []
data  = pd.read_excel('Data/finalDataset.xlsx', engine='openpyxl')
df = pd.DataFrame(data.astype(str) , columns = ['Sentence','SUBJprop'])
# iterate elements of attribute "Sentence" and "SUBJprop" and push to the array "reviews"

for index, row in df.iterrows():
    sentence = row['Sentence']
    prop = row['SUBJprop']
    reviews.append(Review(sentence,prop))


print("Total Rows:")
print(len(reviews))
print("Total Positive:")
print(len(list(filter(lambda x: x.propaganda == str(Propaganda.POSITIVE), reviews))))
print("Total Negative:")
print(len(list(filter(lambda x: x.propaganda == str(Propaganda.NEGATIVE), reviews))))

Total Rows:
14058
Total Positive:
3904
Total Negative:
10154


In [5]:
# step 2.2: Load Sentimental Data
SentimentWords= []
SentimentValue= []


Sentimentdata  = pd.read_excel('Data/SentimentalWords.xlsx', engine='openpyxl')
df = pd.DataFrame(Sentimentdata.astype(str) , columns = ['word','value'])

for index, row in df.iterrows():
    word = row['word']
    value = row['value']
    SentimentWords.append(word)
    SentimentValue.append(value)

In [6]:
neg_prop = list(filter(lambda x: x.propaganda == str(Propaganda.NEGATIVE), reviews))
pos_prop = list(filter(lambda x: x.propaganda == str(Propaganda.POSITIVE), reviews))
########################################################################################
#split trainig and DevTest dataset
neg_train, neg_devtest  = train_test_split(neg_prop , train_size=0.7, shuffle= False )
pos_train, pos_devtest = train_test_split(pos_prop , train_size=0.7, shuffle= False )
########################################################################################
#prepare training dataset
train = neg_train + pos_train
#random.shuffle(train)
########################################################################################
#prepare development and test dataset
neg_dev, neg_test = train_test_split(neg_devtest , train_size=0.5, shuffle= False )
pos_dev, pos_test = train_test_split(pos_devtest , train_size=0.5, shuffle= False )

dev = neg_dev + pos_dev
#random.shuffle(dev)

test = neg_test + pos_test
#random.shuffle(test)

In [7]:
# step 3: Seperate the attribute, originally our array has text and score. we want them to be a seperate array
train_container = ReviewContainer(train)
train_container.evenly_distribute()

train_text = train_container.get_sentence()   
train_labels = train_container.get_propaganda() 

dev_text = [x.sentence for x in dev]
dev_labels = [int(x.propaganda) for x in dev]

test_text = [x.sentence for x in test]
test_labels = [int(x.propaganda) for x in test]



### Save splited Data to seprate excel files

In [8]:
# Create Excel file of train dataset
train_df = pd.DataFrame(train_text, columns=["Sentence"])
train_df['SUBJprop'] = train_labels
train_df.to_excel("./Data/trainDataset.xlsx")
# Create Excel file of dev dataset
dev_df = pd.DataFrame(dev_text, columns=["Sentence"])
dev_df['SUBJprop'] = dev_labels
dev_df['Tanbih'] = ""
dev_df.to_excel("./Data/devDataset.xlsx")
# Create Excel file of test dataset
test_df = pd.DataFrame(test_text, columns=["Sentence"])
test_df['SUBJprop'] = test_labels
test_df.to_excel("./Data/testDataset.xlsx")

In [9]:
max_seq_len = 20


# Tokenization and Filtering Punctuation

In [10]:
from nltk.tokenize import RegexpTokenizer
tokenizerNLTK = RegexpTokenizer(r'\w+')

train_text_tokenized = []
dev_text_tokenized = []
test_text_tokenized = []


for i in range(len(train_text)):
    train_text_tokenized.append(tokenizerNLTK.tokenize(train_text[i])) 

for i in range(len(dev_text)):
    dev_text_tokenized.append(tokenizerNLTK.tokenize(dev_text[i])) 

for i in range(len(test_text)):
    test_text_tokenized.append(tokenizerNLTK.tokenize(test_text[i])) 
    

print(test_text_tokenized[0:5])



[['WHO', 'said', 'there', 'should', 'not', 'be', 'restrictions', 'on', 'international', 'travel', 'or', 'trade'], ['In', 'the', 'meantime', 'it', 's', 'important', 'to', 'prevent', 'oneself', 'from', 'contracting', 'the', 'virus'], ['Unfortunately', 'health', 'care', 'workers', 'are', 'at', 'the', 'greatest', 'risk', 'as', 'they', 'try', 'to', 'cull', 'the', 'suffering', 'which', 'is', 'why', 'they', 'will', 'be', 'the', 'first', 'to', 'get', 'the', 'experimental', 'vaccine'], ['Pope', 'Francis', 'sexual', 'abuse', 'commission', 'hit', 'by', 'resignations', 'and', 'criticism', 'gets', 'a', 'reboot'], ['Last', 'week', 'Pope', 'Francis', 'announced', 'he', 'was', 'reviving', 'a', 'panel', 'he', 'created', 'to', 'advise', 'the', 'Vatican', 'on', 'how', 'to', 'handle', 'sexual', 'abuse', 'by', 'clergy']]


# Removing stop words

In [11]:
import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()


# Create list of word tokens after removing stopwords in train_text
train_text1=[]
for sentence in train_text_tokenized:
    temp=[]
    for word in sentence:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            temp.append(word) 
    train_text1.append(temp)
    
# print(train_text1[:100])
# print("*****************************************************")

# Create list of word tokens after removing stopwords in dev_text
dev_text1 =[] 

for sentence in dev_text_tokenized:
    temp=[]
    for word in sentence:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            temp.append(word) 
    dev_text1.append(temp)
    
# print(dev_text1[:100])
# print("*****************************************************")

# # Create list of word tokens after removing stopwords in test_text
test_text1 =[] 

for sentence in test_text_tokenized:
    temp=[]
    for word in sentence:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            temp.append(word) 
    test_text1.append(temp)
# print(test_text1[:100])  


# Repetition

In [None]:
# train_text2=[]
# SentimentWords

# def checkIfDuplicates(listOfElems):
#     if len(listOfElems) == len(set(listOfElems)):
#         return False
#     else:
#         return True

# for sentence in train_text1:
#     temp=[]
#     if checkIfDuplicates(sentence):
#         print(sentence)



    # for word in sentence:
    #     temp.append(word)

    # train_text2.append(temp)

# print(train_text2[:10])

# Duplicate Sentimenal words

In [39]:
train_text2=[]

for sentence in train_text1:
    temp=[]
    for word in sentence:
        temp.append(word)
        for SentimentWord in SentimentWords:
            if word.lower()==SentimentWord:
                temp.append(word)
                temp.append(word)

    train_text2.append(temp)

print(train_text2[:10])

[['important', 'theology', 'sin'], ['Editor', 's', 'Note', 'Remnant', 'lunched', 'StopTheSynod', 'petition', 'Change', 'org'], ['seriously', 'sanctity', 'institution', 'individuals', 'continue', 'suffer', 'wages', 'sin'], ['terrible', 'insult', 'spark', 'decisions', 'Japan', 'proceed', 'ahead', 'covert', 'nuclear', 'program'], ['art', 'deal'], ['depends'], ['Rush', 'Limbaugh', 'points', 'Senate', 'bent', 'backwards', 'accommodate', 'accuser'], ['Khecharem', 'predicted', 'happen', 'course', 'elimination', 'filthy', 'Jewish', 'entity', 'liberation', 'lands', 'direct', 'colonization', 'like', 'Kashmir'], ['Concerned', 'clergy', 'laity', 'Catholic', 'world', 'according', 'stations'], ['Ms', 'Ford', 's', 'unsubstantiated', 'accusation', 'completely', 'destroy', 'Judge', 'Kavanaugh', 's', 'life', 'causing', 'irreparable', 'damage', 'reputation', 'integrity', 'good', 'character', 'career', 'built', 'decades', 'public', 'service']]


# Detokenizer

In [13]:
train_text_detokenized=[]
for i in range(len(train_text2)):
    temp=""
    for j in range(len(train_text2[i])):
        temp=temp + " " + train_text2[i][j]        
    train_text_detokenized.append(temp)
# print(train_text_detokenized[:100])



dev_text_detokenized=[]
for i in range(len(dev_text1)):
    temp=""
    for j in range(len(dev_text1[i])):
        temp=temp + " " + dev_text1[i][j]        
    dev_text_detokenized.append(temp)
# print(dev_text_detokenized[:100])



test_text_detokenized=[]
for i in range(len(test_text1)):
    temp=""
    for j in range(len(test_text1[i])):
        temp=temp + " " + test_text1[i][j]        
    test_text_detokenized.append(temp)
print(test_text_detokenized[:100])

[' said restrictions international travel trade', ' meantime s important prevent oneself contracting virus', ' Unfortunately health care workers greatest risk try cull suffering experimental vaccine', ' Pope Francis sexual abuse commission hit resignations criticism gets reboot', ' week Pope Francis announced reviving panel created advise Vatican handle sexual abuse clergy', ' issue dogged Roman Catholic Church recent years critics accused Francis predecessors John Paul II Benedict XVI failing aggressively weed punish predator priests', ' s look drove panel s creation expect future', ' Pope Francis commission sexual abuse', ' Following criticism focusing halting child abuse church Francis created Pontifical Commission Protection Minors March 2014 naming Cardinal Sean O Malley archbishop Boston run', ' O Malley member pope s G9 group close advisors man sought clean abuse scandal Boston departure predecessor Cardinal Bernard Law shifting predator priests new posts exposed Boston Globe ch

In [14]:

# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text_detokenized,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    dev_text_detokenized,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text_detokenized,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)



# Convert Integer Sequences to Tensors

In [15]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels)

print(train_seq[0])

# for validation set
dev_seq = torch.tensor(tokens_val['input_ids'])
dev_mask = torch.tensor(tokens_val['attention_mask'])
dev_y = torch.tensor(dev_labels)

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels)


print(test_seq)
print(test_mask)
print(test_y)

tensor([ 101, 2590, 8006, 8254,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
tensor([[  101,  2056,  9259,  ...,     0,     0,     0],
        [  101, 12507,  1055,  ...,     0,     0,     0],
        [  101,  6854,  2740,  ...,     0,     0,     0],
        ...,
        [  101,  3218, 16263,  ...,     0,     0,     0],
        [  101,  2113,  3599,  ...,     0,     0,     0],
        [  101,  6715,  2015,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([0, 0, 0,  ..., 1, 1, 1])


# Classifiers

## SVM

In [16]:

from sklearn import svm

clf_svm = svm.SVC(kernel='rbf',C=1, probability=True)
clf_svm.fit(train_seq, train_y)

# clf_svm.predict(test_seq)
# print(len(test_seq))


SVC(C=1, probability=True)

## Decision Tree

In [17]:

from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_seq , train_y)

clf_dec.predict(test_seq)

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

## Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_seq , train_y)

clf_gnb.predict(test_seq)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_seq, train_y)

clf_log.predict(test_seq)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# Evaluation

# F1

In [20]:
from sklearn.metrics import f1_score

# For Support Vector Machine
print(f1_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Support Decision Tree
print(f1_score(test_y,clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Support Naive Bayes
print(f1_score(test_y,clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Logistic Regression
print(f1_score(test_y,clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))


[0.41007698 0.69795772]
[0.40204212 0.64681493]
[0.37735849 0.73134328]
[0.39106145 0.68723099]


## Mean Accuracy

In [21]:
# For Support Vector Machine
print(clf_svm.score(test_seq,test_y))
# For Decision Tree
print(clf_dec.score(test_seq,test_y))
# For Decision Naive Bayes
print(clf_gnb.score(test_seq,test_y))
# For Logistic Regression
print(clf_log.score(test_seq,test_y))

0.6004739336492891
0.5559241706161138
0.6246445497630332
0.5867298578199052


## Precision

In [22]:
# Precision
from sklearn.metrics import precision_score

# For Support Vector Machine
print(precision_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Tree
print(precision_score(test_y, clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Naive Bayes
print(precision_score(test_y, clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Logistic Regression
print(precision_score(test_y, clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

[0.34756821 0.76874507]
[0.32110092 0.75996457]
[0.34985423 0.75702247]
[0.33096927 0.75791139]


## Recall

In [23]:
from sklearn.metrics import recall_score

# For Support Vector Machine
print(recall_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Tree
print(recall_score(test_y, clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Naive Bayes
print(recall_score(test_y, clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Logistic Regression
print(recall_score(test_y, clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

[0.5        0.63910761]
[0.53754266 0.56299213]
[0.40955631 0.70734908]
[0.4778157  0.62860892]


# VIP : Predict_Proba using Threshold

In [24]:
AboveThresholdsvm = [] 
AboveThresholdsvmbrop = []
# [0][1] = Postivie
# [0][0] = Negative


for i in range(len(test_seq)):
  if clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][0]>0.60:
    AboveThresholdsvm.append(test_text[i])
    AboveThresholdsvmbrop.append(test_labels[i])



print(len(AboveThresholdsvm))


466


In [28]:
AboveThresholdLR = [] 
AboveThresholdLRbrop = []
# [0][1] = Postivie
# [0][0] = Negative


for i in range(len(test_seq)):
  if clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][0]>0.60:
    AboveThresholdLR.append(test_text[i])
    AboveThresholdLRbrop.append(test_labels[i])



print(len(AboveThresholdLR))

75


In [26]:
# Bert
tokens_testsvm = tokenizer.batch_encode_plus(
    AboveThresholdsvm,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)
test_seqsvm = torch.tensor(tokens_testsvm['input_ids'])
test_masksvm = torch.tensor(tokens_testsvm['attention_mask'])
test_ysvm = torch.tensor(AboveThresholdsvmbrop)

tokens_testLR = tokenizer.batch_encode_plus(
    AboveThresholdLR,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)


test_seqLR = torch.tensor(tokens_testLR['input_ids'])
test_maskLR = torch.tensor(tokens_testLR['attention_mask'])
test_yLR = torch.tensor(AboveThresholdLRbrop)



## Evaluation

In [29]:
#F1
print("F1 SVM more than 60% threshold")
print(f1_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("F1 LR more than 60% threshold")
print(f1_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

F1 SVM more than 60% threshold
[0.43577982 0.50403226]
F1 LR more than 60% threshold
[0.388      0.71823204]


In [30]:
# Accuracy
# For Support Vector Machine
print("Accuracy SVM more than 60% threshold")
print(clf_svm.score(test_seqsvm,test_ysvm))
print("Accuracy LR more than 60% threshold")
print(clf_log.score(test_seqLR,test_yLR))

Accuracy SVM more than 60% threshold
0.4721030042918455
Accuracy LR more than 60% threshold
0.6141235813366961


In [31]:
# Precision
from sklearn.metrics import precision_score
# For Support Vector Machine
print("Precision SVM more than 60% threshold")
print(precision_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
print("Precision LR more than 60% threshold")
print(precision_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))


Precision SVM more than 60% threshold
[0.33807829 0.67567568]
Precision LR more than 60% threshold
[0.34892086 0.75728155]


In [32]:
# Recall
from sklearn.metrics import recall_score
print("Precision SVM more than 60% threshold")
print(recall_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("Precision LR more than 60% threshold")
print(recall_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

Precision SVM more than 60% threshold
[0.61290323 0.40192926]
Precision LR more than 60% threshold
[0.43693694 0.68301226]


## Bagging

In [1]:
Output=[]
for i in range(len(test_seq)):
  Vote=0
  if clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
    Vote=Vote+1
  elif clf_dec.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
    Vote=Vote+1
  elif clf_gnb.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
    Vote=Vote+1
  elif clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
    Vote=Vote+1


  if (Vote>1):
    Output.append("Propaganda")
  else:
    Output.append("nonPropaganda")


print(Output[:200])



NameError: name 'test_seq' is not defined

In [47]:
CountPropaganda=0
for i in range(len(Output)):
    if (Output[i]=="Propaganda"):
        CountPropaganda=CountPropaganda+1

print("number of propandas in output:")
print(CountPropaganda)

number of propandas in output:
0
