# Install Transformers Library



In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# specify GPU
device = torch.device("cpu")

# Load Dataset


# Split train dataset into train, validation and test sets

In [3]:
#step 1: Predefined CLass
import random

class Propaganda:
    NEGATIVE = 0
    POSITIVE = 1

class Review:
    def __init__(self,sentence,SUBJprop):
        self.sentence = sentence
        self.SUBJprop = SUBJprop
        self.propaganda = self.get_propaganda()        

    def get_propaganda(self):
        if int(self.SUBJprop) <=3 :
            return Propaganda.NEGATIVE
        else:
            return Propaganda.POSITIVE


class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews

    def get_sentence(self):
        return [x.sentence for x  in self.reviews]

    def get_propaganda(self):
        return [x.propaganda for x in self.reviews]

    def evenly_distribute(self):
        negative = list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, self.reviews))
        negative_shrunk = negative[:len(positive)]
        self.reviews = positive + negative_shrunk
        random.shuffle(self.reviews)

In [4]:
# step 2: Load Data
import pandas as pd
import numpy as np
import string
from string import digits


reviews = []
data  = pd.read_excel('Data/finalDataset.xlsx', engine='openpyxl')
df = pd.DataFrame(data.astype(str) , columns = ['Sentence','SUBJprop'])
# iterate elements of attribute "Sentence" and "SUBJprop" and push to the array "reviews"

for index, row in df.iterrows():
    sentence = row['Sentence']
    prop = row['SUBJprop']
    reviews.append(Review(sentence,prop))


print("Total Rows:")
print(len(reviews))
print("Total Positive:")
print(len(list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, reviews))))
print("Total Negative:")
print(len(list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, reviews))))

# print(reviews[0].getSUBJprop)
# sentenceArray = [ x.sentence for x  in reviews] 

Total Rows:
16774
Total Positive:
3780
Total Negative:
12994


In [5]:
# step 2: Prep Data (split into train and test set)
from sklearn.model_selection import train_test_split

neg_prop = list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, reviews))
pos_prop = list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, reviews))

########################################################################################
#split trainig and DevTest dataset
neg_train, neg_devtest  = train_test_split(neg_prop , train_size=0.7 ,shuffle=False)
pos_train, pos_devtest = train_test_split(pos_prop , train_size=0.7,shuffle = False)
########################################################################################
#prepare training dataset
train = neg_train + pos_train
#random.shuffle(train)
print(train[0].sentence)
########################################################################################
#prepare development and test dataset
neg_dev, neg_test = train_test_split(neg_devtest , train_size=0.5,shuffle = False )
pos_dev, pos_test = train_test_split(pos_devtest , train_size=0.5,shuffle = False)

dev = neg_dev + pos_dev
#random.shuffle(dev)
print(dev[0].sentence)
test = neg_test + pos_test
#random.shuffle(test)
print(test[0].sentence)
print(len(dev))
########################################################################################

miley and liam fighting false rumors swirl that theyre in a feud over a supposed prenup
Pietro Card.
Nothing is said of increased intensity of devotion to the charism, study of the foundress or returning to sources, still less of strengthening the Carmelites’ traditional independent self-governance or reconsidering the wisdom of “the path taken”.
2516


In [6]:
# step 3: Seperate the attribute, originally our array has text and score. we want them to be a seperate array
train_container = ReviewContainer(train)
train_container.evenly_distribute()

train_text = train_container.get_sentence()   
train_labels = train_container.get_propaganda() 

dev_text = [x.sentence for x in dev]
dev_labels = [x.propaganda for x in dev]

test_text = [x.sentence for x in test]
test_labels = [x.propaganda for x in test]



# Import BERT Model and BERT Tokenizer

In [7]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Tokenization


In [8]:
max_seq_len = 20


In [9]:

# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    dev_text,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)



# Convert Integer Sequences to Tensors

In [10]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels)

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels)

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels)

print(test_seq)
print(test_mask)
print(test_y)

tensor([[  101,  2498,  2003,  ...,  2179,  8303,   102],
        [  101,  5262,  1010,  ..., 12645, 10462,   102],
        [  101,  1996,  3189,  ...,  2871,  2086,   102],
        ...,
        [  101,  1996,  2155,  ...,  2214,  2775,   102],
        [  101,  2019,  4358,  ...,  1010,  2429,   102],
        [  101,  1996,  7865,  ..., 11595,  1010,   102]])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])
tensor([0, 0, 0,  ..., 1, 1, 1])


# Classifiers

## SVM

In [11]:

from sklearn import svm

clf_svm = svm.SVC(kernel='rbf',C=1, probability=True)
clf_svm.fit(train_seq, train_y)

clf_svm.predict(test_seq)

array([0, 1, 1, ..., 0, 1, 0], dtype=int64)

## Decision Tree

In [12]:

from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_seq , train_y)

clf_dec.predict(test_seq)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

## Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_seq , train_y)

clf_gnb.predict(test_seq)

array([0, 1, 0, ..., 1, 1, 1], dtype=int64)

## Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_seq, train_y)

clf_log.predict(test_seq)

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

# Evaluation

# F1

In [15]:
from sklearn.metrics import f1_score

# For Support Vector Machine
print(f1_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Support Decision Tree
print(f1_score(test_y,clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Support Naive Bayes
print(f1_score(test_y,clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Logistic Regression
print(f1_score(test_y,clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))


[0.33864916 0.51412819]
[0.3557371  0.25886373]
[0.34716644 0.47619048]
[0.33349034 0.51391274]


## Mean Accuracy

In [16]:
# For Support Vector Machine
print(clf_svm.score(test_seq,test_y))
# For Decision Tree
print(clf_dec.score(test_seq,test_y))
# For Decision Naive Bayes
print(clf_gnb.score(test_seq,test_y))
# For Logistic Regression
print(clf_log.score(test_seq,test_y))

0.4398092967818832
0.31068732618196265
0.4187524831148192
0.4378228049264998


## Precision

In [17]:
# Precision
from sklearn.metrics import precision_score

# For Support Vector Machine
print(precision_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Tree
print(precision_score(test_y, clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Naive Bayes
print(precision_score(test_y, clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Logistic Regression
print(precision_score(test_y, clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

[0.23067093 0.78361345]
[0.22530574 0.77493606]
[0.23237754 0.78884935]
[0.22750643 0.77835588]


## Recall

In [18]:
from sklearn.metrics import recall_score

# For Support Vector Machine
print(recall_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Tree
print(recall_score(test_y, clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Naive Bayes
print(recall_score(test_y, clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Logistic Regression
print(recall_score(test_y, clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

[0.6366843 0.3825641]
[0.84479718 0.15538462]
[0.68606702 0.34102564]
[0.62433862 0.38358974]


# VIP : Predict_Proba using Threshold

In [25]:
AboveThresholdsvm = [] 
AboveThresholdsvmbrop = []
# [0][1] = Postivie
# [0][0] = Negative


for i in range(len(test_seq)):
  if clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][0]>0.60:
    AboveThresholdsvm.append(test_text[i])
    AboveThresholdsvmbrop.append(test_labels[i])



print(len(AboveThresholdsvm))


238


In [26]:
AboveThresholdLR = [] 
AboveThresholdLRbrop = []
# [0][1] = Postivie
# [0][0] = Negative


for i in range(len(test_seq)):
  if clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][0]>0.60:
    AboveThresholdLR.append(test_text[i])
    AboveThresholdLRbrop.append(test_labels[i])



print(len(AboveThresholdsvm))

238


In [29]:
# Bert
tokens_testsvm = tokenizer.batch_encode_plus(
    AboveThresholdsvm,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)
test_seqsvm = torch.tensor(tokens_testsvm['input_ids'])
test_masksvm = torch.tensor(tokens_testsvm['attention_mask'])
test_ysvm = torch.tensor(AboveThresholdsvmbrop)

tokens_testLR = tokenizer.batch_encode_plus(
    AboveThresholdLR,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)


test_seqLR = torch.tensor(tokens_testLR['input_ids'])
test_maskLR = torch.tensor(tokens_testLR['attention_mask'])
test_yLR = torch.tensor(AboveThresholdLRbrop)

## Evaluation

In [33]:
#F1
print("F1 SVM more than 60% threshold")
print(f1_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("F1 LR more than 60% threshold")
print(f1_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

F1 SVM more than 60% threshold
[0.23880597 0.70175439]
F1 LR more than 60% threshold
[0.25       0.80239521]


In [34]:
# Accuracy
# For Support Vector Machine
print("Accuracy SVM more than 60% threshold")
print(clf_svm.score(test_seqsvm,test_ysvm))
print("Accuracy LR more than 60% threshold")
print(clf_log.score(test_seqLR,test_yLR))

Accuracy SVM more than 60% threshold
0.5714285714285714
Accuracy LR more than 60% threshold
0.6872037914691943


In [35]:
# Precision
from sklearn.metrics import precision_score
# For Support Vector Machine
print("Precision SVM more than 60% threshold")
print(precision_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
print("Precision LR more than 60% threshold")
print(precision_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))


Precision SVM more than 60% threshold
[0.18181818 0.8       ]
Precision LR more than 60% threshold
[0.24444444 0.80722892]


In [37]:
# Recall
from sklearn.metrics import recall_score
print("Precision SVM more than 60% threshold")
print(recall_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("Precision LR more than 60% threshold")
print(recall_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

Precision SVM more than 60% threshold
[0.34782609 0.625     ]
Precision LR more than 60% threshold
[0.25581395 0.79761905]
