# step 1: Predefined CLass

In [1]:
import random

class Propaganda:
    NEGATIVE = "NEGATIVE"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self,sentence,SUBJprop):
        self.sentence = sentence
        self.SUBJprop = SUBJprop
        self.propaganda = self.get_propaganda()        

    def get_propaganda(self):
        if int(self.SUBJprop) <=3 :
            return Propaganda.NEGATIVE
        else:
            return Propaganda.POSITIVE


class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews

    def get_sentence(self):
        return [x.sentence for x  in self.reviews]

    def get_propaganda(self):
        return [x.propaganda for x in self.reviews]

    def evenly_distribute(self):
        negative = list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, self.reviews))
        negative_shrunk = negative[:len(positive)]
        self.reviews = positive + negative_shrunk
        random.shuffle(self.reviews)


# step 2: Load Data

In [2]:
import pandas as pd
import numpy as np
import nltk
import string
from string import digits

import nltk


reviews = []
data  = pd.read_excel('Data/finalDataset.xlsx', engine='openpyxl')
df = pd.DataFrame(data.astype(str) , columns = ['Sentence','SUBJprop'])
# iterate elements of attribute "Sentence" and "SUBJprop" and push to the array "reviews"

for index, row in df.iterrows():
    sentence = row['Sentence']
    prop = row['SUBJprop']
    reviews.append(Review(sentence,prop))


print("Total Rows:")
print(len(reviews))
print("Total Positive:")
print(len(list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, reviews))))
print("Total Negative:")
print(len(list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, reviews))))

# print(reviews[0].getSUBJprop)
# sentenceArray = [ x.sentence for x  in reviews] 

Total Rows:
16774
Total Positive:
3780
Total Negative:
12994


# step 2: Prep Data (split into train and test set)

In [3]:
from sklearn.model_selection import train_test_split

neg_prop = list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, reviews))
pos_prop = list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, reviews))

########################################################################################
#split trainig and DevTest dataset
neg_train, neg_devtest  = train_test_split(neg_prop , train_size=0.7, random_state = 42 )
pos_train, pos_devtest = train_test_split(pos_prop , train_size=0.7, random_state = 42 )
########################################################################################
#prepare training dataset
train = neg_train + pos_train
random.shuffle(train)
########################################################################################
#prepare development and test dataset
neg_dev, neg_test = train_test_split(neg_devtest , train_size=0.5, random_state = 42 )
pos_dev, pos_test = train_test_split(pos_devtest , train_size=0.5, random_state = 42 )

dev = neg_dev + pos_dev
random.shuffle(dev)

test = neg_test + pos_test
random.shuffle(test)
########################################################################################
print("Total Train:")
print(len(train))
print("Positive:")
print(len(pos_train))
print("Negative:")
print(len(neg_train))

print("\nTotal Development:")
print(len(dev))
print("Positive:")
print(len(pos_dev))
print("Negative:")
print(len(neg_dev))

print("\nTotal Test:")
print(len(test))
print("Positive:")
print(len(pos_test))
print("Negative:")
print(len(neg_test))

Total Train:
11741
Positive:
2646
Negative:
9095

Total Development:
2516
Positive:
567
Negative:
1949

Total Test:
2517
Positive:
567
Negative:
1950


# step 3: Seperate the attribute, originally our array has text and score. we want them to be a seperate array


In [4]:
train_container = ReviewContainer(train)
train_container.evenly_distribute()

train_sentence = train_container.get_sentence()   
train_propaganda = train_container.get_propaganda() 

development_sentence = [x.sentence for x in dev]
development_propaganda = [x.propaganda for x in dev]

test_sentence = [x.sentence for x in test]
test_propaganda = [x.propaganda for x in test]

print("Toalt rows after balance training dataset:")
print(len(train_sentence))

print("Positive Propaganda:")
print(train_propaganda.count(Propaganda.POSITIVE))

print("Negative Propaganda:")
print(train_propaganda.count(Propaganda.NEGATIVE))

Toalt rows after balance training dataset:
5292
Positive Propaganda:
2646
Negative Propaganda:
2646


# Cross Validation

In [5]:
# KFold cross validation
# Basic example (MORE INFO)

from sklearn.model_selection import KFold


kf=KFold(n_splits=3)
for train_index, test_index in kf.split(train_sentence):
    print(train_index, test_index)

[1764 1765 1766 ... 5289 5290 5291] [   0    1    2 ... 1761 1762 1763]
[   0    1    2 ... 5289 5290 5291] [1764 1765 1766 ... 3525 3526 3527]
[   0    1    2 ... 3525 3526 3527] [3528 3529 3530 ... 5289 5290 5291]


In [None]:

#StratifiedKFold is better when we have unbalanced data, it makes sure that in training there is sufficient for the smallest class
from sklearn.model_selection import StratifiedKFold 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB


# KFold cross validation - on our dataset

def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

folds = StratifiedKFold(n_splits=2)

scores_logistic = []
scores_svm = []
scores_rf = []
scores_nb = []

# passong training dataset to Cross Validation
Sample_Array_sentence = train_sentence
Sample_Array_propaganda = train_propaganda


# Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
Sample_Array_sentence_vectors = vectorizer.fit_transform(Sample_Array_sentence)


for train_index, test_index in folds.split(Sample_Array_sentence_vectors,Sample_Array_propaganda):
    X_train, X_test, y_train, y_test = Sample_Array_sentence_vectors[train_index], Sample_Array_sentence_vectors[test_index], \
                                       Sample_Array_propaganda[train_index], Sample_Array_propaganda[test_index]

    scores_logistic.append(get_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_train, X_test, y_train, y_test))  
    scores_svm.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))
    scores_nb.append(get_score(GaussianNB(), X_train.toarray(), X_test.toarray(), y_train, y_test))


print("Score of Logistic Regression")
print(scores_logistic)
print("Score of SVM")
print(scores_svm)
print("Score of RandomForest")
print(scores_rf)
print("Score of Naive Bayes")
print(scores_nb)

In [None]:
# this is the same what we have done before but with the Sklearn Package
# from sklearn.model_selection import cross_val_score
# cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), Sample_Array_sentence_vectors, Sample_Array_propaganda,cv=10)
# cross_val_score(SVC(gamma='auto'), Sample_Array_sentence_vectors, Sample_Array_propaganda,cv=10)
# cross_val_score(RandomForestClassifier(n_estimators=40),Sample_Array_sentence_vectors, Sample_Array_propaganda,cv=10)

## step 4: Bag Of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_sentence) # return a matrix of either 0 or 1 # train_x is our text
test_x_vectors = vectorizer.transform(test_sentence)

# Todo:
# transform development set to vectors
development_x_vectors = vectorizer.fit_transform(development_sentence)
print(train_sentence[0])
print(train_x_vectors[2517].toarray())
print(len(train_x_vectors.toarray()))

print(test_x_vectors[0].toarray())
print(len(test_x_vectors.toarray()))

print(development_x_vectors[0].toarray())
print(len(development_x_vectors.toarray()))

As I write this, the outrage is doing the opposite of dying down, and is surely a sign of how fed up Catholics – even those who would never identify themselves as Traditionalists – have become with this pope and his cadre of episcopal bullies.
[[0 0 0 ... 0 0 0]]
5292
[[0 0 0 ... 0 0 0]]
2517
[[0 0 0 ... 0 0 0]]
2516


# step 5: Classification SVM

In [7]:
from sklearn import svm

clf_svm = svm.SVC(kernel='rbf',C=1, probability=True)
clf_svm.fit(train_x_vectors, train_propaganda)
clf_svm.predict(test_x_vectors[1])

# SVMProbabilityPropaganda=[]


# i = 0
# while i < len(development_x_vectors.toarray()):
#   SVMProbabilityPropaganda.append(clf_svm.predict_proba(development_x_vectors[i]))
#   i += 1
# IndexError: row index (2517) out of range

# SVMProbabilityPropaganda.append(clf_svm.predict_proba(test_x_vectors[0]))


# print('the first item in array')
# print(SVMProbabilityPropaganda[0])
# print('the whole Array')
# print(SVMProbabilityPropaganda)

array(['NEGATIVE'], dtype='<U8')

In [None]:
# print(SVMProbabilityPropaganda[0])
# print(SVMProbabilityPropaganda[0][0])
# print(SVMProbabilityPropaganda[0][0][0])

# step 5: Decision Tree 

In [8]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_propaganda)

clf_dec.predict(test_x_vectors[1])

array(['NEGATIVE'], dtype='<U8')

# step 5: Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray() , train_propaganda)

clf_gnb.predict(test_x_vectors[1].toarray())

array(['NEGATIVE'], dtype='<U8')

# step 5: Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_propaganda)

clf_log.predict(test_x_vectors[1])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['NEGATIVE'], dtype='<U8')

# step 6: Evaluation

In [11]:
# Mean Accuracy
# For Support Vector Machine
print(clf_svm.score(test_x_vectors,test_propaganda))
# For Decision Tree
print(clf_dec.score(test_x_vectors,test_propaganda))
# For Decision Naive Bayes
print(clf_gnb.score(test_x_vectors.toarray(),test_propaganda))
# For Logistic Regression
print(clf_log.score(test_x_vectors,test_propaganda))

0.4819229241160111
0.5311879221295193
0.4489471593166468
0.5554231227651967


In [12]:
# F1 Scores
from sklearn.metrics import f1_score

# For Support Vector Machine
print(f1_score(test_propaganda, clf_svm.predict(test_x_vectors),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# trash for negative and neutral labels

# For Support Decision Tree
print(f1_score(test_propaganda,clf_dec.predict(test_x_vectors),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Support Naive Bayes
print(f1_score(test_propaganda,clf_gnb.predict(test_x_vectors.toarray()),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Logistic Regression
print(f1_score(test_propaganda,clf_log.predict(test_x_vectors),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))


[0.36015702 0.564753  ]
[0.34225195 0.63580247]
[0.37997318 0.50411155]
[0.37099494 0.6562212 ]


In [13]:
import numpy as np
# check number of positive and negative classes in training set
print("Number of NEGATIVE in training set:")
print(train_propaganda.count(Propaganda.NEGATIVE))
print("Number of POSITIVE in training set:")
print(train_propaganda.count(Propaganda.POSITIVE))


# check number of positive and negative classes in test set
y = np.array(test_propaganda)
number_of_NEGATIVE = (y == "NEGATIVE").sum()
number_of_POSITIVE = (y == "POSITIVE").sum()
print("\nNumber of NEGATIVE in test set:")
print(number_of_NEGATIVE)
print("Number of POSITIVE in test set:")
print(number_of_POSITIVE)

# check number of positive and negative classes in development set

y = np.array(development_propaganda)
Dnumber_of_NEGATIVE = (y == "NEGATIVE").sum()
Dnumber_of_POSITIVE = (y == "POSITIVE").sum()
print("\nNumber of NEGATIVE in development set:")
print(Dnumber_of_NEGATIVE)
print("Number of POSITIVE in development set:")
print(Dnumber_of_POSITIVE)


Number of NEGATIVE in training set:
2646
Number of POSITIVE in training set:
2646

Number of NEGATIVE in test set:
1950
Number of POSITIVE in test set:
567

Number of NEGATIVE in development set:
1949
Number of POSITIVE in development set:
567


# step 7: Tuning our model (with Grid Search)

In [None]:
from sklearn.model_selection import GridSearchCV

# tuned = svm.SVC()
parameters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32,64)}
svc = svm.SVC()
clf = GridSearchCV(svc,parameters,cv=10)
clf.fit(train_x_vectors,train_propaganda)
clf.cv_results_


In [None]:
import pandas as pd

df = pd.DataFrame(clf.cv_results_)
df

In [None]:
df[['param_C','param_kernel','mean_test_score']]

In [None]:
clf.best_params_