# Install spacy and it's predefined model

In [None]:
pip install spacy
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz


Model Building

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

reviews = []
data  = pd.read_excel('~/anaconda3/MasterProject/Dataset.xlsx', engine='openpyxl')
df = pd.DataFrame(data.astype(str) , columns = ['Sentence','SUBJprop'])
# iterate elements of attribute "Sentence" and "SUBJprop" and push to the array "reviews"

for index, row in df.iterrows():
    sentence = row['Sentence']
    prop = row['SUBJprop']
    reviews.append(Review(sentence,prop))
sentenceArray = [ x.sentence for x  in reviews] 
print(sentenceArray)
print(len(sentenceArray))

In [None]:
sentence_vector =[]
for x in sentenceArray:
    sentence_vector.append(get_vec(x))
sentence_vector[0]


In [None]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp(x)
def get_vec(x):
      doc = nlp(x)
      vec = doc.vector
      return vec


# step 1: Predefined CLass

In [2]:
import random

class Propaganda:
    NEGATIVE = "NEGATIVE"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self,sentence,SUBJprop):
        self.sentence = sentence
        self.SUBJprop = SUBJprop
        self.propaganda = self.get_propaganda()        

    def get_propaganda(self):
        if int(self.SUBJprop) <=3 :
            return Propaganda.NEGATIVE
        else:
            return Propaganda.POSITIVE


class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews

    def get_sentence(self):
        return [x.sentence for x  in self.reviews]

    def get_propaganda(self):
        return [x.propaganda for x in self.reviews]

    def evenly_distribute(self):
        negative = list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, self.reviews))
        negative_shrunk = negative[:len(positive)]
        self.reviews = positive + negative_shrunk
        random.shuffle(self.reviews)


# step 2: Load Data

In [3]:
import pandas as pd
import numpy as np
import nltk
import string
from string import digits

import nltk


reviews = []
data  = pd.read_excel('~/anaconda3/MasterProject/Dataset.xlsx', engine='openpyxl')
df = pd.DataFrame(data.astype(str) , columns = ['Sentence','SUBJprop'])
# iterate elements of attribute "Sentence" and "SUBJprop" and push to the array "reviews"

for index, row in df.iterrows():
    sentence = row['Sentence']
    prop = row['SUBJprop']
    reviews.append(Review(sentence,prop))


print("Total Rows:")
print(len(reviews))
print("Total Positive:")
print(len(list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, reviews))))
print("Total Negative:")
print(len(list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, reviews))))

print(reviews[0].sentence)
# sentenceArray = [ x.sentence for x  in reviews] 

Total Rows:
3302
Total Positive:
105
Total Negative:
3197
miley and liam fighting false rumors swirl that theyre in a feud over a supposed prenup


# step 2: Prep Data (split into train and test set)

In [12]:
from sklearn.model_selection import train_test_split

neg_prop = list(filter(lambda x: x.propaganda == Propaganda.NEGATIVE, reviews))
pos_prop = list(filter(lambda x: x.propaganda == Propaganda.POSITIVE, reviews))

########################################################################################
#split trainig and DevTest dataset
neg_train, neg_devtest  = train_test_split(neg_prop , train_size=0.7, random_state = 42 )
pos_train, pos_devtest = train_test_split(pos_prop , train_size=0.7, random_state = 42 )
########################################################################################
#prepare training dataset
train = neg_train + pos_train
random.shuffle(train)
########################################################################################
#prepare development and test dataset
neg_dev, neg_test = train_test_split(neg_devtest , train_size=0.5, random_state = 42 )
pos_dev, pos_test = train_test_split(pos_devtest , train_size=0.5, random_state = 42 )

dev = neg_dev + pos_dev
random.shuffle(dev)

test = neg_test + pos_test
random.shuffle(test)
########################################################################################
print("Total Train:")
print(len(train))
print("Positive:")
print(len(pos_train))
print("Negative:")
print(len(neg_train))

print("\nTotal Development:")
print(len(dev))
print("Positive:")
print(len(pos_dev))
print("Negative:")
print(len(neg_dev))

print("\nTotal Test:")
print(len(test))
print("Positive:")
print(len(pos_test))
print("Negative:")
print(len(neg_test))

Total Train:
11741
Positive:
2646
Negative:
9095

Total Development:
2516
Positive:
567
Negative:
1949

Total Test:
2517
Positive:
567
Negative:
1950


# step 3: Seperate the attribute, originally our array has text and score. we want them to be a seperate array


In [7]:
train_container = ReviewContainer(train)
train_container.evenly_distribute()

train_sentence = train_container.get_sentence()   
train_propaganda = train_container.get_propaganda() 

development_sentence = [x.sentence for x in dev]
development_propaganda = [x.propaganda for x in dev]

test_sentence = [x.sentence for x in test]
test_propaganda = [x.propaganda for x in test]

print("Total rows after balance training dataset:")
print(len(train_sentence))

print("Positive Propaganda:")
print(train_propaganda.count(Propaganda.POSITIVE))

print("Negative Propaganda:")
print(train_propaganda.count(Propaganda.NEGATIVE))



Total rows after balance training dataset:
5292
Positive Propaganda:
2646
Negative Propaganda:
2646


# Cross Validation

In [None]:
# KFold cross validation
# Basic example (MORE INFO)

from sklearn.model_selection import KFold


kf=KFold(n_splits=3)
for train_index, test_index in kf.split(train_sentence):
    print(train_index, test_index)

In [6]:

#StratifiedKFold is better when we have unbalanced data, it makes sure that in training there is sufficient for the smallest class
from sklearn.model_selection import StratifiedKFold 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# import spacy
# import en_core_web_sm
# nlp = en_core_web_sm.load()
# def get_vec(x):
#       doc = nlp(x)
#       vec = doc.vector
#       return vec

# KFold cross validation - on our dataset

def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

folds = StratifiedKFold(n_splits=2)

scores_logistic = []
scores_svm = []
scores_rf = []
scores_nb = []
Sample_Array_sentence_vectors=[]
Sample_Array_sentence = np.concatenate((train_sentence, development_sentence))
Sample_Array_propaganda = np.concatenate((train_propaganda, development_propaganda))
Sample_Array_sentence_Array =Sample_Array_sentence.tolist()
print(type(Sample_Array_sentence_Array))

#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
Sample_Array_sentence_vectors = vectorizer.fit_transform(Sample_Array_sentence)


# # Word2Vec
# for x in Sample_Array_sentence_Array:
#     Sample_Array_sentence_vectors.append(get_vec(x))


for train_index, test_index in folds.split(Sample_Array_sentence_vectors,Sample_Array_propaganda):
    X_train, X_test, y_train, y_test = Sample_Array_sentence_vectors[train_index], Sample_Array_sentence_vectors[test_index], \
                                       Sample_Array_propaganda[train_index], Sample_Array_propaganda[test_index]

    scores_logistic.append(get_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_train, X_test, y_train, y_test))  
    scores_svm.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))
    scores_nb.append(get_score(GaussianNB(), X_train.toarray(), X_test.toarray(), y_train, y_test))


print("Score of Logistic Regression")
print(scores_logistic)
print("Score of SVM")
print(scores_svm)
print("Score of RandomForest")
print(scores_rf)
print("Score of Naive Bayes")
print(scores_nb)

<class 'list'>
Score of Logistic Regression
[0.5783811475409836, 0.5655737704918032]
Score of SVM
[0.5883709016393442, 0.5886270491803278]
Score of RandomForest
[0.5868340163934426, 0.5750512295081968]
Score of Naive Bayes
[0.5258709016393442, 0.5317622950819673]


In [None]:
# this is the same what we have done before but with the Sklearn Package
from sklearn.model_selection import cross_val_score
cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), Sample_Array_sentence_vectors, Sample_Array_propaganda,cv=10)
cross_val_score(SVC(gamma='auto'), Sample_Array_sentence_vectors, Sample_Array_propaganda,cv=10)
cross_val_score(RandomForestClassifier(n_estimators=40),Sample_Array_sentence_vectors, Sample_Array_propaganda,cv=10)

## step 2/5 NLTK with independent variable 

In [None]:
# word tokenize
from nltk.tokenize import word_tokenize
tokenized_docs = [word_tokenize(doc) for doc in sentenceArray_final]

print(tokenized_docs[1])
print("#######################################################################################")

#Sentence tokenization

from nltk.tokenize import sent_tokenize
sent_token = [sent_tokenize(doc) for doc in sentenceArray_final]
print(sent_token[1])

In [None]:
# Cleaning text of stopwords
from nltk.corpus import stopwords

tokenized_docs_no_stopwords = []

for doc in tokenized_docs:
    new_term_vector = []
    for word in doc:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
    tokenized_docs_no_stopwords.append(new_term_vector)

print(tokenized_docs_no_stopwords)

In [None]:
# Stemming and Lemmatization
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
wordnet = WordNetLemmatizer()

preprocessed_docs = []

for doc in tokenized_docs_no_stopwords:
    final_doc = []
    for word in doc:
        #final_doc.append(porter.stem(word))
        final_doc.append(wordnet.lemmatize(word))
    preprocessed_docs.append(final_doc)

print(preprocessed_docs)

## step 4: Bag Of Words

In [8]:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# train_x_vectors = vectorizer.fit_transform(train_sentence) # return a matrix of either 0 or 1 # train_x is our text
# test_x_vectors = vectorizer.transform(test_sentence)

# # Todo:
# # transform development set to vectors
# development_x_vectors = vectorizer.fit_transform(development_sentence)
# print(train_sentence[0])
# print(train_x_vectors[0].toarray())
# print(test_x_vectors[0].toarray())
# print(development_x_vectors[0].toarray())
## spacy word2vec
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
def get_vec(x):
      doc = nlp(x)
      vec = doc.vector
      return vec
# Word2Vec
train_x_vectors = []
for x in train_sentence:
     train_x_vectors.append(get_vec(x))

Step 4: Word2Vec Gensim

In [None]:
from gensim.models import Word2Vec,keyedvectors
import gensim
import pandas as pd
import nltk

clean_sentence = []
for sentence in sentenceArray:
    clean_sentence.append(gensim.utils.simple_preprocess(sentence))
print(clean_sentence[3])

# min_count: consider a word as input if it occurs minimun 5 times
# vector_size: means number of features a word capture like cat [] has feature animal 0.9 and furniture 0.01

model = Word2Vec(clean_sentence,vector_size=150,window=10,min_count=5,workers=10)
model.train(clean_sentence,total_examples=len(clean_sentence),epochs=10)
print(model)


In [None]:
vector = model.wv['wedding']
sims = model.wv.most_similar('wedding', topn=10)
print(sims)
# drawbacks on traing on own dataset: model has been triain on these words only, do not know
# other words, hard to get meaningful result from your own model

# step 5: Classification SVM

In [10]:
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
## svm
clf_svm = svm.SVC(kernel='rbf',C=1)
clf_svm.fit(train_x_vectors, train_propaganda)
#clf_svm.predict(test_x_vectors[11])

## decision tree
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_propaganda)
#clf_dec.predict(test_x_vectors[1])



DecisionTreeClassifier()

# step 5: Decision Tree 

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# clf_dec = DecisionTreeClassifier()
# clf_dec.fit(train_x_vectors, train_propaganda)

# clf_dec.predict(test_x_vectors[1])

# step 5: Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray() , train_propaganda)

clf_gnb.predict(test_x_vectors[1].toarray())

# step 5: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_propaganda)

clf_log.predict(test_x_vectors[1])

# step 5: Chisquare

In [9]:
from scipy.stats import chisquare

chi = []

for index, row in df.iterrows():
    sentence = row['Sentence']
    prop = int(row['SUBJprop'])
    chi.append(prop)

print(chisquare(chi))

Power_divergenceResult(statistic=1900.072243346008, pvalue=1.0)


# step 6: Evaluation

In [11]:
# Mean Accuracy
# # For Support Vector Machine
print(clf_svm.score(test_x_vectors,test_propaganda))
# For Decision Tree
print(clf_dec.score(test_x_vectors,test_propaganda))
# For Decision Naive Bayes
# print(clf_gnb.score(test_x_vectors.toarray(),test_propaganda))
# # For Logistic Regression
# print(clf_log.score(test_x_vectors,test_propaganda))

NameError: name 'test_x_vectors' is not defined

In [None]:
# F1 Scores
from sklearn.metrics import f1_score

# For Support Vector Machine
print(f1_score(test_propaganda, clf_svm.predict(test_x_vectors),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# trash for negative and neutral labels

# For Support Decision Tree
print(f1_score(test_propaganda,clf_dec.predict(test_x_vectors),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Support Naive Bayes
print(f1_score(test_propaganda,clf_gnb.predict(test_x_vectors.toarray()),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Logistic Regression
print(f1_score(test_propaganda,clf_log.predict(test_x_vectors),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))


In [None]:
import numpy as np
# check number of positive and negative classes in training set
print("Number of NEGATIVE in training set:")
print(train_propaganda.count(Propaganda.NEGATIVE))
print("Number of POSITIVE in training set:")
print(train_propaganda.count(Propaganda.POSITIVE))


# check number of positive and negative classes in test set
y = np.array(test_propaganda)
number_of_NEGATIVE = (y == "NEGATIVE").sum()
number_of_POSITIVE = (y == "POSITIVE").sum()
print("\nNumber of NEGATIVE in test set:")
print(number_of_NEGATIVE)
print("Number of POSITIVE in test set:")
print(number_of_POSITIVE)

# check number of positive and negative classes in development set

y = np.array(development_propaganda)
Dnumber_of_NEGATIVE = (y == "NEGATIVE").sum()
Dnumber_of_POSITIVE = (y == "POSITIVE").sum()
print("\nNumber of NEGATIVE in development set:")
print(Dnumber_of_NEGATIVE)
print("Number of POSITIVE in development set:")
print(Dnumber_of_POSITIVE)


In [None]:
test_set =['this is advertisement','bad book do not buy','horrible waste of time']
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

# step 7: Tuning our model (with Grid Search)

In [None]:
from sklearn.model_selection import GridSearchCV

# tuned = svm.SVC()
parameters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32,64)}
svc = svm.SVC()
clf = GridSearchCV(svc,parameters,cv=10)
clf.fit(train_x_vectors,train_propaganda)
clf.cv_results_


In [None]:
import pandas as pd

df = pd.DataFrame(clf.cv_results_)
df

In [None]:
df[['param_C','param_kernel','mean_test_score']]

In [None]:
clf.best_params_