In [None]:
#This is the full pipeline for the text anaylsis
#There are two main components of analysis.
    #1. Persuasion detection 
    #2. Analysis to classify arguments

In [1]:
#Import statements
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import xgboost, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
#Part 1: Persuasion detection
#In this part, we are using a classified machine learning algorithm. It is
#Trained on ~70k reddit posts/comments that were gathered using PRAW

In [None]:
#Setting up DataFrames

In [2]:
data = pd.read_csv('etc/persuasionExamples6.csv', encoding = "latin1", engine='python', usecols=['body', 'containsPersuasion'])
data['containsPersuasion'] = np.where(data['containsPersuasion']=='[1]', 1, 0)
data = data.astype('U')
trainDF = pd.DataFrame()
trainDF['body'] = data['body']
trainDF['containsPersuasion'] = data['containsPersuasion']
data['containsPersuasion'].value_counts()

0    47303
1    30890
Name: containsPersuasion, dtype: int64

In [3]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['body'], trainDF['containsPersuasion'])
train_x = train_x.astype('U')
valid_x = valid_x.astype('U')
train_y = train_y.astype('U')
valid_y = valid_y.astype('U')

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [4]:
#Setting up Count Vectorizer used in all 'XXCV' models.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['body'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [5]:
#Setting up Word Vectorizer used in all 'XXWV' models.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['body'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [6]:
#Setting up N-gram Vectorizer used in all 'XXNV' models.
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['body'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [7]:
#Training
def train_model(classifier, feature_vector_train, label):#, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    return classifier
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [8]:
#Creates model using above function, notice which training sets are passed for which model
# Naive Bayes on Count Vectors
nbcv = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y)
predictions = nbcv.predict(xvalid_count)
print("NBCV: ", metrics.accuracy_score(predictions, valid_y))

# Naive Bayes on Word Level TF IDF Vectors
nbwv = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y)
predictions = nbwv.predict(xvalid_tfidf)
print("NBWV: ", metrics.accuracy_score(predictions, valid_y))

# Naive Bayes on Ngram Level TF IDF Vectors
nbnv = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y)
predictions = nbnv.predict(xvalid_tfidf_ngram)
print("NBNV: ", metrics.accuracy_score(predictions, valid_y))

NBCV:  0.7810118164612001
NBWV:  0.8701723873343905
NBNV:  0.8388664381809812


In [9]:
# Linear Classifier on Count Vectors
lrcv = train_model(linear_model.LogisticRegression(max_iter=1000000), xtrain_count, train_y)
predictions = lrcv.predict(xvalid_count)
print("LRCV: ", metrics.accuracy_score(predictions, valid_y))

# Linear Classifier on Word Level TF IDF Vectors
lrwv = train_model(linear_model.LogisticRegression(max_iter=1000000), xtrain_tfidf, train_y)
predictions = lrwv.predict(xvalid_tfidf)
print("LRWV: ", metrics.accuracy_score(predictions, valid_y))

# Linear Classifier on Ngram Level TF IDF Vectors
lrnv = train_model(linear_model.LogisticRegression(max_iter=1000000), xtrain_tfidf_ngram, train_y)
predictions = lrnv.predict(xvalid_tfidf_ngram)
print("LRNV: ", metrics.accuracy_score(predictions, valid_y))

LRCV:  0.9116067318021382
LRWV:  0.9083329070540692
LRNV:  0.8654662642590414


In [27]:
#Manual testing, now that we have the classifiers trained, we can pass in our own tests.
man_data = pd.read_csv('etc/testSet3.csv', encoding = "latin1", engine='python', usecols=['body', 'containsPersuasion'])
man_x = man_data.body
man_y = man_data.containsPersuasion
#Have to use previous vectors.transform(man_x) to get right demensiosn.
man_x_cv = count_vect.transform(man_x)
man_x_wv = tfidf_vect.transform(man_x)
man_x_nv = tfidf_vect_ngram.transform(man_x)

In [28]:
predictions = nbcv.predict(man_x_cv)
print("NBCV: ", metrics.accuracy_score(predictions, man_y))

NBCV:  0.8


In [29]:
predictions = nbwv.predict(man_x_wv)
print("NBWV: ", metrics.accuracy_score(predictions, man_y))

NBWV:  0.64


In [30]:
predictions = nbnv.predict(man_x_nv)
print("NBNV: ", metrics.accuracy_score(predictions, man_y))

NBNV:  0.76


In [31]:
predictions = lrcv.predict(man_x_cv)
print("LRCV: ", metrics.accuracy_score(predictions, man_y))

LRCV:  0.64


In [32]:
predictions = lrwv.predict(man_x_wv)
print("LRWV: ", metrics.accuracy_score(predictions, man_y))

LRWV:  0.64


In [33]:
predictions = lrnv.predict(man_x_nv)
print("LRNV: ", metrics.accuracy_score(predictions, man_y))

LRNV:  0.66


In [34]:
#Printing off confusion matrix of a specific algorithm
#[is not persuasive and guessed right, is not persuasive but guessed wrong]
#[Is persuasive but guessed wrong, is persuasicve and guessed right]
from sklearn.metrics import confusion_matrix
predictions = nbnv.predict(man_x_nv)
confusion_matrix = confusion_matrix(man_y, predictions)
print(confusion_matrix)

[[22  3]
 [ 9 16]]


In [None]:
#Part two: Analysis to classify argument 
#In this part, we take all examples that were marked as persuasivem and do
#Further analysis on them to estimate the classification of argument (Between 
#Logos, Ethos, and Pathos). Originally we wanted to do this via an unsupervised
#algorithm, but have switched to a range of manual tests.