In [None]:
###IMPORTING STUFF###############################################################

In [2]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import xgboost, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
####SETTING UP DATA SETS##############################################################

In [3]:
data = pd.read_csv('persuasionExamples6.csv', encoding = "latin1", engine='python', usecols=['body', 'containsPersuasion'])
data['containsPersuasion'] = np.where(data['containsPersuasion']=='[1]', 1, 0)
data = data.astype('U')
trainDF = pd.DataFrame()
trainDF['body'] = data['body']
trainDF['containsPersuasion'] = data['containsPersuasion']
data['containsPersuasion'].value_counts()

0    47303
1    30890
Name: containsPersuasion, dtype: int64

In [4]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['body'], trainDF['containsPersuasion'])
train_x = train_x.astype('U')
valid_x = valid_x.astype('U')
train_y = train_y.astype('U')
valid_y = valid_y.astype('U')

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [None]:
####Count vectorizer used for all 'CV' models#######################################

In [5]:
#Count Vectorizer!!
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['body'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)


In [None]:
####Word vectorizer used for all 'WV' models#######################################

In [6]:
#word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['body'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [None]:
####N-Gram vectorizer used for all 'NV' models#######################################

In [7]:
#ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['body'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [None]:
#####Creates embedded words? I copied this form the website linked in teams###################

In [8]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('wiki-news-300d-1M.vec', encoding="utf8")):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['body'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
        

In [None]:
###Following method takes in a classifier, and trains is against the given input/expected vectors#############

In [31]:
#Training
def train_model(classifier, feature_vector_train, label):#, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    return classifier
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [None]:
####Creates model using above function, notice which training sets are passed for which model#######

In [35]:
# Naive Bayes on Count Vectors
nbcv = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y)
predictions = nbcv.predict(xvalid_count)
print("NBCV: ", metrics.accuracy_score(predictions, valid_y))

# Naive Bayes on Word Level TF IDF Vectors
nbwv = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y)
predictions = nbwv.predict(xvalid_tfidf)
print("NBWV: ", metrics.accuracy_score(predictions, valid_y))

# Naive Bayes on Ngram Level TF IDF Vectors
nbnv = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y)
predictions = nbnv.predict(xvalid_tfidf_ngram)
print("NBNV: ", metrics.accuracy_score(predictions, valid_y))

NBCV:  0.7834160315105632
NBWV:  0.8750319709448053
NBNV:  0.8443910174433474


In [36]:
# Linear Classifier on Count Vectors
lrcv = train_model(linear_model.LogisticRegression(max_iter=1000000), xtrain_count, train_y)
predictions = lrcv.predict(xvalid_count)
print("LRCV: ", metrics.accuracy_score(predictions, valid_y))

# Linear Classifier on Word Level TF IDF Vectors
lrwv = train_model(linear_model.LogisticRegression(max_iter=1000000), xtrain_tfidf, train_y)
predictions = lrwv.predict(xvalid_tfidf)
print("LRWV: ", metrics.accuracy_score(predictions, valid_y))

# Linear Classifier on Ngram Level TF IDF Vectors
lrnv = train_model(linear_model.LogisticRegression(max_iter=1000000), xtrain_tfidf_ngram, train_y)
predictions = lrnv.predict(xvalid_tfidf_ngram)
print("LRNV: ", metrics.accuracy_score(predictions, valid_y))

LRCV:  0.9185636093917847
LRWV:  0.9133459511995499
LRNV:  0.8675124047265845


In [62]:
#MANUAL, now that we have the classifiers trained, we can pass in our own tests.

In [48]:
man_data = pd.read_csv('testSet3.csv', encoding = "latin1", engine='python', usecols=['body', 'containsPersuasion'])
man_x = man_data.body
man_y = man_data.containsPersuasion
#Have to use previous vectors.transform(man_x) to get right demensiosn.
man_x_cv = count_vect.transform(man_x)
man_x_wv = tfidf_vect.transform(man_x)
man_x_nv = tfidf_vect_ngram.transform(man_x)

In [49]:
predictions = nbcv.predict(man_x_cv)
print("NBCV: ", metrics.accuracy_score(predictions, man_y))

NBCV:  0.75


In [50]:
predictions = nbwv.predict(man_x_wv)
print("NBWV: ", metrics.accuracy_score(predictions, man_y))

NBWV:  0.625


In [51]:
predictions = nbnv.predict(man_x_nv)
print("NBNV: ", metrics.accuracy_score(predictions, man_y))

NBNV:  0.8333333333333334


In [52]:
predictions = lrcv.predict(man_x_cv)
print("LRCV: ", metrics.accuracy_score(predictions, man_y))

LRCV:  0.5833333333333334


In [53]:
predictions = lrwv.predict(man_x_wv)
print("LRWV: ", metrics.accuracy_score(predictions, man_y))

LRWV:  0.6666666666666666


In [54]:
predictions = lrnv.predict(man_x_nv)
print("LRNV: ", metrics.accuracy_score(predictions, man_y))

LRNV:  0.7083333333333334
