In [55]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import xgboost, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [56]:
# Read only the two columns this notebook uses from the labelled corpus.
data = pd.read_csv(
    'persuasionExamples6.csv',
    encoding="latin1",
    engine='python',
    usecols=['body', 'containsPersuasion'],
)

# Rows whose raw label is the literal string '[1]' become 1; every other value becomes 0.
is_persuasive = data['containsPersuasion'] == '[1]'
data['containsPersuasion'] = np.where(is_persuasive, 1, 0)

# Working frame holding the text column and the binarised label column.
trainDF = data[['body', 'containsPersuasion']].copy()

# Last expression of the cell: shows how many rows fall in each class.
data['containsPersuasion'].value_counts()

0    47303
1    30890
Name: containsPersuasion, dtype: int64

In [57]:
# Hold out a validation split. The fixed seed makes the split (and every score
# computed from it) identical after Restart Kernel -> Run All.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['body'], trainDF['containsPersuasion'], random_state=42
)

# Fit the label mapping on the training labels only, then reuse that same
# mapping for the validation labels. Re-fitting on valid_y could assign
# different integer codes if the validation split happened to miss a class.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

In [58]:
#Count Vectorizer!!
# Tokens are runs of one or more word characters (token_pattern=r'\w{1,}').
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')

# Learn the vocabulary from the training split only, so that nothing about the
# validation texts leaks into the feature space; validation rows are then
# projected onto that training vocabulary.
# .astype('U') casts every entry to a unicode string before tokenising.
xtrain_count = count_vect.fit_transform(train_x.astype('U'))
xvalid_count = count_vect.transform(valid_x.astype('U'))


In [59]:
#word level tf-idf
# Word-level TF-IDF restricted to the 5000 highest-frequency terms.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

# Fit vocabulary and IDF weights on the training split only: fitting on the
# full corpus would let validation documents influence the IDF statistics
# (data leakage) and inflate the validation score.
xtrain_tfidf = tfidf_vect.fit_transform(train_x.astype('U'))
xvalid_tfidf = tfidf_vect.transform(valid_x.astype('U'))

In [60]:
#ngram level tf-idf
# TF-IDF over word bigrams and trigrams (ngram_range=(2,3)), capped at 5000 features.
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)

# Fit on the training split only so the n-gram vocabulary and IDF weights are
# not informed by validation documents (data leakage).
xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x.astype('U'))
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x.astype('U'))

In [None]:
# load the pre-trained word-embedding vectors
EMBEDDING_DIM = 300      # number of float components per word in the .vec file
MAX_SEQUENCE_LEN = 70    # every token sequence is padded/truncated to this length

embeddings_index = {}
# The context manager closes the (large) file deterministically. The encoding
# and newline arguments follow the loading snippet published with the fastText
# vectors, so parsing does not depend on the platform's default codec.
with open('wiki-news-300d-1M.vec', encoding='utf-8', newline='\n', errors='ignore') as vec_file:
    for line in vec_file:
        values = line.split()
        # Keep only well-formed records: one token followed by EMBEDDING_DIM
        # numbers. This skips the "<count> <dim>" header line, which would
        # otherwise be stored as a bogus one-element vector.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# create a tokenizer
token = text.Tokenizer()
token.fit_on_texts(trainDF['body'].astype('U'))
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x.astype('U')), maxlen=MAX_SEQUENCE_LEN)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x.astype('U')), maxlen=MAX_SEQUENCE_LEN)

# create token-embedding mapping
# Row i holds the pre-trained vector of the word whose tokenizer index is i;
# words without a pre-trained vector keep an all-zero row. The extra row
# (len + 1) leaves index 0 unused by word_index.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
        

In [39]:
#Training
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit ``classifier`` on the training data and return its validation accuracy.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train : array-like or sparse matrix
        Training features passed to ``fit``.
    label : array-like
        Training labels passed to ``fit``.
    feature_vector_valid : array-like or sparse matrix
        Validation features passed to ``predict``.
    is_neural_net : bool, default False
        When True, the output of ``predict`` is reduced with
        ``argmax(axis=-1)`` before scoring.
    valid_label : array-like, optional
        Ground-truth labels for ``feature_vector_valid``. When omitted, the
        notebook-level ``valid_y`` is used, which is only correct when
        ``feature_vector_valid`` was built from the main validation split.

    Returns
    -------
    The value returned by ``metrics.accuracy_score``.
    """
    # Fall back to the notebook-wide validation labels for existing callers.
    if valid_label is None:
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        # assumes predict() returned one score per class per row — TODO confirm
        predictions = predictions.argmax(axis=-1)

    # accuracy_score is documented as (y_true, y_pred).
    return metrics.accuracy_score(valid_label, predictions)

In [52]:
# Load the manually labelled set; only the text and label columns are read.
man_data = pd.read_csv('testSet3.csv', encoding="latin1", engine='python', usecols=['body', 'containsPersuasion'])
# NOTE(review): the main corpus maps the raw label string '[1]' to 1/0 before
# use; here the labels are used exactly as read — confirm both files share a
# label format.
man_trainDF = man_data[['body', 'containsPersuasion']].copy()

# Print explicitly: a bare expression in the middle of a cell is evaluated
# and then silently discarded, so the class balance was never shown.
print(man_data['containsPersuasion'].value_counts())

# Fixed seed so the manual split is reproducible across kernel restarts.
man_train_x, man_valid_x, man_train_y, man_valid_y = model_selection.train_test_split(
    man_trainDF['body'], man_trainDF['containsPersuasion'], random_state=42
)

# Re-use the already-fitted count vectorizer so the manual set is projected
# into the same feature space as the main corpus.
man_xtrain_count = count_vect.transform(man_train_x.astype('U'))
man_xvalid_count = count_vect.transform(man_valid_x.astype('U'))
                              

In [53]:
def man_train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit ``classifier`` and return its accuracy on a validation set.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train : array-like or sparse matrix
        Training features passed to ``fit``.
    label : array-like
        Training labels passed to ``fit``.
    feature_vector_valid : array-like or sparse matrix
        Validation features passed to ``predict``.
    is_neural_net : bool, default False
        When True, the output of ``predict`` is reduced with
        ``argmax(axis=-1)`` before scoring.
    valid_label : array-like, optional
        Ground-truth labels matching ``feature_vector_valid`` (for the manual
        set, pass ``man_valid_y``). When omitted, the notebook-level
        ``valid_y`` is used, which only lines up with features built from the
        main validation split.

    Returns
    -------
    The value returned by ``metrics.accuracy_score``.
    """
    # Scoring used to be hard-wired to the global valid_y, so any validation
    # matrix with a different number of rows failed with an "inconsistent
    # numbers of samples" ValueError. The default keeps old calls working.
    if valid_label is None:
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        # assumes predict() returned one score per class per row — TODO confirm
        predictions = predictions.argmax(axis=-1)

    # accuracy_score is documented as (y_true, y_pred).
    return metrics.accuracy_score(valid_label, predictions)

In [54]:
# Multinomial Naive Bayes, trained and scored once per feature representation.
# Each entry: (text printed before the score, training matrix, validation matrix).
nb_feature_sets = [
    ("NB, Count Vectors: ", xtrain_count, xvalid_count),            # raw token counts
    ("NB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),         # word-level TF-IDF
    ("NB, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),  # n-gram TF-IDF
]

for report_prefix, train_features, valid_features in nb_feature_sets:
    # A fresh classifier per representation so no fitted state is shared.
    accuracy = train_model(naive_bayes.MultinomialNB(), train_features, train_y, valid_features)
    print (report_prefix, accuracy)

ValueError: Found input variables with inconsistent numbers of samples: [18, 58644]

In [None]:
# MANUAL
# Multinomial Naive Bayes via man_train_model, once per feature representation.
# NOTE(review): the matrices and labels passed below are the main-corpus ones
# (xtrain_*/xvalid_*/train_y), not the man_* variables — confirm that is intended.
man_nb_runs = (
    ("NB, Count Vectors: ", xtrain_count, xvalid_count),            # raw token counts
    ("NB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),         # word-level TF-IDF
    ("NB, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),  # n-gram TF-IDF
)

for headline, fit_matrix, eval_matrix in man_nb_runs:
    nb_classifier = naive_bayes.MultinomialNB()
    accuracy = man_train_model(nb_classifier, fit_matrix, train_y, eval_matrix)
    print (headline, accuracy)