In [27]:
import logging

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [18]:
# Location of the word2vec-format binary that provides the word embeddings.
WORD2VEC_PATH = "model/GoogleNews-vectors-negative300.bin.gz"

# Read the embeddings into a KeyedVectors object shared by the cells below.
wv = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

# NOTE(review): init_sims(replace=True) is expected to precompute the
# normalised vectors that word_averaging reads via `syn0norm` and to discard
# the raw ones -- confirm against the installed gensim version.
wv.init_sims(replace=True)

def word_averaging(wv, words):
    """Collapse a sequence of tokens into a single averaged word vector.

    Parameters
    ----------
    wv : keyed-vectors object
        Must expose ``vocab`` (token -> entry with an ``index`` attribute),
        ``syn0norm`` (vector matrix indexable by that index) and
        ``vector_size``.
        NOTE(review): these look like the pre-4.0 gensim attribute names --
        confirm against the installed gensim version.
    words : iterable
        Tokens to look up. Items that are already ``np.ndarray`` are used as
        vectors directly; tokens missing from ``wv.vocab`` are skipped.

    Returns
    -------
    np.ndarray
        ``gensim.matutils.unitvec`` of the element-wise mean of the collected
        vectors, cast to float32. When no vector could be collected, a
        warning is logged and a zero vector of length ``wv.vector_size``
        (numpy's default float dtype) is returned instead.
    """
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            # Caller supplied a ready-made vector; use it as is.
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        # The logger is resolved here: the notebook defines no module-level
        # `logger`, so referencing one raised NameError on this branch.
        logging.getLogger(__name__).warning("cannot compute similarity with no input %s", words)
        # TODO: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Build a feature matrix with one averaged word vector per document.

    Parameters
    ----------
    wv : keyed-vectors object
        Passed through to ``word_averaging``; ``wv.vector_size`` is also read
        here to shape the result when there are no documents.
    text_list : iterable
        Each item is one document's tokens, as accepted by ``word_averaging``.

    Returns
    -------
    np.ndarray
        The per-document vectors stacked row-wise with ``np.vstack``. For an
        empty ``text_list`` an empty float32 array of shape
        ``(0, wv.vector_size)`` is returned.
    """
    vectors = [word_averaging(wv, post) for post in text_list]
    if not vectors:
        # np.vstack raises ValueError on an empty sequence; hand back a
        # well-shaped empty matrix instead.
        return np.empty((0, wv.vector_size), dtype=np.float32)
    return np.vstack(vectors)

def w2v_tokenize_text(text):
    """Split ``text`` into word tokens, keeping only tokens of length >= 2.

    The text is first split into sentences with ``nltk.sent_tokenize`` and
    each sentence is then split with ``nltk.word_tokenize`` (both with
    ``language='english'``). Tokens are returned in order of appearance as a
    flat list.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [22]:
np.random.seed(500)

# Load the ';'-separated dataset and clean it in one non-mutating chain so the
# cell gives the same result every time it is re-run.
df = (
    pd.read_csv('model/moral-data.csv', sep=';')
    # drop() returns a new frame: its result has to be kept, otherwise the
    # columns silently stay in place.
    .drop(['deontic_modality', 'type'], axis=1)
    .rename(columns={'general_rule': 'labels'})
    # Step - a : Remove rows with a missing text. dropna on the frame (not on
    # the column alone) is what actually removes the rows.
    .dropna(subset=['text'])
    # Step - b : Change all the text to lower case
    .assign(text=lambda frame: frame['text'].str.lower())
)

# 70/30 split with a fixed random_state so the partition is reproducible.
train, test = model_selection.train_test_split(df, test_size=0.3, random_state=42)

# Tokenise each document; .values yields an array with one token list per row.
test_tokenized = test['text'].apply(w2v_tokenize_text).values
train_tokenized = train['text'].apply(w2v_tokenize_text).values

# One averaged word vector per document.
X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)

  


In [26]:
# Fit a logistic regression on the averaged word vectors of the training split.
# NOTE(review): C=1e5 means very weak regularisation in scikit-learn (C is the
# inverse regularisation strength) -- confirm this is intended.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['labels'])
predictions_lr = logreg.predict(X_test_word_average)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Logistic Regression Accuracy Score -> ", accuracy_score(test['labels'], predictions_lr)*100)

accuracy 0.8095238095238095




In [None]:
# MultinomialNB only accepts non-negative features, and averaged word2vec
# vectors contain negative components, so the Gaussian variant is used for
# these continuous features.
nb = GaussianNB()
nb = nb.fit(X_train_word_average, train['labels'])

# Evaluate on the held-out split built by the data-preparation cell; the
# previously referenced test_X / test_Y are not defined in any cell shown here.
predictions_nb = nb.predict(X_test_word_average)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Naive Bayes Accuracy Score -> ", accuracy_score(test['labels'], predictions_nb)*100)