In [23]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [32]:
# Preprocessing: load the corpus and derive the 'text_processed' column that
# the classifiers below are trained on.
np.random.seed(500)

df_corpus = pd.read_csv('model/moral-data.csv', sep=';')
# drop()/rename() return new frames; assign the result so the change sticks.
df_corpus = df_corpus.drop(['deontic_modality', 'type'], axis=1)
df_corpus = df_corpus.rename(columns={'general_rule': 'labels'})

# Step - a : Remove rows whose text is blank.
# Calling dropna() on a selected column never removes rows from the frame, so
# filter the frame itself and renumber the index to stay contiguous.
df_corpus = df_corpus.dropna(subset=['text']).reset_index(drop=True)

# Step - b : Change all the text to lower case
df_corpus['text'] = [entry.lower() for entry in df_corpus['text']]

# Step - c : Tokenization
df_corpus['text'] = [word_tokenize(entry) for entry in df_corpus['text']]

# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize.
# Maps the first letter of a POS tag to a WordNet POS; anything that is not an
# adjective (J), verb (V) or adverb (R) falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Build these once: stopwords.words() returns a fresh list on every call, and
# a set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

processed_texts = []
for entry in df_corpus['text']:
    final_words = [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in stop_words and word.isalpha()
    ]
    # The lemma list is stored in its str() form (e.g. "['lie', 'bad']").
    processed_texts.append(str(final_words))

# Assign the whole column positionally instead of row-by-row .loc writes, which
# only line up when the index labels happen to equal the row positions.
df_corpus['text_processed'] = processed_texts

In [33]:
# Hold out 20% of the rows for evaluation.  random_state pins the shuffle so
# re-running this cell on its own reproduces the same split, instead of
# depending on how much of the global NumPy RNG stream has been consumed.
# NOTE: accuracy figures recorded from runs that used the globally seeded
# split may differ from what this fixed split produces.
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(
    df_corpus['text_processed'],
    df_corpus['labels'],
    test_size=0.2,
    random_state=500,
)

## Naive Bayes Classifier

In [34]:
# Token counts -> TF-IDF weighting -> multinomial Naive Bayes.
nb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(train_X, train_Y)
predictions_nb = nb.predict(test_X)

# accuracy_score expects (y_true, y_pred) in that order.
print("Naive Bayes Accuracy Score -> ", accuracy_score(test_Y, predictions_nb)*100)

Naive Bayes Accuracy Score ->  77.77777777777779


## Support Vector Machine

In [35]:
# Token counts -> TF-IDF weighting -> linear-kernel SVM.
# probability=True enables predict_proba(), which predict_class() relies on.
# degree and gamma have no effect with kernel='linear'; they are kept so the
# estimator's parameters stay exactly as before.
svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(C=1.0, kernel='linear', degree=3, gamma='auto', probability=True)),
])
svm.fit(train_X, train_Y)
predictions_svm = svm.predict(test_X)

# accuracy_score expects (y_true, y_pred) in that order.
print("SVM Accuracy Score -> ", accuracy_score(test_Y, predictions_svm)*100)

SVM Accuracy Score ->  83.33333333333334


## Stochastic Gradient Descent

In [36]:
# Token counts -> TF-IDF weighting -> linear model trained with SGD.
# tol=None disables early stopping, so training runs exactly max_iter epochs.
# Other loss functions to try: 'hinge', 'log_loss' (spelled 'log' in
# scikit-learn releases before 1.1).
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='modified_huber', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
sgd.fit(train_X, train_Y)
predictions_sgd = sgd.predict(test_X)

# accuracy_score expects (y_true, y_pred) in that order.
print("SGD Accuracy Score -> ", accuracy_score(test_Y, predictions_sgd)*100)

SGD Accuracy Score ->  77.77777777777779


## Logistic Regression

In [37]:
# Token counts -> TF-IDF weighting -> logistic regression.
# C is the inverse regularisation strength, so C=1e5 means the model is only
# very weakly regularised.
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(train_X, train_Y)
predictions_lr = logreg.predict(test_X)

# accuracy_score expects (y_true, y_pred) in that order.
print("Logistic Regression Accuracy Score -> ", accuracy_score(test_Y, predictions_lr)*100)

Logistic Regression Accuracy Score ->  88.88888888888889




In [40]:
def preprocess(text):
    """Turn a raw sentence into the stringified lemma list the models consume.

    The sentence is lower-cased, tokenized and POS-tagged; English stop words
    and tokens that are not purely alphabetic are discarded, and the rest are
    lemmatized using the WordNet POS looked up in the module-level ``tag_map``.

    Args:
        text: The raw sentence as a string.

    Returns:
        ``str()`` of the list of lemmas, e.g. ``"['lie']"``.
    """
    tokens = word_tokenize(text.lower())

    # Build the stop-word set once per call: stopwords.words() returns a fresh
    # list every time, and a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    final_words = [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
        if word not in stop_words and word.isalpha()
    ]
    return str(final_words)

def predict_class(sentence):
    """Classify a sentence with the fitted ``svm`` pipeline.

    Args:
        sentence: The raw sentence as a string; it is passed through
            ``preprocess`` before prediction.

    Returns:
        A ``(label, probability)`` tuple, where ``probability`` is the
        pipeline's estimated probability for the returned ``label``.
    """
    text = [preprocess(sentence)]
    label = svm.predict(text)[0]
    probabilities = svm.predict_proba(text)[0]
    # SVC's probability estimates can disagree with predict(), so report the
    # probability of the predicted label rather than the largest probability,
    # which may belong to a different class.  predict_proba columns follow the
    # order of classes_.
    label_position = list(svm.classes_).index(label)
    return label, probabilities[label_position]

predict_class('lying')

(0, 0.6968489685754431)