### Naive Bayes SVM

This model is implemented following the paper on sentiment classification:
https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf

In [1]:
#!/usr/bin/env python3

import re
import string
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Some global declarations
# Names of the target label columns read from the training data.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Captures any single punctuation character so the tokenizer can pad it with
# spaces. string.punctuation goes through re.escape because it contains
# `\`, `]`, `^` and `-`, which are special inside a character class: left
# unescaped, `\]` is parsed as an escaped bracket and the backslash itself
# is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

In [12]:
def load_and_process_data():
    '''
    Function to load train, test and sample-submission data from
    ./data along with handling of missing data.

    A 'none' column is added to the training frame, computed as
    1 minus the row-wise maximum over label_cols. Missing
    comment_text values in both train and test are replaced with
    the string "unknown".

    Returns a (train, test, sample_submission) tuple of DataFrames.
    '''
    train = pd.read_csv('./data/train.csv')
    test = pd.read_csv('./data/test.csv')
    sample_submission = pd.read_csv('./data/sample_submission.csv')

    train['none'] = 1 - train[label_cols].max(axis=1)
    # Assign the filled column back rather than calling
    # fillna(inplace=True) on train['comment_text']: the in-place form
    # operates on an intermediate Series and, under pandas copy-on-write,
    # leaves the DataFrame unchanged.
    train['comment_text'] = train['comment_text'].fillna("unknown")
    test['comment_text'] = test['comment_text'].fillna("unknown")

    return train, test, sample_submission

In [4]:
def tokenize(s):
    '''
    Tokenizer for TFIDF.

    Pads every substring captured by re_tok with a space on each side,
    then splits the result on whitespace and returns the list of tokens.
    '''
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()

In [5]:
def get_vectorized_features(train, test):
    '''
    Convert both test and train comments to term-document
    matrix using TFIDF technique.

    The vectorizer (unigrams and bigrams, custom tokenizer) is fitted on
    train['comment_text'] only; test['comment_text'] is transformed with
    that same fitted vectorizer.

    Returns a (train_term_doc, test_term_doc) tuple.
    '''
    # Boolean options are passed as real booleans: recent scikit-learn
    # validates parameter types and no longer accepts the integer 1 here.
    vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize, min_df=3,
                          max_df=0.9, strip_accents='unicode', use_idf=True,
                          smooth_idf=True, sublinear_tf=True)
    train_term_doc = vec.fit_transform(train['comment_text'])
    test_term_doc = vec.transform(test['comment_text'])

    return train_term_doc, test_term_doc

In [6]:
def pr(x, y_i, y):
    '''
    Probability function for naive-bayes.

    Sums the rows of x whose label in y equals y_i (column-wise), then
    applies add-one smoothing: (column sum + 1) / (matching row count + 1).
    '''
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    class_size = in_class.sum()
    return (feature_totals + 1) / (class_size + 1)

In [7]:
def get_model(x, y):
    '''
    SVM model is built on top of Naive Bayes features.
    Logistic Regression model is used on top of these features
    since it is almost identical to SVM.

    x is the training term-document matrix; it must provide .multiply
    (presumably the scipy sparse matrix produced by the vectorizer --
    confirm against the caller). y holds the labels for one target
    column (pandas Series or any array-like).

    Returns a (fitted_model, r) tuple, where r is the log-count ratio
    log(pr(x, 1, y) / pr(x, 0, y)). The same r must be used to scale
    features at prediction time.
    '''
    # np.asarray accepts a pandas Series as well as plain arrays/lists.
    y = np.asarray(y)
    r = np.log(pr(x, 1, y) / pr(x, 0, y))
    # The dual formulation is only implemented by the liblinear solver;
    # with scikit-learn's current default solver (lbfgs) dual=True raises
    # a ValueError, so the solver is pinned explicitly.
    model = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return model.fit(x_nb, y), r

In [17]:
def main():
    '''
    Train one model per label column and write the predicted
    probabilities for the test set to submission_NBSVM.csv.
    '''
    train, test, sample_submission = load_and_process_data()
    x_train, x_test = get_vectorized_features(train, test)

    # One row per test comment, one column per label.
    n_rows, n_labels = len(test), len(label_cols)
    preds = np.zeros((n_rows, n_labels))

    for col_idx, label in enumerate(label_cols):
        model, r = get_model(x_train, train[label])
        # Test features are scaled by the same ratio r used for training.
        class_proba = model.predict_proba(x_test.multiply(r))
        preds[:, col_idx] = class_proba[:, 1]

    id_frame = pd.DataFrame({'id': sample_submission['id']})
    pred_frame = pd.DataFrame(preds, columns = label_cols)
    submission = pd.concat([id_frame, pred_frame], axis=1)
    submission.to_csv('submission_NBSVM.csv', index=False)

In [18]:
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()