In [1]:
import warnings
warnings.filterwarnings('ignore')  # installed before the heavier imports so their warnings are hidden too
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Map every file name found under the Kaggle input directory to its full path.
# If the same file name appears in several directories, the one visited last wins.
data_paths = {
    filename: os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
}

- The best model so far is the Logistic Regression (Baseline) with a public LB score of 0.97519. The data used for this model is not preprocessed/stemmed; I directly used the tfidf vectorizer with `lowercase = True` and `stop_words = 'english'`.

- I expected the performance of this model (Logistic - Baseline) to improve by preprocessing the comment text data: removing punctuation, stopwords, URLs and HTML, and converting emoticons/emojis... but the results didn't improve and the models underperformed the above baseline by around 1-2%. This is probably because we are losing some information through excessive preprocessing. The results for these models are present in this [Notebook](https://www.kaggle.com/asrsaiteja/toxic-comments-preprocessing-baselines)

- Now, in this notebook, I am trying to push the Logistic model to > 0.98. I am not going to use any fancy preprocessing as it is not helping us; instead I am sticking to the tfidf preprocessing parameters `lowercase = True` and `stop_words = 'english'` for now.

- Further I am planning to try kfold cross validation to see if it can improve the LB score.

In [2]:
# Read the three competition files located through `data_paths`
train_df, test_df, sub_df = (
    pd.read_csv(data_paths[csv_name])
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print('Train data shape:', train_df.shape)
print('Columns in Train:', train_df.columns)

Train data shape: (159571, 8)
Columns in Train: Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')


In [3]:
# Columns that are not needed and can be dropped. 'is_clean' is not among the
# columns printed for train.csv above; listing it here is harmless.
drop_col = ['id', 'is_clean']
text_col = ['comment_text']  # text feature
# Target variables: every column of the training frame that is neither text nor dropped
non_label_cols = text_col + drop_col
label_col = [name for name in train_df.columns if name not in non_label_cols]

In [4]:
from sklearn.model_selection import train_test_split

# For SUBMISSION: the full labelled set is used later to fit the final models
X_train_val = train_df['comment_text']
y_train_val = train_df[label_col]

X_test = test_df['comment_text']

# Hold out 20% of the labelled comments for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val,
                                                  test_size=0.2, random_state=2019)
assert len(X_train) + len(X_val) == len(X_train_val), "split lost or duplicated rows"

print('Data points in train data after splitting:', len(X_train))
print('Data points in valiadtion data:', len(X_val))
print('Data points in test data:', len(X_test))

# train_df / test_df are deliberately kept alive: deleting them here made this cell
# and the column-selection cell fail with NameError whenever they were re-run.

Data points in train data after splitting: 127656
Data points in valiadtion data: 31915
Data points in test data: 153164


In [5]:
# wrapper for mean roc auc metric
from sklearn.metrics import roc_auc_score
def mean_roc_auc(y_true, y_pred):
    """Return the unweighted mean of the column-wise ROC-AUC scores.

    Parameters
    ----------
    y_true : 2-D array-like (NumPy array or DataFrame)
        True binary labels, one column per class.
    y_pred : 2-D array-like (NumPy array or DataFrame)
        Predicted scores, with columns in the same order as `y_true`.

    Returns
    -------
    float
        Mean of `roc_auc_score` computed separately for every column of `y_true`.
    """
    # Convert up front so DataFrame inputs support positional `[:, i]` slicing.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # The number of classes comes from the data instead of a hard-coded 6.
    num_class = y_true.shape[1]
    return np.mean([roc_auc_score(y_true[:, i], y_pred[:, i]) for i in range(num_class)])

### feature transformation (tfidf - text feature extraction)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from scipy.sparse import hstack
from nltk import pos_tag

# word features: unigram tf-idf, English stop words removed, vocabulary capped at 20,000 terms.
# smooth_idf / sublinear_tf are boolean switches, so they are passed as True rather than 1.
tfidf_words = TfidfVectorizer(ngram_range=(1, 1), strip_accents='unicode', analyzer='word',
                              smooth_idf=True, sublinear_tf=True, stop_words='english',
                              max_features=20000)
X_train_words = tfidf_words.fit_transform(X_train)
X_val_words = tfidf_words.transform(X_val)

# char features: character 2- to 6-grams, vocabulary capped at 50,000 n-grams.
# stop_words is omitted on purpose: it is only applied when analyzer == 'word'.
tfidf_char = TfidfVectorizer(ngram_range=(2, 6), strip_accents='unicode', analyzer='char',
                             smooth_idf=True, sublinear_tf=True, max_features=50000)
X_train_char = tfidf_char.fit_transform(X_train)
X_val_char = tfidf_char.transform(X_val)

# stack word and char blocks side by side; CSR supports the row slicing used downstream
train_features = hstack([X_train_words, X_train_char]).tocsr()
val_features = hstack([X_val_words, X_val_char]).tocsr()

print(train_features.shape, val_features.shape)
# the per-vectorizer matrices are no longer needed once stacked
del X_train_words, X_val_words, X_train_char, X_val_char

(127656, 70000) (31915, 70000)


#### Train + Val features & Test features (for making submissions)

In [7]:
from sklearn.base import clone

# Fit unfitted copies of the vectorizers on the full labelled set. Refitting
# `tfidf_words` / `tfidf_char` themselves would silently change their vocabulary,
# so they would no longer correspond to `train_features` / `val_features`.
tfidf_words_full = clone(tfidf_words)
tfidf_char_full = clone(tfidf_char)

# transform into features
X_train_val_words = tfidf_words_full.fit_transform(X_train_val)
X_test_words = tfidf_words_full.transform(X_test)

X_train_val_char = tfidf_char_full.fit_transform(X_train_val)
X_test_char = tfidf_char_full.transform(X_test)

# stack word and char blocks side by side, same layout as the validation features
train_val_features = hstack([X_train_val_words, X_train_val_char]).tocsr()
test_features = hstack([X_test_words, X_test_char]).tocsr()

print(train_val_features.shape, test_features.shape)
# the per-vectorizer matrices are no longer needed once stacked
del X_train_val_words, X_test_words, X_train_val_char, X_test_char

(159571, 70000) (153164, 70000)


### Logistic Regression

In [8]:
# One binary logistic model per label, default C = 1.0.
# 'sag' shuffles the data while fitting, so the seed is fixed to make the scores repeatable.
model = OneVsRestClassifier(LogisticRegression(solver='sag', random_state=2019))
model.fit(train_features, y_train)
print('model: Logistic Regression')
# roc_auc_score on a multilabel target averages the per-label AUCs (macro average by default)
print('mean ROC-AUC on train set:', roc_auc_score(y_train, model.predict_proba(train_features)))
y_pred_log = model.predict_proba(val_features)
print('mean ROC-AUC on validation set:', roc_auc_score(y_val, y_pred_log))

model: Logistic Regression
mean ROC-AUC on train set: 0.9955191025106922
mean ROC-AUC on validation set: 0.984386798576792


- The above logistic model with tfidf words + tfidf char features scored 0.98003 on the private leaderboard and 0.97897 on the public leaderboard.
- The models built on minimally preprocessed data (only lowercasing & stopword removal) have performed better than the models built on preprocessed comment data.
- I observed wide variations of ROC-AUC scores with change in tfidf parameters - especially min_df, max_df and max_features. I tuned these a bit manually which helped to reduce the model complexity using `max_features` parameter. 

### Tuning Logistic Regression

In [9]:
def train_validate_model(model):
    """Fit `model` on the training split and report ROC-AUC on both splits.

    Reads the notebook-level `train_features`, `y_train`, `val_features` and
    `y_val`, and prints one score line for the train set and one for the
    validation set.

    Returns
    -------
    tuple
        (train ROC-AUC, validation ROC-AUC).
    """
    model.fit(train_features, y_train)

    split_scores = []
    for message, features, targets in (
        ('mean ROC-AUC on train set:', train_features, y_train),
        ('mean ROC-AUC on validation set:', val_features, y_val),
    ):
        split_roc = roc_auc_score(targets, model.predict_proba(features))
        print(message, split_roc)
        split_scores.append(split_roc)

    return split_scores[0], split_scores[1]

In [10]:
print('Logistic Regression - Fine Tuning')
# (C, validation ROC-AUC) pairs, one per candidate regularisation strength
scores = []
for c_value in [0.001, 0.01, 0.1, 1, 2, 3, 5, 7, 10, 25, 60, 100, 1000]:
    print('\nC:=', c_value)
    # Same seed for every candidate, so score differences come from C and not
    # from the random shuffling done by the 'sag' solver.
    model = OneVsRestClassifier(LogisticRegression(C = c_value, solver='sag', random_state=2019))
    _, val_roc = train_validate_model(model)
    scores.append((c_value, val_roc))

Logistic Regression - Fine Tuning

C:= 0.001
mean ROC-AUC on train set: 0.963532670494819
mean ROC-AUC on validation set: 0.9592854059846184

C:= 0.01
mean ROC-AUC on train set: 0.9719240033748707
mean ROC-AUC on validation set: 0.9668758008463559

C:= 0.1
mean ROC-AUC on train set: 0.9853337365045296
mean ROC-AUC on validation set: 0.9786513010413289

C:= 1
mean ROC-AUC on train set: 0.9955191082732129
mean ROC-AUC on validation set: 0.9843868196166347

C:= 2
mean ROC-AUC on train set: 0.997412244665699
mean ROC-AUC on validation set: 0.9841449112138427

C:= 3
mean ROC-AUC on train set: 0.99824458668684
mean ROC-AUC on validation set: 0.9835529293358425

C:= 5
mean ROC-AUC on train set: 0.9990087933155457
mean ROC-AUC on validation set: 0.9823872595317367

C:= 7
mean ROC-AUC on train set: 0.9993576262765776
mean ROC-AUC on validation set: 0.9813902833154792

C:= 10
mean ROC-AUC on train set: 0.9996161280259827
mean ROC-AUC on validation set: 0.9801972388424326

C:= 25
mean ROC-AUC on 

In [11]:
# Tabulate the (C, validation ROC-AUC) pairs gathered during tuning
scores_df = pd.DataFrame(scores, columns = ['c_value', 'val_roc'])
# Row label of the highest validation score, then the C value stored on that row
best_row = scores_df['val_roc'].idxmax()
best_c_value = scores_df.at[best_row, 'c_value']
print('Best C value from finetuning:', best_c_value)

Best C value from finetuning: 1.0


In [12]:
from IPython.display import FileLink

# define & fit the best logistic model on all labelled data (train + validation);
# the seed makes the stochastic 'sag' fit, and therefore the submission, repeatable
model = OneVsRestClassifier(LogisticRegression(C = best_c_value, solver='sag', random_state=2019))
model.fit(train_val_features, y_train_val)
print('model: Tuned Logistic Regression')

# make predictions on Test data
y_test_pred = model.predict_proba(test_features)

## making a submission file: every column after the first receives the predicted probabilities
sub_df.iloc[:,1:] = y_test_pred
sub_df.to_csv('logistic_submission.csv', index = None)
# last expression of the cell, so the download link is displayed
FileLink('logistic_submission.csv')

model: Tuned Logistic Regression


- The best C value from fine-tuning and the default C value are the same (C = 1.0), so the tuned Logistic model will also have the same test scores: 0.98003 on the private leaderboard and 0.97897 on the public leaderboard.

### Using a Regressor: this method is known to improve the LB score slightly (from Kaggle discussions)

- As the evaluation is based on the class 'probability scores'... let's try something different: using a Regressor (like Lasso or Ridge, instead of the Logistic classifier) to predict the score. Later, if needed (if values are out of bounds), we can just apply a simple `sigmoid` function over the output to squash the values between 0 and 1.

- In my experiments, Ridge worked better... It is linear (least-squares) regression with L2 regularization, not Logistic Regression (whereas Lasso is linear regression with L1 regularization).

In [13]:
from sklearn.linear_model import Ridge

# Fit one Ridge regressor per label on the training split and collect its
# validation predictions as one column of the score matrix.
ridge_val_columns = []
for class_name in label_col:
    train_target = y_train[class_name]
    classifier = Ridge(alpha=20, copy_X=True, solver='auto', tol=0.0025)
    classifier.fit(train_features, train_target)
    ridge_val_columns.append(classifier.predict(val_features))
y_val_pred = np.column_stack(ridge_val_columns)
print('mean ROC-AUC on validation set:', roc_auc_score(y_val, y_val_pred))

mean ROC-AUC on validation set: 0.9846564408459272


In [14]:
from IPython.display import FileLink

# test submission
print('model: Ridge Regression')
y_test_pred = np.zeros((len(X_test), y_val.shape[1]))
for i, class_name in enumerate(label_col):
    train_val_target = y_train_val[class_name]
    # `normalize=False` is no longer passed: it was the default and the parameter
    # has since been removed from Ridge, so passing it raises TypeError.
    # NOTE(review): max_iter=100 and random_state=0 are set here but not in the
    # validation run above, so the submitted model is not configured exactly
    # like the validated one - confirm this is intended.
    clf = Ridge(alpha=20, copy_X=True, fit_intercept=True, solver='auto',
                max_iter=100, random_state=0, tol=0.0025)
    clf.fit(train_val_features, train_val_target)
    y_test_pred[:,i] = clf.predict(test_features)

## making a submission file: every column after the first receives the raw Ridge scores
sub_df.iloc[:,1:] = y_test_pred
sub_df.to_csv('ridge_submission.csv', index = None)
# last expression of the cell, so the download link is displayed
FileLink('ridge_submission.csv')

model: Ridge Regression


- The above submission gave me a public LB score of 0.9809 on test data.

### NB Logistic:
- There is this interesting paper which combines Naive Bayes & SVM - (NBSVM model)
- Baselines and Bigrams: Simple, Good Sentiment and Topic Classification.
- The paper used the log probability ratio of features multiplied with the original features as input to the liblinear version of SVM (=~ sklearn LinearSVM) and obtained better results on text classification tasks.
- Let us implement this paper by replacing SVM with Logistic (as Logistic is performing best in this case, I am swapping them).

reference:
- [Baselines and Bigrams: Simple, Good Sentiment and Topic Classification.](https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf)
- [Jeremy Howard's Strong linear baseline notebook](https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline)

In [15]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse

class NbLogisticClassifier(ClassifierMixin, BaseEstimator):
    """Logistic regression on features scaled by a Naive-Bayes log-count ratio.

    For a binary target, `fit` computes for every feature the log of the ratio
    between its smoothed average value in the positive rows and in the negative
    rows, multiplies the feature matrix column-wise by that ratio and fits a
    `LogisticRegression` on the result. Prediction applies the same scaling.

    Parameters
    ----------
    C : float, default=1.0
        Inverse regularisation strength forwarded to `LogisticRegression`.
    dual : bool, default=False
        Forwarded to `LogisticRegression`.
    n_jobs : int, default=1
        Forwarded to `LogisticRegression`.
    solver : str, default='liblinear'
        Forwarded to `LogisticRegression`. Set explicitly because the notebook
        uses `dual=True`, which the 'lbfgs' solver (the library default in
        newer releases) rejects.

    Notes
    -----
    The mixin is listed before `BaseEstimator`, as scikit-learn recommends for
    a correct method resolution order.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1, solver='liblinear'):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs
        self.solver = solver

    def fit(self, x, y):
        """Compute the log-count ratio from (x, y) and fit the logistic model.

        `x` is expected to be a sparse feature matrix; `y` a 1-D binary target
        (Series, array or list) with labels 0 and 1. Returns `self`.
        """
        # np.asarray accepts Series, arrays and lists alike (`y.values` only
        # worked for pandas objects). accept_sparse='csr' converts other sparse
        # formats to CSR, which the boolean row masking below relies on.
        x, y = check_X_y(x, np.asarray(y), accept_sparse='csr')

        def pr(x, y_i, y):
            # add-one smoothed mean feature value over the rows labelled y_i
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        # log ratio of positive vs negative feature averages, kept as a
        # 1 x n_features sparse row so it broadcasts in `multiply`
        self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
        x_nb = x.multiply(self._r)

        # fit the logistic model on the NB-scaled features
        self._clf = LogisticRegression(C=self.C, dual=self.dual, solver=self.solver,
                                       n_jobs=self.n_jobs).fit(x_nb, y)
        return self

    def predict(self, x):
        """Return class labels for the sparse matrix `x`, scaled like the training data."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Return class probabilities for the sparse matrix `x`, scaled like the training data."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

In [16]:
# A single NB-logistic estimator is refitted for each label in turn
nblog_clf = NbLogisticClassifier(C=1.0, dual=True)

y_val_pred = np.zeros(y_val.shape)
for i, class_name in enumerate(label_col):
    train_target = y_train[class_name]
    nblog_clf.fit(train_features, train_target)
    # keep column 1 of predict_proba as the score for the current label
    val_proba = nblog_clf.predict_proba(val_features)
    y_val_pred[:, i] = val_proba[:, 1]

val_auc = roc_auc_score(y_val, y_val_pred)
print('mean ROC-AUC on validation set:', val_auc)

mean ROC-AUC on validation set: 0.9845241792468226


In [17]:
# test submission
print('model: NB-Logisitc')

n_test_rows, n_labels = len(X_test), y_val.shape[1]
y_test_pred = np.zeros((n_test_rows, n_labels))
for i, class_name in enumerate(label_col):
    # target for the current label, taken from the full labelled set
    train_val_target = y_train_val[class_name]
    # a new estimator per label, fitted on train + validation features
    nblog_clf = NbLogisticClassifier(C=1.0, dual=True)
    nblog_clf.fit(train_val_features, train_val_target)
    # column 1 of predict_proba becomes this label's test prediction
    test_proba = nblog_clf.predict_proba(test_features)
    y_test_pred[:, i] = test_proba[:, 1]

## making a submission file
sub_df.iloc[:,1:] = y_test_pred
sub_df.head()
from IPython.display import FileLink
sub_df.to_csv('nblog_submission.csv', index = None)
FileLink('nblog_submission.csv')

model: NB-Logisitc


In [18]:
# # number of folds
# num_folds = 2 

# predict = np.zeros((test_df.shape[0],6))

# # Uncomment for out-of-fold predictions
# #scores = []
# #oof_predict = np.zeros((train.shape[0],6))

# kf = KFold(n_splits = num_folds, shuffle=True, random_state=2019)

# for train_index, test_index in kf.split(X_train_val):
    
#     kfold_y_train, kfold_y_test = y_train[train_index], y_train[test_index]
#     kfold_X_train = x_train[train_index]
#     kfold_X_features = features[train_index]
#     kfold_X_valid = x_train[test_index]
#     kfold_X_valid_features = features[test_index]
    
#     gc.collect()
#     K.clear_session()
    
#     model = get_model(features)
    
#     ra_val = RocAucEvaluation(validation_data=([kfold_X_valid,kfold_X_valid_features], kfold_y_test), interval = 1)
    
#     model.fit([kfold_X_train,kfold_X_features], kfold_y_train, batch_size=batch_size, epochs=epochs, verbose=1,
#              callbacks = [ra_val])
#     gc.collect()
    
#     #model.load_weights(bst_model_path)
#     model.load_weights("best_weights.h5")
    
#     predict += model.predict([x_test,test_features], batch_size=batch_size,verbose=1) / num_folds
    
#     #gc.collect()
#     # uncomment for out of fold predictions
#     #oof_predict[test_index] = model.predict([kfold_X_valid, kfold_X_valid_features],batch_size=batch_size, verbose=1)
#     #cv_score = roc_auc_score(kfold_y_test, oof_predict[test_index])
    
#     #scores.append(cv_score)
#     #print('score: ',cv_score)

# print("Done")
# #print('Total CV score is {}'.format(np.mean(scores)))    


# sample_submission = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv")
# class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# sample_submission[class_names] = predict
# sample_submission.to_csv('model_9872_baseline_submission.csv',index=False)

# # uncomment for out of fold predictions
# #oof = pd.DataFrame.from_dict({'id': train['id']})
# #for c in class_names:
# #    oof[c] = np.zeros(len(train))
# #    
# #oof[class_names] = oof_predict
# #for c in class_names:
# #    oof['prediction_' +c] = oof[c]
# #oof.to_csv('oof-model_9872_baseline_submission.csv', index=False)