# Homework 2 - TF-IDF Classifier

Ваша цель — обучить классификатор, который будет находить "токсичные" комментарии, и опубликовать решение на Kaggle [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge).

В процессе обучения нужно ответить на ***[вопросы](https://docs.google.com/forms/d/e/1FAIpQLSd9mQx8EFpSH6FhCy1M_FmISzy3lhgyyqV3TN0pmtop7slmTA/viewform?usp=sf_link)***

Данные можно скачать тут - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data



In [1]:
import numpy as np
import pandas as pd

from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union

In [2]:
# Label columns of the dataset; each one is treated as its own binary target.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Load both splits and replace every missing cell with the 'Unknown' placeholder.
train, test = (
    pd.read_csv(csv_path).fillna('Unknown')
    for csv_path in ('./input/train.csv', './input/test.csv')
)

Стандартными подходами для анализа текста являются [Bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model) и его модификация [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).

Они реализованы в `sklearn` в виде [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) и [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).

Более подробно про них можно посмотреть [тут](https://github.com/udsclub/workshop/blob/master/notebooks/UDS-workshop-feature-extraction-and-engineering.ipynb).

In [3]:
# Raw comment strings of each split, plus their concatenation (train rows first).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [4]:
import re, string
from nltk.corpus import stopwords
from keras.preprocessing.text import text_to_word_sequence

# Matches one ASCII punctuation character or one of the listed extra symbols;
# the capture group lets a substitution re-insert the match via \1.
re_tok = re.compile('([%s“”¨«»®´·º½¾¿¡§£₤‘’])' % string.punctuation)

# (pattern, replacement) pairs applied in order by clean_text. Order matters:
# a specific form must come before the generic suffix it contains
# ("'scuse" before "'s", "can't" before "n't").
_CONTRACTIONS = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Normalise a raw comment for tokenisation.

    The text is lower-cased, a fixed set of English contractions is expanded,
    every non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space and leading/trailing spaces are removed.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The normalised text; an empty string if nothing but separators remain.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)
    # Raw strings: '\W' and '\s' are regex classes, not Python string escapes.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

# def cleanupDoc(s):
#     s = clean_text(s)
#     stopset = set(stopwords.words('english'))
#     stopset.add('wikipedia')
#     tokens =sequence=text_to_word_sequence(s, 
#                                            filters="\"!'#$%&()*+,-˚˙./:;‘“<=·>?@[]^_`{|}~\t\n",
#                                            lower=True,
#                                            split=" ")
#     cleanup = " ".join(filter(lambda word: word not in stopset, tokens))
#     return cleanup

def tokenize(s):
    """Split *s* into a list of tokens.

    The string is first normalised with ``clean_text``; every character that
    ``re_tok`` matches is then surrounded by spaces so it becomes a token of
    its own, and the result is split on whitespace.
    """
    padded = re_tok.sub(r' \1 ', clean_text(s))
    return padded.split()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
# Try different vectorizers, n-gram sizes, stop words, and cut-offs for rare
# (min_df) and overly frequent (max_df) terms.
word_vectorizer = TfidfVectorizer(analyzer='word',
                                  ngram_range=(1, 2),
                                  tokenizer=tokenize,
                                  #stop_words='english',
                                  max_df=0.9,
                                  min_df=3,
                                  strip_accents='unicode',
                                  use_idf=True,
                                  smooth_idf=True,
                                  sublinear_tf=True,
                                  max_features=300000)

# No `tokenizer` argument here: scikit-learn only consults it when
# analyzer == 'word', so passing it to a char-level vectorizer has no effect
# on the extracted features and only suggests preprocessing that never runs.
char_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                  smooth_idf=True,
                                  strip_accents='unicode',
                                  analyzer='char',
                                  max_df=0.9,
                                  min_df=3,
                                  ngram_range=(1, 4),
                                  max_features=300000)

#vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)

# Vocabulary and IDF weights are fitted on train and test text together;
# each split is then transformed separately.
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [6]:
# Concatenate char-level and word-level TF-IDF columns (char block first).
# CSR is requested explicitly: hstack returns COO by default, and COO matrices
# cannot be row-indexed when selecting cross-validation folds.
# NOTE: the *_word_features names are rebound to the combined matrices, so
# re-running this cell would stack the char features a second time.
train_word_features = sparse.hstack([train_char_features, train_word_features], format='csr')
test_word_features = sparse.hstack([test_char_features, test_word_features], format='csr')

In [7]:
import pickle

# Cache both feature matrices (train first, test second) in one pickle file
# so the vectorisation step does not have to be repeated.
with open('data.pkl', 'wb') as cache_file:
    pickle.dump([train_word_features, test_word_features], cache_file)

In [None]:
import pickle

# Restore the cached feature matrices in the order they were written
# (train first, test second).
# NOTE: unpickling executes arbitrary code; only load a 'data.pkl' you created.
with open('data.pkl', 'rb') as cache_file:
    train_word_features, test_word_features = pickle.load(cache_file)

Опубликуйте лучшее решение на [Kaggle Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/submit)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
# Mixins are listed before BaseEstimator, as the scikit-learn developer guide
# recommends for a correct method resolution order.
class NbSvmClassifier(ClassifierMixin, BaseEstimator):
    """Binary classifier: logistic regression on Naive-Bayes-scaled features.

    ``fit`` computes, for every feature column, the log of the ratio between
    its add-one smoothed average over the rows with ``y == 1`` and over the
    rows with ``y == 0``. Inputs are multiplied column-wise by that vector
    before being handed to a class-balanced ``LogisticRegression``.

    Parameters
    ----------
    C, dual, solver, max_iter, tol, n_jobs
        Forwarded unchanged to ``sklearn.linear_model.LogisticRegression``.

    Attributes
    ----------
    classes_ : the class labels seen by the underlying logistic regression.
    """

    def __init__(self,
                 C=3.15,
                 dual=False,
                 solver='newton-cg',
                 max_iter=1000,
                 tol=0.00001,
                 n_jobs=1):

        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs
        self.max_iter = max_iter
        self.solver = solver
        self.tol = tol

    def _scale(self, x):
        """Return *x* with every column multiplied by its fitted log-ratio."""
        # csr_matrix() accepts dense arrays and any sparse format, so dense
        # input (which has no .multiply method) is handled as well.
        return sparse.csr_matrix(x).multiply(self._r)

    def predict(self, x):
        """Return the predicted class label for every row of *x*."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(self._scale(x))

    def predict_proba(self, x):
        """Return class probabilities for every row of *x*, one column per class."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(self._scale(x))

    def fit(self, x, y):
        """Fit the log-ratio vector and the logistic regression.

        Parameters
        ----------
        x : sparse matrix or array, one row per sample.
        y : array of 0/1 labels, one per row of *x*.

        Returns
        -------
        self
        """
        # Check that X and y have correct shape; sparse input is converted to
        # CSR because the boolean row selection below is not supported by COO.
        x, y = check_X_y(x, y, accept_sparse='csr')
        x = sparse.csr_matrix(x)

        def pr(y_i):
            # Add-one smoothed per-feature average over rows labelled y_i.
            mask = y == y_i
            return (x[mask].sum(0) + 1) / (mask.sum() + 1)

        self._r = sparse.csr_matrix(np.log(pr(1) / pr(0)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C,
                                       dual=self.dual,
                                       class_weight='balanced',
                                       solver=self.solver,
                                       max_iter=self.max_iter,
                                       tol=self.tol,
                                       n_jobs=self.n_jobs).fit(x_nb, y)
        # Exposed because scikit-learn scorers (e.g. 'roc_auc') look up
        # `classes_` on the estimator to pick the positive-class column.
        self.classes_ = self._clf.classes_
        return self

In [9]:
# Output tables keyed by comment id: one for the test split and one for the
# training split. Per-class probability columns are added to them later.
submission, train_submission = (
    pd.DataFrame.from_dict({'id': frame['id']}) for frame in (test, train)
)

# CV

In [10]:
from sklearn.model_selection import KFold, GridSearchCV
from tqdm import tqdm

In [11]:
# 5-fold shuffled splitter; the fixed seed keeps the folds identical between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=0xCAFFE)

In [12]:
# Number of parallel workers used by each grid search.
n_jobs = 30

# Candidate regularisation strengths; the same grid is used for every label.
params = {
    'C': [0.25, 0.27, 0.30, 0.32, 0.35, 0.36]
}

# class_names[1:] leaves out the first label.
# NOTE(review): presumably 'toxic' was tuned separately — confirm.
for class_name in class_names[1:]:
    print(class_name)

    gs = GridSearchCV(
        estimator=NbSvmClassifier(),
        param_grid=params,
        cv=kf,
        # Score assigned to a fit that raises. It must be the worst value, not
        # the best: with 1 a crashing candidate would look like a perfect
        # ROC-AUC and could be selected as the winner.
        error_score=0,
        scoring='roc_auc',
        n_jobs=n_jobs,
        verbose=1,
    )

    # Run the cross-validated search on the full training matrix.
    gs.fit(
        X=train_word_features,
        y=np.array(train[class_name]),
    )

    best_score = gs.best_score_
    best_estimator = gs.best_estimator_
    print('ROC-AUC best: {:.4f}'.format(best_score))
    print(best_estimator)

severe_toxic
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=30)]: Done   2 out of  30 | elapsed:  7.8min remaining: 108.9min
[Parallel(n_jobs=30)]: Done  30 out of  30 | elapsed: 12.0min finished


ROC-AUC best: 0.9895
NbSvmClassifier(C=0.25, dual=False, max_iter=1000, n_jobs=1,
        solver='newton-cg', tol=1e-05)
obscene
Fitting 5 folds for each of 6 candidates, totalling 30 fits


Process ForkPoolWorker-57:
Process ForkPoolWorker-44:
Process ForkPoolWorker-55:
Process ForkPoolWorker-53:
Process ForkPoolWorker-47:
Process ForkPoolWorker-59:
Process ForkPoolWorker-46:
Process ForkPoolWorker-51:
Process ForkPoolWorker-52:
Process ForkPoolWorker-60:
Process ForkPoolWorker-54:
Process ForkPoolWorker-58:
Process ForkPoolWorker-43:
Process ForkPoolWorker-41:
Process ForkPoolWorker-50:
Process ForkPoolWorker-45:
Process ForkPoolWorker-48:
Process ForkPoolWorker-56:
Process ForkPoolWorker-42:
Process ForkPoolWorker-49:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessi

  File "/home/azarichkovyi/Projects/Mask_RCNN/env/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
  File "/home/azarichkovyi/Projects/Mask_RCNN/env/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 362, in get
    return recv()
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/azarichkovyi/Projects/Mask_RCNN/env/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
  File "/home/azarichkovyi/Projects/Mask_RCNN/env/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
KeyboardInterrupt
KeyboardInterrupt
  File "/home/azarichkovyi/Projects/Mask_RCNN/env/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
  File "/home/azarichkovyi/Projects/Mask_RCNN/env/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
K

KeyboardInterrupt: 

In [13]:
# Number of CV folds; also used as the process-pool size and as the number of
# fold models whose test predictions are averaged.
predictors = 5

In [14]:
def training(train_indices, val_indices, class_name, params):
    """Fit an NbSvmClassifier on one CV fold and evaluate it.

    Parameters
    ----------
    train_indices, val_indices : row indices of the fitting / validation part.
    class_name : name of the label column in ``train`` used as target.
    params : dict of keyword arguments for ``NbSvmClassifier``.

    Returns
    -------
    tuple
        ``(train ROC-AUC, validation ROC-AUC, validation probabilities,
        test-set probabilities, val_indices)``; probabilities are those of
        the second class column.
    """
    features = train_word_features.tocsr()
    labels = np.array(train[class_name])

    fit_x, fit_y = features[train_indices], labels[train_indices]
    held_x, held_y = features[val_indices], labels[val_indices]

    model = NbSvmClassifier(**params)
    model.fit(fit_x, fit_y)

    def second_column(matrix):
        # Probability of the second class for every row of *matrix*.
        return model.predict_proba(matrix)[:, 1]

    fit_proba = second_column(fit_x)
    val_proba = second_column(held_x)
    sub_proba = second_column(test_word_features)

    return (
        roc_auc_score(fit_y, fit_proba),
        roc_auc_score(held_y, val_proba),
        val_proba,
        sub_proba,
        val_indices,
    )

In [15]:
from tqdm import tqdm
import concurrent.futures


# Regularisation strength per label, indexed in the same order as class_names.
cv_params = [
    {'C': 0.7},
    {'C': 0.25},
    {'C': 0.27},
    {'C': 0.25},
    {'C': 0.25},
    {'C': 0.25},
]

scores = []
for i, class_name in enumerate(class_names):
    print('Class: %s' % class_name)

    # Accumulators: averaged test predictions and out-of-fold train predictions.
    sub_probas = np.zeros(len(test))
    train_probas = np.zeros(len(train))
    train_scores, val_scores = [], []

    kf = KFold(n_splits=predictors, shuffle=True, random_state=0xCAFFE)

    # One worker process per fold; results are consumed as they finish.
    with concurrent.futures.ProcessPoolExecutor(max_workers=predictors) as executor:
        futures = [
            executor.submit(training, fold_train, fold_val, class_name, cv_params[i])
            for fold_train, fold_val in kf.split(train)
        ]

        for done in concurrent.futures.as_completed(futures):
            train_score, val_score, val_proba, sub_proba, val_indices = done.result()
            train_scores.append(train_score)
            val_scores.append(val_score)

            # Each training row belongs to exactly one validation fold.
            train_probas[val_indices] += val_proba
            sub_probas += sub_proba / predictors

    mean_val_score = np.mean(val_scores)
    scores.append(mean_val_score)
    print('\tTrain ROC-AUC: %s' % np.mean(train_scores))
    print('\tVal ROC-AUC: %s' % mean_val_score)

    submission[class_name] = sub_probas
    train_submission[class_name] = train_probas

print('Total: %s' % np.mean(scores))

Class: toxic
	Train ROC-AUC: 0.9976830975650696
	Val ROC-AUC: 0.982910809522215
Class: severe_toxic
	Train ROC-AUC: 0.9989458796377931
	Val ROC-AUC: 0.9895488002729407
Class: obscene
	Train ROC-AUC: 0.9981434954606817
	Val ROC-AUC: 0.9939493040798298
Class: threat
	Train ROC-AUC: 0.9999377061556333
	Val ROC-AUC: 0.9903798946168016
Class: insult
	Train ROC-AUC: 0.9960976821549643
	Val ROC-AUC: 0.9862494420587996
Class: identity_hate
	Train ROC-AUC: 0.999603200406811
	Val ROC-AUC: 0.9843955818430883
Total: 0.9879056387322791


In [18]:
# Show the first rows of the test-set predictions as a quick sanity check.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1.0,0.726654,1.0,0.049883,0.99956,0.673879
1,0000247867823ef7,0.021689,0.005445,0.015763,0.00135,0.034767,0.005518
2,00013b17ad220c46,0.035929,0.005011,0.022817,0.001455,0.025681,0.00512
3,00017563c3f7919a,0.00605,0.003528,0.011654,0.003329,0.013113,0.003864
4,00017695ad8997eb,0.051394,0.004285,0.020492,0.001537,0.026198,0.003633


In [19]:
# Write the test-set predictions and the training-set out-of-fold predictions
# to CSV, leaving out the DataFrame index.
submission.to_csv(
    path_or_buf='submission_nb_logistic_regression_101.csv',
    index=False,
)
train_submission.to_csv(
    path_or_buf='train_nb_logistic_regression_101.csv',
    index=False,
)

## LightGBM

In [20]:
import lightgbm as lgb

In [21]:
# Keyword arguments shared by every LightGBM model.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is non-zero, and none is set here — confirm whether
# row subsampling was intended.
lgm_params = dict(
    learning_rate=0.2,
    application='binary',
    num_leaves=31,
    verbosity=-1,
    metric='auc',
    data_random_seed=2,
    bagging_fraction=0.8,
    feature_fraction=0.6,
    nthread=4,
    lambda_l1=1,
    lambda_l2=1,
)

# Number of boosting rounds per label.
rounds_lookup = dict(
    toxic=140,
    severe_toxic=50,
    obscene=80,
    threat=80,
    insult=70,
    identity_hate=80,
)

In [22]:
def training(train_indices, val_indices, class_name, params):
    """Fit a LightGBM classifier on one CV fold and evaluate it.

    Parameters
    ----------
    train_indices, val_indices : row indices of the fitting / validation part.
    class_name : name of the label column in ``train`` used as target.
    params : value passed as ``n_estimators``; the remaining settings come
        from the module-level ``lgm_params``.

    Returns
    -------
    tuple
        ``(train ROC-AUC, validation ROC-AUC, validation probabilities,
        test-set probabilities, val_indices)``; probabilities are those of
        the second class column.
    """
    all_rows = train_word_features.tocsr()
    all_labels = np.array(train[class_name])

    X_train, X_val = all_rows[train_indices], all_rows[val_indices]
    y_train, y_val = all_labels[train_indices], all_labels[val_indices]

    classifier = lgb.LGBMClassifier(n_estimators=params, **lgm_params)
    classifier.fit(X_train, y_train)

    # Second-class probability for the fit rows, the held-out rows and the test set.
    train_proba, val_proba, sub_proba = (
        classifier.predict_proba(matrix)[:, 1]
        for matrix in (X_train, X_val, test_word_features)
    )

    train_score = roc_auc_score(y_train, train_proba)
    val_score = roc_auc_score(y_val, val_proba)
    return train_score, val_score, val_proba, sub_proba, val_indices

In [23]:
# Number of CV folds; also used as the process-pool size and as the number of
# fold models whose test predictions are averaged.
predictors = 5

In [24]:
from tqdm import tqdm
import concurrent.futures

# Cross-validated training of one LightGBM model per label and fold.
# NOTE: the per-class columns of `submission` and `train_submission` are
# overwritten with the predictions produced here.
scores = []
for class_name in class_names:
    print('Class: %s' % class_name)

    # Accumulators: averaged test predictions and out-of-fold train predictions.
    sub_probas = np.zeros(shape=(len(test), ))
    train_probas = np.zeros(shape=(len(train), ))

    kf = KFold(n_splits=predictors, shuffle=True, random_state=0xCAFFE)

    train_scores, val_scores = [], []
    with concurrent.futures.ProcessPoolExecutor(max_workers=predictors) as executor:

        # Submit every fold up front; a list makes it explicit that all tasks
        # are queued before the results are collected.
        futures = [executor.submit(training,
                                   train_indices,
                                   val_indices,
                                   class_name,
                                   rounds_lookup[class_name])
                   for train_indices, val_indices in kf.split(train)]

        for future in concurrent.futures.as_completed(futures):
            train_score, val_score, val_proba, sub_proba, val_indices = future.result()
            train_scores.append(train_score)
            val_scores.append(val_score)

            # Each training row belongs to exactly one validation fold.
            train_probas[val_indices] += val_proba
            sub_probas += sub_proba / predictors

    scores.append(np.mean(val_scores))
    print('\tTrain ROC-AUC: %s' % np.mean(train_scores))
    print('\tVal ROC-AUC: %s' % np.mean(val_scores))

    submission[class_name] = sub_probas
    train_submission[class_name] = train_probas

print('Total: %s' % np.mean(scores))

Class: toxic




	Train ROC-AUC: 0.9965535289890302
	Val ROC-AUC: 0.975885423129507
Class: severe_toxic




	Train ROC-AUC: 0.9985136721289836
	Val ROC-AUC: 0.9855821976733488
Class: obscene




	Train ROC-AUC: 0.9987937346296093
	Val ROC-AUC: 0.9914637584268027
Class: threat




	Train ROC-AUC: 0.9999996816232569
	Val ROC-AUC: 0.982556911129356
Class: insult




	Train ROC-AUC: 0.9947464446080634
	Val ROC-AUC: 0.9813323186388152
Class: identity_hate




	Train ROC-AUC: 0.9998600707550377
	Val ROC-AUC: 0.9777954067633405
Total: 0.9824360026268617


In [25]:
# Write the test-set predictions and the training-set out-of-fold predictions
# to CSV, leaving out the DataFrame index.
submission.to_csv(path_or_buf='submission_lgm_101.csv', index=False)
train_submission.to_csv(path_or_buf='train_lgm_101.csv', index=False)