# Homework 2 - TF-IDF Classifier

Ваша цель — обучить классификатор, который будет находить "токсичные" комментарии, и опубликовать решение на Kaggle: [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge).

В процессе обучения нужно ответить на ***[вопросы](https://docs.google.com/forms/d/e/1FAIpQLSd9mQx8EFpSH6FhCy1M_FmISzy3lhgyyqV3TN0pmtop7slmTA/viewform?usp=sf_link)***

Данные можно скачать тут - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data



In [None]:
import numpy as np
import pandas as pd

from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union

In [None]:
# Label columns of train.csv; each one is later read as a separate binary target.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Load both splits (train first, then test) and replace every missing value
# with the placeholder string 'Unknown'.
train, test = (
    pd.read_csv(csv_path).fillna('Unknown')
    for csv_path in ('./input/train.csv', './input/test.csv')
)

Стандартными подходами для анализа текста являются [Bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model) и его модификация [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).

Они реализованы в `sklearn` в виде [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) и [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).

Более подробно про них можно посмотреть [тут](https://github.com/udsclub/workshop/blob/master/notebooks/UDS-workshop-feature-extraction-and-engineering.ipynb).

In [None]:
# Raw comment strings of each split, plus their concatenation (train rows
# first, then test rows) for fitting the vectorizers.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [None]:
import re, string
# Matches one ASCII punctuation mark or one of the listed typographic symbols.
# The capture group lets callers re-insert the matched character via r'\1'.
# string.punctuation is passed through re.escape so that '\', ']', '^' and '-'
# stay literal inside the character class; un-escaped, the backslash was
# consumed as an escape for ']' and therefore never matched.
re_tok = re.compile('([%s“”¨«»®´·º½¾¿¡§£₤‘’])' % re.escape(string.punctuation))
# Ordered (pattern, replacement) rewrites applied by clean_text.
# Order matters: a more specific rule must precede any shorter rule that would
# otherwise consume part of it ("what's" and "'scuse" before "'s", "can't"
# before "n't").
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Lower-case ``text``, expand English contractions and drop punctuation.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The lower-cased text with the contractions in ``_CONTRACTION_RULES``
        expanded, every non-word character replaced by a space, whitespace
        runs collapsed to a single space, and leading/trailing spaces removed.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Everything that is not a letter, digit or underscore becomes a space.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

def tokenize(s):
    """Split a raw comment into a list of tokens.

    The text is first normalised with ``clean_text``; every character matched
    by ``re_tok`` is then padded with spaces so that it becomes a token of its
    own, and the result is split on whitespace.

    Parameters
    ----------
    s : str
        Raw comment text.

    Returns
    -------
    list of str
        The tokens, in order of appearance.
    """
    # clean_text already turns non-word characters into spaces, so only
    # word characters that re_tok also lists (e.g. '_') are isolated here.
    normalized = clean_text(s)
    padded = re_tok.sub(r' \1 ', normalized)
    return padded.split()

In [None]:
# Try different vectorizers, n-gram sizes, stop words, and pruning of rare or
# overly frequent terms.

# Word-level TF-IDF over unigrams and bigrams, tokenized by `tokenize`.
word_vectorizer = TfidfVectorizer(analyzer='word',
                                  ngram_range=(1, 2),
                                  tokenizer=tokenize,
                                  stop_words='english',
                                  max_df=0.9,
                                  min_df=3,
                                  strip_accents='unicode',
                                  use_idf=True,
                                  smooth_idf=True,
                                  sublinear_tf=True,
                                  max_features=300000)

# Character-level TF-IDF over 1- to 4-grams.
# No `tokenizer` is passed: scikit-learn only applies `tokenizer` when
# analyzer == 'word', so the argument had no effect here and merely triggers
# a warning in recent releases.
char_vectorizer = TfidfVectorizer(analyzer='char',
                                  ngram_range=(1, 4),
                                  strip_accents='unicode',
                                  max_df=0.9,
                                  min_df=3,
                                  smooth_idf=True,
                                  sublinear_tf=True,
                                  max_features=300000)

# Alternative kept for reference: fit both vectorizers as a single union.
#vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)

# Vocabulary and IDF weights are learned from train and test text together;
# each split is then transformed separately.
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [None]:
# Concatenate char-level and word-level features column-wise (char columns
# first). CSR is requested explicitly because sparse.hstack returns COO by
# default, and COO does not support the row indexing used downstream.
# NOTE: the *_word_features names are rebound to the combined matrices, so
# re-running this cell would append the char features a second time.
train_word_features = sparse.hstack([train_char_features, train_word_features], format='csr')
test_word_features = sparse.hstack([test_char_features, test_word_features], format='csr')

In [None]:
import pickle
# Cache the combined train/test feature matrices on disk as a two-element list.
features_to_cache = [train_word_features, test_word_features]
with open('data.pkl', 'wb') as cache_file:
    pickle.dump(features_to_cache, cache_file)

In [None]:
import pickle
# Restore the cached feature matrices (train first, then test).
# NOTE: unpickling can execute arbitrary code; only load a data.pkl that was
# written locally by the dump cell.
with open('data.pkl', 'rb') as cache_file:
    cached_features = pickle.load(cache_file)
train_word_features, test_word_features = cached_features

Опубликуйте лучшее решение на [Kaggle Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/submit)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Logistic regression on features scaled by naive-Bayes log-count ratios.

    ``fit`` computes, per feature, the log of the ratio between its smoothed
    mean value among samples labelled 1 and among samples labelled 0. Every
    feature column is multiplied by that ratio before a class-balanced
    ``LogisticRegression`` is trained on (or queried with) the data.

    Parameters
    ----------
    C : float, default=3.15
        Forwarded to ``LogisticRegression(C=...)``.
    dual : bool, default=False
        Forwarded to ``LogisticRegression(dual=...)``.
    n_jobs : int, default=1
        Forwarded to ``LogisticRegression(n_jobs=...)``.

    Attributes
    ----------
    classes_ : ndarray
        Class labels seen by the underlying logistic regression (set by fit).
    """

    def __init__(self, C=3.15, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def _scale(self, x):
        """Return ``x`` as CSR with each column multiplied by its log-ratio."""
        # csr_matrix() leaves CSR input as is and converts dense or other
        # sparse formats, so every input type supports .multiply().
        return sparse.csr_matrix(x).multiply(self._r)

    def predict(self, x):
        """Return the predicted class label for every row of ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(self._scale(x))

    def predict_proba(self, x):
        """Return per-class probabilities for every row of ``x``.

        Columns follow the order of ``classes_``.
        """
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(self._scale(x))

    def fit(self, x, y):
        """Fit the log-count ratios and the logistic regression.

        Parameters
        ----------
        x : sparse matrix or array-like, shape (n_samples, n_features)
            Feature matrix.
        y : array-like, shape (n_samples,)
            Binary targets; the ratio is computed between labels 1 and 0.

        Returns
        -------
        self
        """
        # Check that X and y have correct shape. accept_sparse='csr' converts
        # other sparse formats (e.g. the COO output of sparse.hstack) to CSR,
        # which the boolean row indexing below requires.
        x, y = check_X_y(x, y, accept_sparse='csr')
        # Dense input comes back from check_X_y as an ndarray; make it CSR too.
        x = sparse.csr_matrix(x)

        def pr(x, y_i, y):
            # Add-one smoothed mean feature value over samples of class y_i.
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C,
                                       dual=self.dual,
                                       class_weight='balanced',
                                       solver='newton-cg',
                                       max_iter=1000,
                                       tol=0.0001,
                                       n_jobs=self.n_jobs).fit(x_nb, y)
        # Expose the fitted labels under the conventional attribute name.
        self.classes_ = self._clf.classes_
        return self

In [None]:
# Output frames seeded with the row ids; one prediction column per class is
# added to each of them later.
submission = pd.DataFrame({'id': test['id']})
train_submission = pd.DataFrame({'id': train['id']})

In [None]:
# Number of models trained per class; also used as the worker-process count.
predictors = 10

In [None]:
def _roc_auc(classifier, X, y):
    """Return the ROC-AUC of ``classifier``'s second-column probabilities on (X, y)."""
    return roc_auc_score(y, classifier.predict_proba(X)[:, 1])


def training(X_train, X_test, y_train, y_test, i):
    """Train one bagged model and score it on the train and validation splits.

    A sample of 80% of the training rows is drawn with replacement, seeded by
    ``i``, and an ``NbSvmClassifier(C=0.45)`` is fitted on it.

    Parameters
    ----------
    X_train, X_test : sparse matrix
        Training and validation features; ``X_train`` must support indexing
        by an integer array.
    y_train, y_test : ndarray
        Matching binary targets.
    i : int
        Seed for the row sample.

    Returns
    -------
    tuple
        ``(train_score, val_score, train_proba, proba)``: ROC-AUC on the full
        training and validation splits, then the second-column probabilities
        for the module-level ``train_word_features`` and ``test_word_features``.
    """
    # A local RandomState yields the same draws as np.random.seed(i) followed
    # by np.random.choice, without touching the global RNG state.
    rng = np.random.RandomState(i)
    ids = rng.choice(np.arange(len(y_train), dtype=np.int32),
                     size=int(len(y_train) * 0.8))
    x = X_train[ids]
    y = y_train[ids]
    classifier = NbSvmClassifier(C=0.45)
    classifier.fit(x, y)
    # The original called `eval_roc`, which is not defined in this file;
    # _roc_auc (defined above) is used in its place.
    train_score = _roc_auc(classifier, X_train, y_train)
    val_score = _roc_auc(classifier, X_test, y_test)
    # NOTE: these two matrices are read from module globals, not parameters.
    train_proba = classifier.predict_proba(train_word_features)[:, 1]
    proba = classifier.predict_proba(test_word_features)[:, 1]
    return train_score, val_score, train_proba, proba

In [None]:
from tqdm import tqdm
import concurrent.futures

# For every class: hold out a validation split, train `predictors` bagged
# models in parallel worker processes, report the mean ROC-AUC, and store the
# averaged probabilities for the train and test rows.
scores = []
for class_name in class_names:
    print('Class: %s' % class_name)
    train_target = np.array(train[class_name])

    # 80/20 split with a fixed seed. NOTE: the split is not stratified
    # (a `stratify=train_target` argument was left commented out).
    X_train, X_test, y_train, y_test = train_test_split(
        train_word_features,
        train_target,
        test_size=0.2,
        random_state=0xCAFFE,
    )

    train_score, val_score = [], []
    train_probas, probas = [], []
    with concurrent.futures.ProcessPoolExecutor(max_workers=predictors) as executor:
        futures = [
            executor.submit(training, X_train, X_test, y_train, y_test, i)
            for i in range(predictors)
        ]
        # Results are collected in completion order.
        for future in concurrent.futures.as_completed(futures):
            t_score, v_score, train_proba, proba = future.result()
            train_score.append(t_score)
            val_score.append(v_score)
            train_probas.append(train_proba)
            probas.append(proba)

    mean_val_score = np.mean(val_score)
    scores.append(mean_val_score)
    print('\tTrain ROC-AUC: %s' % np.mean(train_score))
    print('\tVal ROC-AUC: %s' % mean_val_score)
    # Average the per-model probabilities row by row.
    submission[class_name] = np.mean(probas, axis=0)
    train_submission[class_name] = np.mean(train_probas, axis=0)
print('Total: %s' % np.mean(scores))

In [None]:
# Preview the first rows of the test-set predictions (notebook cell output).
submission.head()

In [None]:
# Write the test-set and train-set predictions to CSV without the index column.
for frame, csv_name in (
    (submission, 'submission_nb_logistic_regression_010.csv'),
    (train_submission, 'train_nb_logistic_regression_010.csv'),
):
    frame.to_csv(csv_name, index=False)