# Classify toxicity and then rank toxicity

This kernel is inspired by Rhodium Beng's kernel "Classifying multi-label comments with Logistic Regression", made during the Jigsaw Toxic Comment Classification Challenge.

The idea is to first build a classifier that predicts the different types of toxicity, and then use its predictions to compute a toxicity score.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import string
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import re
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# English stop-word list from the NLTK corpus; used for the membership test
# in remove_stopwords_sentence.
stop = stopwords.words('english')
# Punctuation characters from the `string` module; used to build the
# character class in the 'remove punctuation' step of prep_pipeline.
punctuation = string.punctuation

## Load training and test data

In [None]:
# training data: load train.csv from the Kaggle input directory
df_train = pd.read_csv(
    "../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv")

In [None]:
# test data: load test.csv from the same Kaggle input directory
df_test = pd.read_csv(
    "../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/test.csv")

In [None]:
# target columns: the six label columns selected from df_train to build `y`
# and from the labelled test rows when scoring
cols_target = ['toxic', 'obscene', 'insult',
               'threat', 'severe_toxic', 'identity_hate']

## Clean up the comment text

In [None]:
# function copied from Rhodium Beng
# Ordered (pattern, replacement) pairs applied by clean_text. Order matters:
# "'scuse" must be handled before the generic "'s" rule (otherwise "'s" is
# stripped first and "'scuse" can never match), and "can't" before "n't".
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Normalise a raw comment into lower-case, space-separated word tokens.

    Steps, in order:
      1. lower-case the text;
      2. expand common English contractions (see ``_CONTRACTION_RULES``);
      3. replace every non-word character with a space;
      4. collapse runs of whitespace and trim leading/trailing spaces.

    Parameters
    ----------
    text : str
        The raw comment text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # raw strings: '\W' and '\s' are not valid Python string escapes
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [None]:
# normalise every training comment with clean_text (in place, same column)
df_train['comment_text'] = df_train['comment_text'].apply(clean_text)

In [None]:
# normalise every test comment with clean_text (in place, same column)
df_test['comment_text'] = df_test['comment_text'].map(clean_text)

In [None]:
def remove_stopwords_sentence(sentence):
    """Split the first element of *sentence* on whitespace and drop stop words.

    Returns a ``pd.Series`` holding, in order, the words that are not in the
    module-level ``stop`` list.
    """
    kept_words = []
    for token in sentence[0].split():
        if token in stop:
            continue
        kept_words.append(token)
    return pd.Series(kept_words)


def remove_stopwords_df(df):
    """Apply remove_stopwords_sentence to every row of *df* and return the result."""
    # axis=1 -> the helper receives one row at a time
    without_stopwords = df.apply(remove_stopwords_sentence, axis=1)
    return without_stopwords


def stem_sentence(s):
    """Stem every non-missing word of *s* and join the stems with single spaces.

    *s* is iterated element by element; entries for which ``pd.isna`` is true
    are skipped. Returns the joined stems as one string.
    """
    stemmer = EnglishStemmer()
    stems = []
    for word in s:
        if pd.isna(word):
            continue
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

In [None]:
# Text-preparation pipeline. The steps run in order:
#   1. replace newlines with a space
#   2. drop digits
#   3. drop HTML-like tags (non-greedy '<...>')
#   4. lower-case
#   5. drop punctuation
#   6. drop stop words (remove_stopwords_df)
#   7. stem the remaining words and re-join them into one string per row
#   8. TF-IDF vectorisation on whitespace tokens (1- to 3-grams, min_df=5)
# The 'lower' and 'remove punctuation' steps squeeze along axis=1, so they
# expect a single-column DataFrame as input.
prep_pipeline = Pipeline(steps=[
    ('remove \n', FunctionTransformer(pd.DataFrame.replace, kw_args={
     'to_replace': '\n', 'value': ' ', 'regex': True}, validate=False)),
    ('remove numbers', FunctionTransformer(pd.DataFrame.replace, kw_args={
     'to_replace': r'\d', 'value': '', 'regex': True}, validate=False)),
    ('remove html tags', FunctionTransformer(pd.DataFrame.replace,
                                             kw_args={'to_replace': '<.*?>', 'value': '', 'regex': True}, validate=False)),
    ('lower', FunctionTransformer(lambda x: x.squeeze(
        axis=1).str.lower().to_frame(), validate=False)),
    # re.escape keeps ']', '\\', '^' and '-' literal inside the character
    # class (unescaped, the backslash in string.punctuation escapes ']' and
    # is itself never matched); regex=True is explicit because str.replace
    # treats the pattern as a literal string by default in pandas >= 2.0.
    ('remove punctuation', FunctionTransformer(lambda x: x.squeeze(
        axis=1).str.replace('[{}]'.format(re.escape(punctuation)), '', regex=True).to_frame(), validate=False)),
    ('remove stopwords', FunctionTransformer(remove_stopwords_df, validate=False)),
    ('stemming', FunctionTransformer(pd.DataFrame.apply, kw_args={
     'func': stem_sentence, 'axis': 1}, validate=False)),
    #('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('vectorizer', TfidfVectorizer(lowercase=False, analyzer='word', ngram_range=(1, 3), min_df=5,
                                   preprocessor=None, tokenizer=lambda i: str(i).split()))
])

## Vectorize the data

In [None]:
# feature text: the (already cleaned) comment column of each frame
X = df_train['comment_text']
test_X = df_test['comment_text']

In [None]:
# sanity check: shapes of the training and test comment series
print(X.shape, test_X.shape)

In [None]:
# create y: the training frame restricted to the label columns in cols_target
y = df_train[cols_target]

In [None]:
# TfidfVectorizer capped at 5000 features, using scikit-learn's built-in
# English stop-word list; the bare `vect` displays the configured estimator
vect = TfidfVectorizer(max_features=5000, stop_words='english')
vect

In [None]:
# fit the vectorizer on the training comments only (learns the vocabulary),
# then use it to create the training document-term matrix
X_dtm = vect.fit_transform(X)

In [None]:
# transform (no refit) the test comments with the vocabulary fitted on the
# training data, into a document-term matrix
test_X_dtm = vect.transform(test_X)

## Classifier Chains

In [None]:
# Stand-alone logistic regression with C=12.0.
# NOTE(review): no later cell visible here references `logreg` - confirm
# whether it is still needed.
logreg = LogisticRegression(C=12.0)

In [None]:
# chain = ClassifierChain(LogisticRegression(), order='random', random_state=0)

# chain.fit(X_dtm,y)

In [None]:
# parameters for the Grid Search: regularisation strength and penalty of the
# per-label logistic regression, plus the label order used by the chain
params = {
    'base_estimator__C': [0.5, 1, 2],
    'base_estimator__penalty': ['l1', 'l2'],
    'order': [None, 'random'],
}

# create the grid.
# solver='liblinear' is set explicitly because the grid searches over both
# 'l1' and 'l2' penalties: liblinear supports both, whereas the default
# solver of recent scikit-learn versions rejects 'l1', which would make half
# of the candidates fail to fit.
grid = GridSearchCV(
    estimator=ClassifierChain(LogisticRegression(solver='liblinear'),
                              order='random', random_state=0),
    param_grid=params, scoring='accuracy', cv=5)

# compute the grid search (5-fold cross-validation for every candidate)
grid.fit(X_dtm, y)

In [None]:
# display the estimator selected by the grid search
grid.best_estimator_

In [None]:
# display the cross-validation results recorded by the grid search
grid.cv_results_

Best parameters: C=1, penalty=l1, order=None

In [None]:
# compute the predictions on the training data.
# The only assignment to `chain` in this notebook is commented out, so bind
# it here to the best estimator found by the grid search (refit on the whole
# training set by GridSearchCV's default refit=True).
chain = grid.best_estimator_
y_pred_X = chain.predict(X_dtm)

In [None]:
# accuracy of the predictions on the training data.
# NOTE(review): with multi-label targets accuracy_score presumably counts a
# row as correct only when every label matches - confirm against the docs.
accuracy_score(y, y_pred_X)

Training accuracy: 0.923

In [None]:
# labels for the test set, loaded from test_labels.csv
y_test = pd.read_csv(
    '../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/test_labels.csv')

In [None]:
# attach the label columns to the test comments by `id`; the left join keeps
# every row of df_test
df_test = df_test.merge(y_test, on='id', how='left')

In [None]:
# keep only the labelled test rows, i.e. those whose `toxic` value is not -1
df_test2 = df_test.loc[df_test['toxic'] != -1]

In [None]:
# document-term matrix of the labelled test comments, built with the
# vectorizer fitted on the training data
df_test2_dtm = vect.transform(df_test2.comment_text)

In [None]:
# compute the test accuracy on the labelled test rows.
# The only assignment to `chain` in this notebook is commented out, so
# predict with the best estimator found (and refit) by the grid search.
y_pred_test = grid.best_estimator_.predict(df_test2_dtm)
accuracy_score(df_test2[cols_target], y_pred_test)

Test accuracy: 0.92