# Classify toxicity and then rank toxicity

This kernel is inspired by Rhodium Beng's kernel *Classifying multi-label comments with Logistic Regression*.


In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

## Load training and test data

In [2]:
# Load the training split; the label columns listed in cols_target are read from this frame when fitting.
df_train = pd.read_csv("../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv")


In [3]:
# Load the test comments (their labels come from test_labels.csv, merged in further down).
df_test = pd.read_csv("../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/test.csv")


In [4]:
# Toxicity label columns, in the order they are passed to the classifier chain.
cols_target = [
    'obscene',
    'insult',
    'toxic',
    'severe_toxic',
    'identity_hate',
    'threat',
]

## Clean up the comment text

In [5]:
def clean_text(text):
    """Normalise a raw comment into lower-case, space-separated word tokens.

    Common English contractions are expanded, every non-word character is
    replaced by a space, runs of whitespace are collapsed to a single space,
    and leading/trailing spaces are removed.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment.
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    # "'scuse" has to be expanded before the generic "'s" rule below;
    # otherwise that rule strips its "'s" first and leaves the fragment "cuse".
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    # "can't" must be handled before the generic "n't" rule ("ca not" otherwise).
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Raw strings: "\W" and "\s" are regex classes, not Python string escapes.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = text.strip(' ')
    return text

In [6]:
# Normalise every training comment in place by passing clean_text directly to apply.
df_train['comment_text'] = df_train.comment_text.apply(clean_text)

In [7]:
# Normalise every comment in df_test in place with clean_text.
df_test['comment_text'] = df_test['comment_text'].map(lambda comment: clean_text(comment))


## Define X from entire train & test data for use in tokenization by Vectorizer

In [8]:
# Text inputs for the vectorizer: the cleaned comments of each split.
X, test_X = df_train['comment_text'], df_test['comment_text']

In [9]:
# Sanity check: number of comments in the train and test splits.
print(X.shape, test_X.shape)

(159571,) (153164,)


## Vectorize the data

In [10]:
# Set up the TF-IDF vectorizer: vocabulary limited to 5000 features,
# with scikit-learn's built-in English stop-word list removed.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vect = TfidfVectorizer(stop_words='english', max_features=5000)
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
# learn the vocabulary from the training comments (X), then use it to create a document-term matrix
X_dtm = vect.fit_transform(X)
# examine the document-term matrix created from X
X_dtm

<159571x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 3178792 stored elements in Compressed Sparse Row format>

In [12]:
# transform the test comments (test_X) with the vocabulary fitted on the training data
test_X_dtm = vect.transform(test_X)
# examine the document-term matrix created from test_X
test_X_dtm

<153164x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 2618972 stored elements in Compressed Sparse Row format>

## Classifier Chains - build a multi-label classifier using Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# NOTE(review): `logreg` is only referenced inside the disabled (string-quoted) loop further down;
# the ClassifierChain cell constructs its own LogisticRegression(C=12.0).
logreg = LogisticRegression(C=12.0)

In [14]:
# create a function to add features
def add_feature(X, feature_to_add):
    """Append one or more feature columns to the right of a sparse matrix.

    `feature_to_add` is converted to a sparse matrix and transposed, so a flat
    sequence becomes a single new column and a list of sequences becomes one
    new column per sequence. The result is returned in CSR format.
    """
    from scipy import sparse

    new_columns = sparse.csr_matrix(feature_to_add).transpose()
    return sparse.hstack([X, new_columns], format='csr')

In [15]:
# Disabled hand-rolled classifier chain, kept for reference only: the whole loop is wrapped
# in a string literal, so nothing in it executes (the cell merely echoes the string).
# NOTE(review): the `y = ...` line inside looks like a paste artifact (an import statement
# spliced into `df_train[label]`), so the loop would not run if un-quoted as-is.
# The ClassifierChain cell that follows is the implementation actually used.
"""
for label in cols_target:
    print('... Processing {}'.format(label))
    y = df_trainfrom sklearn.multioutput import ClassifierChain[label]
    # train the model using X_dtm & y
    logreg.fit(X_dtm,y)
    # compute the training accuracy
    y_pred_X = logreg.predict(X_dtm)
    print('Training Accuracy is {}'.format(accuracy_score(y,y_pred_X)))
    # make predictions from test_X
    test_y = logreg.predict(test_X_dtm)
    test_y_prob = logreg.predict_proba(test_X_dtm)[:,1]
    #submission_chains[label] = test_y_prob
    # chain current label to X_dtm
    X_dtm = add_feature(X_dtm, y)
    print('Shape of X_dtm is now {}'.format(X_dtm.shape))
    # chain current label predictions to test_X_dtm
    test_X_dtm = add_feature(test_X_dtm, test_y)
    print('Shape of test_X_dtm is now {}'.format(test_X_dtm.shape))
    
    """

"\nfor label in cols_target:\n    print('... Processing {}'.format(label))\n    y = df_trainfrom sklearn.multioutput import ClassifierChain[label]\n    # train the model using X_dtm & y\n    logreg.fit(X_dtm,y)\n    # compute the training accuracy\n    y_pred_X = logreg.predict(X_dtm)\n    print('Training Accuracy is {}'.format(accuracy_score(y,y_pred_X)))\n    # make predictions from test_X\n    test_y = logreg.predict(test_X_dtm)\n    test_y_prob = logreg.predict_proba(test_X_dtm)[:,1]\n    #submission_chains[label] = test_y_prob\n    # chain current label to X_dtm\n    X_dtm = add_feature(X_dtm, y)\n    print('Shape of X_dtm is now {}'.format(X_dtm.shape))\n    # chain current label predictions to test_X_dtm\n    test_X_dtm = add_feature(test_X_dtm, test_y)\n    print('Shape of test_X_dtm is now {}'.format(test_X_dtm.shape))\n    \n    "

In [16]:
from sklearn.multioutput import ClassifierChain

# Multi-label target: one column per toxicity label.
y = df_train[cols_target]

# Chain of logistic regressions over the labels; the chain order is randomised,
# with random_state=0 fixing that order between runs.
chain = ClassifierChain(
    LogisticRegression(C=12.0),
    order='random',
    random_state=0,
)

chain.fit(X_dtm, y)

ClassifierChain(base_estimator=LogisticRegression(C=12.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=0)

In [17]:
# predict the label matrix for the training set (scored against y in the next cell)
y_pred_X = chain.predict(X_dtm)

In [18]:
# Training accuracy. With multi-label targets accuracy_score is subset accuracy:
# a comment counts as correct only when all of its labels are predicted exactly.
accuracy_score(y_true=y, y_pred=y_pred_X)

0.9336721584749108

In [19]:
# Load the test labels, keyed by `id` (merged onto df_test in the next cell).
y_test = pd.read_csv('../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/test_labels.csv')

In [20]:
# Attach the test labels to the test comments by id.
# Guarded so the cell can be re-run safely: merging a second time would duplicate the
# label columns under `_x` / `_y` suffixes and break the `df_test.toxic` and
# `df_test2[cols_target]` lookups in the cells that follow.
if not set(cols_target).issubset(df_test.columns):
    df_test = df_test.merge(y_test, on='id', how='left')

In [21]:
# Keep only test rows whose `toxic` label is not -1.
# NOTE(review): presumably -1 marks rows without usable labels; confirm against the dataset description.
df_test2 = df_test[df_test['toxic'].ne(-1)]

In [22]:
# Vectorize the retained test comments with the already-fitted vectorizer
# (comment_text was cleaned in place on df_test earlier, so no further cleaning is applied here).
df_test2_dtm = vect.transform(df_test2.comment_text)

In [23]:
# compute the test accuracy on the rows kept in df_test2
# (with multi-label targets, accuracy_score requires every label of a row to match)
y_pred_test = chain.predict(df_test2_dtm)
accuracy_score(df_test2[cols_target],y_pred_test)

0.8824127043671262

## Apply ranking

In [24]:
# Pairwise validation set: each row holds a `less_toxic` and a `more_toxic` comment.
validation_data = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [25]:
# Preview the first rows of the validation pairs.
validation_data.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


In [26]:
# Apply the same text normalisation used for the training comments to the `less_toxic` side.
validation_data['less_toxic'] = validation_data['less_toxic'].apply(clean_text)

In [27]:
# Apply the same text normalisation used for the training comments to the `more_toxic` side.
validation_data['more_toxic'] = validation_data['more_toxic'].apply(clean_text)

In [28]:
# Vectorize both sides of each validation pair with the fitted vectorizer
# (val1 = less_toxic, val2 = more_toxic).
val1_dtm, val2_dtm = (
    vect.transform(validation_data[side])
    for side in ('less_toxic', 'more_toxic')
)

In [29]:
# Per-label probabilities for each side of the validation pairs.
val1_pred, val2_pred = map(chain.predict_proba, (val1_dtm, val2_dtm))

In [30]:
# Collapse the per-label probabilities into one toxicity score per comment by summing across labels.
val1_pred_sum = val1_pred.sum(axis=1)
val2_pred_sum = val2_pred.sum(axis=1)

In [31]:
# Fraction of validation pairs in which the `more_toxic` comment gets the strictly higher score
# (ties count as misses).
(val1_pred_sum < val2_pred_sum).mean()

0.675036535140162

In [32]:
# Comments to be scored for the submission; the text is read from the `text` column below.
comments_to_score = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [33]:
# Apply the same text normalisation used for the training comments.
comments_to_score['text'] = comments_to_score['text'].apply(clean_text)

In [34]:
# Vectorize the comments with the fitted vectorizer, then get per-label probabilities from the chain.
comments_to_score_dtm = vect.transform(comments_to_score.text)
predictions = chain.predict_proba(comments_to_score_dtm)

In [35]:
# One ranking score per comment: the sum of its per-label probabilities.
pred_sum = predictions.sum(axis=1)

In [36]:
# Sanity check: one score per comment to be scored.
pred_sum.shape

(7537,)

In [37]:
# Submission template; its `score` column is overwritten in the next cell.
sample_submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')

In [38]:
# Scores are assigned positionally.
# NOTE(review): assumes sample_submission rows are in the same order as comments_to_score — TODO confirm.
sample_submission['score'] = pred_sum

In [39]:
# Write the submission file, omitting the DataFrame index.
sample_submission.to_csv('./submission.csv', index=False)