###  Multilabel classification is a problem where multiple target labels can be assigned to each observation instead of only one like in multiclass classification.

In [1]:
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split  
from sklearn.model_selection import  GridSearchCV
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_colwidth', -1)



In [2]:
train_data = pd.read_csv("./train.csv", sep=",")
test_data  = pd.read_csv("./test.csv", sep=",")

In [3]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


In [4]:
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO."
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """
3,00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message."
4,00017695ad8997eb,I don't anonymously edit articles at all.


### Converting labels into an array

In [5]:
labels_ = train_data.iloc[:,2:8].columns.values

In [6]:
y_label = train_data.iloc[:,2:8].values

### Preprocessing Comments

In [8]:
def preprocess_string(x):
    punctuations = '''!()-[]{};:\n'\t"\,<>./?@#+$%^&*_~'''
    no_punct = ""
    for char in x:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct.lower()

In [9]:
preprocess_comment = train_data.comment_text.map(lambda x: preprocess_string(x))

In [10]:
preprocess_comment.head(1)

0    explanationwhy the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now892053827
Name: comment_text, dtype: object

### Splitting into Training and Test Data for Validation

In [11]:
x_train, x_test, y_train, y_test = train_test_split(preprocess_comment.values, y_label, test_size=0.2, random_state=10)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(127656,) (127656, 6)
(31915,) (31915, 6)


### Using GridSearch for tuning the hyper-parameters 

In [12]:
pipeline_rf = Pipeline([
    ('bow', CountVectorizer(analyzer='word',stop_words='english',strip_accents='unicode',token_pattern=r'\w{1,}')),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier(n_jobs=-1))
])


param_rf = {
    'bow__ngram_range':[(1,1),(1,2)],
    'bow__max_df':[0.9],
    'tfidf__norm':['l1','l2'],
    'tfidf__use_idf':[False],
    'classifier__n_estimators':[25]
   }


gridcv_rf = GridSearchCV(
    pipeline_rf, 
    param_grid=param_rf, 
    refit=True, 
    iid=True,
    n_jobs=-1,  
    scoring='roc_auc',
    verbose=1,  
    cv=5  
)

In [13]:
rf_train = gridcv_rf.fit(x_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 69.1min finished


### Analysis

In [14]:
gridcv_rf.best_estimator_

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [15]:
gridcv_rf.best_score_

0.9262861860567244

In [16]:
gridcv_rf.best_params_

{'bow__max_df': 0.9,
 'bow__ngram_range': (1, 1),
 'classifier__n_estimators': 25,
 'tfidf__norm': 'l2',
 'tfidf__use_idf': False}

In [17]:
rf_predict = rf_train.predict(x_test)

In [18]:
print(accuracy_score(y_test,rf_predict))

0.916465611781294


In [19]:
print(classification_report(y_test,rf_predict, target_names=labels_))

               precision    recall  f1-score   support

        toxic       0.88      0.59      0.71      3006
 severe_toxic       0.53      0.10      0.17       312
      obscene       0.88      0.64      0.74      1673
       threat       0.55      0.06      0.10       109
       insult       0.77      0.46      0.58      1561
identity_hate       0.69      0.08      0.14       275

  avg / total       0.83      0.53      0.63      6936



### Predicting Output on Test Dataset

In [20]:
test_preprocess_comment = test_data.comment_text.map(lambda x: preprocess_string(x))

In [21]:
test_preprocess_comment.head(1)

0    yo bitch ja rule is more succesful then youll ever be whats up with you and hating you sad mofuckasi should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me ja rule is about pride in da music man dont diss that shit on him and nothin is wrong bein like tupac he was a brother toofuckin white boys get things right next time
Name: comment_text, dtype: object

In [22]:
predictions = rf_train.predict(test_preprocess_comment.values)

In [23]:
predictions[0]

array([1., 0., 1., 0., 1., 0.])

In [24]:
for i in range(len(labels_)):
    test_data[labels_[i]] = predictions[:,i]

### Output: Probability for each Toxic Comment

In [25]:
test_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",1.0,0.0,1.0,0.0,1.0,0.0
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.",0.0,0.0,0.0,0.0,0.0,0.0
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """,0.0,0.0,0.0,0.0,0.0,0.0
3,00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.",0.0,0.0,0.0,0.0,0.0,0.0
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.0,0.0,0.0,0.0,0.0,0.0
5,0001ea8717f6de06,Thank you for understanding. I think very highly of you and would not revert without discussion.,0.0,0.0,0.0,0.0,0.0,0.0
6,00024115d4cbde0f,"Please do not add nonsense to Wikipedia. Such edits are considered vandalism and quickly undone. If you would like to experiment, please use the sandbox instead. Thank you. -",0.0,0.0,0.0,0.0,0.0,0.0
7,000247e83dcc1211,:Dear god this site is horrible.,0.0,0.0,0.0,0.0,0.0,0.0
8,00025358d4737918,""" \n Only a fool can believe in such numbers. \n The correct number lies between 10 000 to 15 000. \n Ponder the numbers carefully. \n\n This error will persist for a long time as it continues to reproduce... The latest reproduction I know is from ENCYCLOPÆDIA BRITANNICA ALMANAC 2008 wich states \n Magnittude: 8.7 (fair enough) \n victims: 70 000 (today 10 000 to 15 000 is not """"a lot"""" so I guess people just come out with a number that impresses enough, I don't know. But I know this: it's just a shameless lucky number that they throw in the air. \n GC \n\n """,0.0,0.0,0.0,0.0,0.0,0.0
9,00026d1092fe71cc,"== Double Redirects == \n\n When fixing double redirects, don't just blank the outer one, you need edit it to point it to the final target, unless you think it's inappropriate, in which case, it needs to be nominated at WP:RfD",0.0,0.0,0.0,0.0,0.0,0.0
