# Homework 2 - TF-IDF Classifier

Цель обучить классификатор который будет находить "токсичные" комментарии и опубликовать решения на Kaggle [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)


Данные можно скачать тут - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data



# Input

In [467]:
import numpy as np
import pandas as pd
import eli5

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train = pd.read_csv('train.csv').fillna(' ')
test = pd.read_csv('test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

# Most popular word

In [315]:
word_vect = TfidfVectorizer( binary= True )
words = word_vect.fit_transform(all_text)
word_vect.get_feature_names()[words.sum(axis =0 ).argmax()]

'the'

# TfidfVectorizer and LogisticRegression

In [395]:
word_vectorizer = TfidfVectorizer( binary= True , lowercase=True , min_df = 3 , ngram_range = (1 , 1))
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
word_vectorizer

TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [396]:
classifier = LogisticRegression(C =2.3 , solver ='sag' , n_jobs =-1  ) 
classifier

LogisticRegression(C=2.3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [397]:
scores= []
for class_name in class_names:
    train_target = train[class_name]
    cv_score = np.mean(cross_val_score(classifier, train_word_features, train_target, scoring='roc_auc'))
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.972707503712865
CV score for class severe_toxic is 0.9849429551050419
CV score for class obscene is 0.9858540528402363
CV score for class threat is 0.9878726550640627
CV score for class insult is 0.9782607820060188
CV score for class identity_hate is 0.975904393839754
Total score is 0.9809237237613297


# My best solution

### Parameter selection

в ходе эксперимента было выяснено , что в моей модели vectorizer.fit(train_text) дает больше скора на кагл чем vectorizer.fit(all_text)
 vectorizer.fit(train_text) --> на кагл 0.9765 /
 vectorizer.fit(all_text) --> на кагл 0.9761


In [439]:
vectorizer = {}
train_features = {}
test_features = {}
classifier = {}
score = {}

#### toxic

In [440]:
targ = 'toxic'
vectorizer[targ] = TfidfVectorizer( binary= True , min_df = 3,  lowercase=True , ngram_range= (1,1) )
vectorizer[targ].fit(all_text)
train_features[targ] = vectorizer[targ].transform(train_text)
#test_features[targ] = vectorizer[targ].transform(test_text)

In [441]:
targ = 'toxic'
classifier[targ] = LogisticRegression(C =3.5 , solver ='sag' , n_jobs =-1  )

In [442]:
targ = 'toxic'
score[targ] = np.mean(cross_val_score(classifier[targ], train_features[targ], train[targ], scoring='roc_auc'))
print(str(score[targ]) + ' current')

0.9728747742665352 current


#### severe_toxic

In [443]:
targ = 'severe_toxic'
vectorizer[targ] = TfidfVectorizer( binary= True , min_df = 3,  lowercase=True , ngram_range= (1,2) )
vectorizer[targ].fit(all_text)
train_features[targ] = vectorizer[targ].transform(train_text)
#test_features[targ] = vectorizer[targ].transform(test_text)

In [444]:
targ = 'severe_toxic'
classifier[targ] = LogisticRegression(C =2 , solver ='sag' , n_jobs =-1  )

In [445]:
targ = 'severe_toxic'
score[targ] = np.mean(cross_val_score(classifier[targ], train_features[targ], train[targ], scoring='roc_auc'))
print(str(score[targ]) + ' current')

0.9857956177490866 current


#### obscene

In [446]:
targ = 'obscene'
vectorizer[targ] = TfidfVectorizer( binary= True , min_df = 3,  lowercase=True , ngram_range= (1,1) )
vectorizer[targ].fit(all_text)
train_features[targ] = vectorizer[targ].transform(train_text)
#test_features[targ] = vectorizer[targ].transform(test_text)

In [447]:
targ = 'obscene'
classifier[targ] = LogisticRegression(C =2.5 , solver ='sag' , n_jobs =-1  )

In [448]:
targ = 'obscene'
score[targ] = np.mean(cross_val_score(classifier[targ], train_features[targ], train[targ], scoring='roc_auc'))
print(str(score[targ]) + ' current')

0.9858725782679101 current


#### threat

In [449]:
targ = 'threat'
vectorizer[targ] = TfidfVectorizer( binary= True , min_df = 4,  lowercase=True , ngram_range= (1,2) )
vectorizer[targ].fit(all_text)
train_features[targ] = vectorizer[targ].transform(train_text)
#test_features[targ] = vectorizer[targ].transform(test_text)

In [450]:
targ = 'threat'
classifier[targ] = LogisticRegression(C =7 , solver ='sag' , n_jobs =-1  )

In [451]:
targ = 'threat'
score[targ] = np.mean(cross_val_score(classifier[targ], train_features[targ], train[targ], scoring='roc_auc')) 
print(str(score[targ]) + ' current')

0.9891099555926052 current


#### insult

In [452]:
targ = 'insult'
vectorizer[targ] = TfidfVectorizer( binary= True , min_df = 5,  lowercase=True , ngram_range= (1,1) )
vectorizer[targ].fit(all_text)
train_features[targ] = vectorizer[targ].transform(train_text)
#test_features[targ] = vectorizer[targ].transform(test_text)

In [453]:
targ = 'insult'
classifier[targ] = LogisticRegression(C =2 , solver ='sag' , n_jobs =-1  )

In [454]:
targ = 'insult'
score[targ] = np.mean(cross_val_score(classifier[targ], train_features[targ], train[targ], scoring='roc_auc'))
print(str(score[targ]) + ' current')

0.9782512376825944 current


#### identity_hate

In [455]:
targ = 'identity_hate'
vectorizer[targ] = TfidfVectorizer( binary= True , min_df = 3,  lowercase=True , ngram_range= (1,1) )
vectorizer[targ].fit(all_text)
train_features[targ] = vectorizer[targ].transform(train_text)
#test_features[targ] = vectorizer[targ].transform(test_text)

In [456]:
targ = 'identity_hate'
classifier[targ] = LogisticRegression(C =2 , solver ='sag' , n_jobs =-1  )

In [457]:
targ = 'identity_hate'
score[targ] = np.mean(cross_val_score(classifier[targ], train_features[targ], train[targ], scoring='roc_auc'))
print(str(score[targ]) + ' current')

0.9759325888553004 current


### Total score

Max Kaggle score : 0.9765

In [458]:
np.mean([score[el] for el in class_names ])

0.9813061254023387

#### submission

In [438]:
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    test_features[class_name] = vectorizer[class_name].transform(test_text)
    classifier[class_name].fit( train_features[class_name] , train[class_name])
    submission[class_name] = classifier[class_name].predict_proba(test_features[class_name])[:, 1] 
submission.to_csv('submission.csv', index=False)

# Model Analysis

In [459]:
target = 'toxic'
lr = classifier[target].fit(train_features[target] , train[target] )
eli5.explain_weights(lr , top = (100 , 100), feature_names = vectorizer[target].get_feature_names()  )

Weight?,Feature
+25.728,fuck
+21.806,fucking
+17.722,shit
+17.574,stupid
+17.189,idiot
+14.617,bullshit
+13.773,ass
+13.405,suck
+12.903,asshole
+12.117,bitch


In [464]:
textTMP = test_text.sample().iloc[0]
eli5.explain_prediction(lr,textTMP, vec=vectorizer[target], feature_names=vectorizer[target].get_feature_names())

Contribution?,Feature
3.136,<BIAS>
-3.039,Highlighted in text (sum)


---

---

---

# Greedy parameters selection 

TfidfVectorizer( min_df = 1....5  , ngram_range = (1 , 1...2 ))

LogisticRegression(C = 1....10 )

In [233]:
%%time
vectorizer_best_1_1 = []
for i in range(1,11):
    print('runing {} of 20'.format(i))
    tmp =TfidfVectorizer( binary= True , min_df = i ,  lowercase=True)
    tmp.fit(all_text)
    vectorizer_best_1_1.append({'train':tmp.transform(train_text) , 'test' :tmp.transform(test_text)})
vectorizer_best_1_2 = []
for i in range(1,11):
    print('runing {} of 20'.format(i+10))
    tmp =TfidfVectorizer( binary= True , min_df = i ,  lowercase=True , ngram_range = (1 , 2))
    tmp.fit(all_text)
    vectorizer_best_1_2.append({'train':tmp.transform(train_text) , 'test' :tmp.transform(test_text)})

runing 1 of 20
runing 2 of 20
runing 3 of 20
runing 4 of 20
runing 5 of 20
runing 6 of 20
runing 7 of 20
runing 8 of 20
runing 9 of 20
runing 10 of 20
runing 11 of 20
runing 12 of 20
runing 13 of 20
runing 14 of 20
runing 15 of 20
runing 16 of 20
runing 17 of 20
runing 18 of 20
runing 19 of 20
runing 20 of 20
Wall time: 24min 58s


In [246]:
%%time
best = {}
for targ in class_names:
    print(targ)
    maxScore = 0
    for i in range(5):
        train_featuresTMP = vectorizer_best_1_1[i]['train']
        for tmpC in range(1,11):
            classifierTMP = LogisticRegression(C =tmpC , solver ='sag' , n_jobs =-1  )
            score = np.mean(cross_val_score(classifierTMP, train_featuresTMP, train[targ], scoring='roc_auc'))
            str_text = ' 1_1 | min_df : {} | C : {} | score {}'.format(i+1 , tmpC , score)
            print(str_text)
            if score > maxScore:
                maxScore = score
                best[targ] = {'text': str_text , 'vectorizer':vectorizer_best_1_1[i] ,'classifier':classifierTMP , 'score': score }
    maxScore = 0
    for i in range(5):
        train_featuresTMP = vectorizer_best_1_2[i]['train']
        for tmpC in range(1,11):
            classifierTMP = LogisticRegression(C =tmpC , solver ='sag' , n_jobs =-1  )
            score = np.mean(cross_val_score(classifierTMP, train_featuresTMP, train[targ], scoring='roc_auc'))
            str_text = ' 1_2 | min_df : {} | C : {} | score {}'.format(i+1 , tmpC , score)
            print(str_text)
            if score > maxScore:
                maxScore = score
                best[targ+'2'] = {'text': str_text , 'vectorizer':vectorizer_best_1_2[i] ,'classifier':classifierTMP , 'score': score}
    print(' best  '+ targ +' :' + best[targ]['text'])
    print(' best2 '+ targ +' :' + best[targ + '2']['text'])

toxic
 1_1 | min_df : 1 | C : 1 | score 0.9699451966172824
 1_1 | min_df : 1 | C : 2 | score 0.9719658306152249
 1_1 | min_df : 1 | C : 3 | score 0.9724444777955149
 1_1 | min_df : 1 | C : 4 | score 0.9724861533361214
 1_1 | min_df : 1 | C : 5 | score 0.9723599743278483
 1_1 | min_df : 1 | C : 6 | score 0.9721563647125038
 1_1 | min_df : 1 | C : 7 | score 0.9719207098983005
 1_1 | min_df : 1 | C : 8 | score 0.9716700684437893
 1_1 | min_df : 1 | C : 9 | score 0.9714164984194097
 1_1 | min_df : 1 | C : 10 | score 0.9711638107597086
 1_1 | min_df : 2 | C : 1 | score 0.9705129125254569
 1_1 | min_df : 2 | C : 2 | score 0.9724117162882567
 1_1 | min_df : 2 | C : 3 | score 0.9728091995235796
 1_1 | min_df : 2 | C : 4 | score 0.9727850172305511
 1_1 | min_df : 2 | C : 5 | score 0.972600343822549
 1_1 | min_df : 2 | C : 6 | score 0.9723475189510927
 1_1 | min_df : 2 | C : 7 | score 0.9720635978374856
 1_1 | min_df : 2 | C : 8 | score 0.9717673847862317
 1_1 | min_df : 2 | C : 9 | score 0.9714

 1_2 | min_df : 1 | C : 3 | score 0.9854934832375927
 1_2 | min_df : 1 | C : 4 | score 0.9854927031227275
 1_2 | min_df : 1 | C : 5 | score 0.9854609634767207
 1_2 | min_df : 1 | C : 6 | score 0.9854113060367578
 1_2 | min_df : 1 | C : 7 | score 0.9853606478047388
 1_2 | min_df : 1 | C : 8 | score 0.9853059267985097
 1_2 | min_df : 1 | C : 9 | score 0.9852504437410429
 1_2 | min_df : 1 | C : 10 | score 0.9851971042356098
 1_2 | min_df : 2 | C : 1 | score 0.9854193418621403
 1_2 | min_df : 2 | C : 2 | score 0.9856949424239883
 1_2 | min_df : 2 | C : 3 | score 0.9856735785128153
 1_2 | min_df : 2 | C : 4 | score 0.9855637373291297
 1_2 | min_df : 2 | C : 5 | score 0.9854309510920342
 1_2 | min_df : 2 | C : 6 | score 0.9852960564780062
 1_2 | min_df : 2 | C : 7 | score 0.985163087597772
 1_2 | min_df : 2 | C : 8 | score 0.9850414579573051
 1_2 | min_df : 2 | C : 9 | score 0.9849216414931243
 1_2 | min_df : 2 | C : 10 | score 0.9848061826642867
 1_2 | min_df : 3 | C : 1 | score 0.985602387

 1_1 | min_df : 1 | C : 2 | score 0.9873237000147496
 1_1 | min_df : 1 | C : 3 | score 0.9875360221793059
 1_1 | min_df : 1 | C : 4 | score 0.987436227646327
 1_1 | min_df : 1 | C : 5 | score 0.9872396146194223
 1_1 | min_df : 1 | C : 6 | score 0.9870079433830227
 1_1 | min_df : 1 | C : 7 | score 0.9867806085900787
 1_1 | min_df : 1 | C : 8 | score 0.9865593877104911
 1_1 | min_df : 1 | C : 9 | score 0.9863483219310393
 1_1 | min_df : 1 | C : 10 | score 0.9861512253750001
 1_1 | min_df : 2 | C : 1 | score 0.9864091196045809
 1_1 | min_df : 2 | C : 2 | score 0.9877091658830381
 1_1 | min_df : 2 | C : 3 | score 0.9878859581686467
 1_1 | min_df : 2 | C : 4 | score 0.9877458770357689
 1_1 | min_df : 2 | C : 5 | score 0.9875290212605364
 1_1 | min_df : 2 | C : 6 | score 0.9872735543126213
 1_1 | min_df : 2 | C : 7 | score 0.9870238585826564
 1_1 | min_df : 2 | C : 8 | score 0.9867769222054644
 1_1 | min_df : 2 | C : 9 | score 0.9865534124071728
 1_1 | min_df : 2 | C : 10 | score 0.986335872

 1_2 | min_df : 1 | C : 4 | score 0.9767366710581085
 1_2 | min_df : 1 | C : 5 | score 0.9770218739651932
 1_2 | min_df : 1 | C : 6 | score 0.9771865856882362
 1_2 | min_df : 1 | C : 7 | score 0.9772888269058774
 1_2 | min_df : 1 | C : 8 | score 0.977353937052052
 1_2 | min_df : 1 | C : 9 | score 0.9773976105644518
 1_2 | min_df : 1 | C : 10 | score 0.977424387193308
 1_2 | min_df : 2 | C : 1 | score 0.9741660685656837
 1_2 | min_df : 2 | C : 2 | score 0.976544043697503
 1_2 | min_df : 2 | C : 3 | score 0.9773146168384086
 1_2 | min_df : 2 | C : 4 | score 0.9776073810388284
 1_2 | min_df : 2 | C : 5 | score 0.9777039173283079
 1_2 | min_df : 2 | C : 6 | score 0.9777142887357403
 1_2 | min_df : 2 | C : 7 | score 0.9776828216633141
 1_2 | min_df : 2 | C : 8 | score 0.9776277565108632
 1_2 | min_df : 2 | C : 9 | score 0.9775636750240885
 1_2 | min_df : 2 | C : 10 | score 0.9774935076830354
 1_2 | min_df : 3 | C : 1 | score 0.9747641784984613
 1_2 | min_df : 3 | C : 2 | score 0.97693624069

In [261]:
for coment_clas in class_names:
    print(coment_clas)
    print(' '+best[coment_clas]['text'])
    print(' '+best[coment_clas+'2']['text'])
print('final score :' + str(np.array([max( best[coment_clas]['score'], best[coment_clas+'2']['score'] ) for coment_clas in class_names]).mean()) )

toxic
  1_1 | min_df : 3 | C : 3 | score 0.9728753001623875
  1_2 | min_df : 3 | C : 10 | score 0.9726354216119689
severe_toxic
  1_1 | min_df : 3 | C : 1 | score 0.9853974976199646
  1_2 | min_df : 3 | C : 2 | score 0.9857955224926819
obscene
  1_1 | min_df : 3 | C : 3 | score 0.9858564051634269
  1_2 | min_df : 3 | C : 8 | score 0.9846872502487439
threat
  1_1 | min_df : 4 | C : 3 | score 0.9880372501604714
  1_2 | min_df : 4 | C : 7 | score 0.9891102300948376
insult
  1_1 | min_df : 5 | C : 2 | score 0.978251315519493
  1_2 | min_df : 5 | C : 4 | score 0.9778522195042552
identity_hate
  1_1 | min_df : 3 | C : 2 | score 0.9759326156727735
  1_2 | min_df : 3 | C : 6 | score 0.9748431186551486
final score :0.9813035648509333


# The End