In [None]:
import numpy as np
import pandas as pd
import zipfile
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
# Path of the zipped training data; the archive is unpacked into the
# current working directory.
base_dir = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
with zipfile.ZipFile(base_dir) as archive:  # mode defaults to 'r'
    archive.extractall()

In [None]:
# Read train.csv from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Word-level TF-IDF features for the comment text.
tfidf_vec = TfidfVectorizer(max_features=30000, sublinear_tf=True,
                            strip_accents='unicode', analyzer='word',
                            ngram_range=(1, 1), token_pattern=r'\w{1,}',
                            stop_words='english')

# Fit the vocabulary and IDF weights on the training rows only. Fitting on the
# whole frame would let hold-out comments influence the features and make the
# evaluation scores optimistic.
# Without stratification, train_test_split chooses rows from the sample count
# and random_state alone, so splitting row positions with the same arguments
# as the later split of X and Y selects exactly the rows that end up in
# X_train.
# NOTE: keep test_size / random_state here in sync with that later call.
fit_rows = train_test_split(np.arange(len(df)), test_size=0.33, random_state=42)[0]
tfidf_vec.fit(df['comment_text'].iloc[fit_rows])

# Vectorise every comment (train and hold-out rows alike) with the vocabulary
# learned above; row order matches df.
comments_vec_train = tfidf_vec.transform(df['comment_text'])

In [None]:
# Targets: the six binary label columns of the training frame.
Y = df.loc[:, [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]]
# Features: the TF-IDF matrix built from the comment text.
X = comments_vec_train

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Six separate, identically configured L2-penalised logistic regressions.
clf1, clf2, clf3, clf4, clf5, clf6 = (
    LogisticRegression(penalty='l2', random_state=42) for _slot in range(6)
)

In [None]:
# Train each classifier on its own binary target column.
for model, target in (
    (clf1, 'toxic'),
    (clf2, 'severe_toxic'),
    (clf3, 'obscene'),
    (clf4, 'threat'),
    (clf5, 'insult'),
    (clf6, 'identity_hate'),
):
    model.fit(X_train, y_train[target])

# fit() returns the estimator itself, so ending on clf6 shows the same cell
# output as ending on the last fit() call.
clf6

LogisticRegression(random_state=42)

In [None]:
# Hard 0/1 predictions on the hold-out split, one array per label, produced
# by each classifier's default predict().
(
    predict_toxic,
    predict_severe_toxic,
    predict_obscene,
    predict_threat,
    predict_insult,
    predict_identity_hate,
) = (model.predict(X_test) for model in (clf1, clf2, clf3, clf4, clf5, clf6))

In [None]:
# Per-label F1 on the hold-out split, in the order: toxic, severe_toxic,
# obscene, threat, insult, identity_hate.
np.array([
    f1_score(y_test[label], predicted)
    for label, predicted in (
        ('toxic', predict_toxic),
        ('severe_toxic', predict_severe_toxic),
        ('obscene', predict_obscene),
        ('threat', predict_threat),
        ('insult', predict_insult),
        ('identity_hate', predict_identity_hate),
    )
])

array([0.71521739, 0.33608815, 0.73542601, 0.25945946, 0.60942358,
       0.24156306])

In [None]:
# Tune one decision threshold per label: scan a fixed grid of cut-offs and
# keep, for each label, the cut-off with the highest F1 on the hold-out split.
# Results: k[j] is the best threshold and max_score[j] the matching F1, in the
# order toxic, severe_toxic, obscene, threat, insult, identity_hate.
# NOTE(review): the thresholds are chosen on the same X_test / y_test that the
# scores are reported on, so max_score is an optimistic estimate; consider
# tuning on a separate validation split.
c = [0.05 * i for i in range(21)]  # candidate thresholds from 0.0 to 1.0

classifiers = [clf1, clf2, clf3, clf4, clf5, clf6]
label_columns = ['toxic', 'severe_toxic', 'obscene',
                 'threat', 'insult', 'identity_hate']

# The positive-class probabilities do not depend on the threshold, so compute
# them once per classifier instead of once per grid point.
positive_proba = [clf.predict_proba(X_test)[:, 1] for clf in classifiers]

k = [0] * len(classifiers)
max_score = [0] * len(classifiers)
for threshold in c:
    for j, (label, proba) in enumerate(zip(label_columns, positive_proba)):
        # Use a local array here so the predict_* arrays from the earlier
        # cell are not overwritten by the scan.
        thresholded = (proba > threshold) * 1
        # At threshold 1.0 nothing is predicted positive; zero_division=0
        # scores that case as 0 without emitting a warning.
        score = f1_score(y_test[label], thresholded, zero_division=0)
        # Strict '>' keeps the lowest threshold when several tie.
        if score > max_score[j]:
            k[j] = threshold
            max_score[j] = score

In [None]:
# First line: the chosen threshold for each class.
# Second line: the score for each class at its chosen threshold.
for per_class_values in (k, max_score):
    print(per_class_values)

[0.25, 0.1, 0.15000000000000002, 0.1, 0.15000000000000002, 0.1]
[0.7725462159125727, 0.47467876039304613, 0.8040340810293861, 0.4518518518518518, 0.7134317862165963, 0.44]
