In [1]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import pickle

In [11]:
from scipy.sparse import hstack
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split

In [64]:
import tensorflow as tf
from gensim.models import word2vec
import multiprocessing

In [63]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [52]:
from keras.models import Sequential
from keras.layers import SpatialDropout1D
from keras.layers import Dense, Input, LSTM, Embedding, Activation
from keras.layers import concatenate, Activation, Dropout
from keras.layers.convolutional import Conv1D
from keras.layers import MaxPooling1D, GlobalMaxPooling1D, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam, Nadam

In [71]:
from keras.callbacks import Callback
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [65]:
from sklearn.neural_network import MLPClassifier

In [2]:
train = pd.read_csv("data/train.csv")
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,"Gaelic translation \n\nHi, don't suppose you c...",0,0,0,0,0,0
1,1,Hey dick \nYou don't know what is copyright so...,1,0,1,0,1,0
2,2,"""\n\nAm I correct in thinking that you are ref...",0,0,0,0,0,0
3,3,== Notable Alumni == \n\n Why was this section...,0,0,0,0,0,0
4,4,I have already been sent this message about va...,1,0,1,0,1,0
5,5,== exuse me == \n\n i was not attacking someon...,0,0,0,0,0,0
6,6,":You're welcome, Vernon39! Thanks for your imp...",0,0,0,0,0,0
7,7,"Good point by JonC , legally the name of the s...",0,0,0,0,0,0
8,8,"""\nNo such """"compromise"""" was reached on Septe...",0,0,0,0,0,0
9,9,":::Again, we can't take the unverifiable claim...",0,0,0,0,0,0


In [3]:
test = pd.read_csv("data/test.csv")
test.head(10)

Unnamed: 0,id,comment_text
0,150000,Dmacks consitatnly edits other peoples pages a...
1,150001,== Contested deletion == \n\n This article sho...
2,150002,March 2008\n Please stop. If you continue to v...
3,150003,"""}\n\nAugust Esperanza Newsletter\n{| style=""""..."
4,150004,"William Sledd\n\nOK, it is time We Tubers had ..."
5,150005,"""\n\n Please apologize: you made a clear mista..."
6,150006,What makes and egg crack? \n FORCE!!!!!!!!! \n...
7,150007,""" \n :::::No, your comparison is invalid. Your..."
8,150008,"The Tree in a Test Tube, 1942 (full).ogv|The T..."
9,150009,:::I should have listened to your advice. Alan...


In [4]:
train_text = train['comment_text']
test_text = test['comment_text']

all_text = pd.concat([train_text, test_text])

In [5]:
target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [6]:
y = train[target_labels]

### Preprocessing

In [7]:
def cleanImages(sentence):
    cleanr = re.compile(r'\w+\.(jpg|png)')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

In [8]:
def cleanLinks(sentence):
    cleanr = re.compile('((www\.[^\s]+)|(https?://[^\s]+))')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

In [9]:
def cleanHtml(sentence):
    cleanr = re.compile(r'\|(.*?)\r\n')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

In [10]:
def full_words(sentence):
    repl = {
        r'(I|i)\'m': 'i am',
        r'(\w+)\'re': '\g<1> are',
        r'(\w+)\'d': '\g<1> would',
        r'\bwon\'t\b': 'will not',
        r'(\w+)n\'t': '\g<1> not',
        r'\bcannot\b': 'can not',
        r'(\w+)\'ll': '\g<1> will',
        r'(\w+)\'s': '\g<1> is'
    }

    cleaned = str(sentence)
    for i in repl.keys():
        cleanr = re.compile(i)
        cleaned = re.sub(cleanr, repl[i], cleaned)
    return cleaned

In [11]:
def cleanStopWords(sentence):
    stop_words = stopwords.words('english')
    
    re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
    cleaned = re_stop_words.sub(" ", str(sentence))
    return cleaned

In [12]:
def cleanPunc(sentence):
    to_exclude = string.punctuation + "–" + string.digits + "—" + "•"
    cleaned = re.sub('[%s]' % re.escape(to_exclude), '', str(sentence))
    return cleaned

In [13]:
def cleanSpaces(sentence):
    clean = sentence.replace('\n', ' ')
    clean = clean.replace('\t', ' ')
    clean = clean.replace('\r', ' ')
    
    clean = re.sub('\s+', ' ', clean)
    clean = re.sub('\s+$', '', clean)
        
    return clean

In [14]:
def cleanText(text):
    text = text.str.lower()
    text = text.apply(cleanHtml)
    text = text.apply(cleanLinks)
    text = text.apply(cleanImages)
    text = text.apply(full_words)
    text = text.apply(cleanStopWords)
    text = text.apply(cleanPunc)
    cleaned_text = text.apply(cleanSpaces)
    
    return cleaned_text  

In [15]:
all_text = cleanText(all_text)

In [7]:
def text_to_words(sentence):    
    words = word_tokenize(sentence)
    return words

In [17]:
tokenized_comments = all_text.apply(text_to_words)

### Приведение слов к начальной форме

In [18]:
def posTagging(sentence):
    tagged = nltk.pos_tag(sentence)
    return tagged

In [19]:
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']

def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD']

def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']

def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

In [20]:
def lemmatizeComment(sentence): 
    wordnet_lemmatizer = WordNetLemmatizer()

    words = []
    for word in sentence:
        if is_adjective(word[1]):
            words.append(wordnet_lemmatizer.lemmatize(word[0], pos=wn.ADJ))
        elif is_noun(word[1]):
            words.append(wordnet_lemmatizer.lemmatize(word[0], pos=wn.NOUN))
        elif is_adverb(word[1]):
            words.append(wordnet_lemmatizer.lemmatize(word[0], pos=wn.ADV))
        elif is_verb(word[1]):
            words.append(wordnet_lemmatizer.lemmatize(word[0], pos=wn.VERB))
    return words      

In [21]:
def lemmatize(text):
    tagged_text = text.apply(posTagging)
    lemmatized = tagged_text.apply(lemmatizeComment)
    
    return lemmatized

In [22]:
tokenized_comments = lemmatize(tokenized_comments)

In [24]:
all_text = [" ".join(comment) for comment in tokenized_comments]

In [26]:
clean_train_text = all_text[:len(train_text)]
clean_test_text = all_text[len(train_text):]

In [123]:
train['clean_comment_text'] = pd.Series(clean_train_text, index=train.index)
train.to_csv('clean_train.csv', index=False)

In [124]:
test['clean_comment_text'] = pd.Series(clean_test_text, index=test.index)
test.to_csv('clean_test.csv', index=False)

### Load clean comments

In [7]:
train_text = pd.read_csv('clean_train.csv')
train_text['clean_comment_text'] = train_text['clean_comment_text'].astype(str)

test_text = pd.read_csv('clean_test.csv')
test_text['clean_comment_text'] = test_text['clean_comment_text'].astype(str)

In [8]:
clean_train_text = train_text['clean_comment_text'].values
clean_test_text = test_text['clean_comment_text'].values

In [9]:
clean_train_text

array(['gaelic translation hi suppose could help gaelic translation thanks advance able',
       'hey dick know copyright shutup get write stupid bullshit talk page dhivehi language dhivehi language bullshit nonsese',
       'correct think refer shatter vessel first discuss arizal mention might possible expand',
       ...,
       'see detail previous battle firs obviously seriously difficult history revenge block talk contribution',
       'good idea sure great article end',
       'princess irene duchess parma husband carlo hugo duke parma duchess parma article even mention title'],
      dtype=object)

### TfidfVectorization

In [10]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=50000)
train_word_features = word_vectorizer.fit_transform(clean_train_text)
print('Word TFIDF 1/2')
test_word_features = word_vectorizer.transform(clean_test_text)
print('Word TFIDF 2/2')

Word TFIDF 1/2
Word TFIDF 2/2


In [23]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 6),
    max_features=30000)
char_vectorizer.fit(clean_train_text)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [24]:
train_char_features = char_vectorizer.transform(clean_train_text)
test_char_features = char_vectorizer.transform(clean_test_text)

In [27]:
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [44]:
sparse.save_npz('train_features.npz', train_features)

In [45]:
sparse.save_npz('test_features.npz', test_features)

In [None]:
# del train_word_features
# del train_char_features

# del test_word_features
# del test_char_features

In [13]:
train_features = sparse.load_npz('train_features.npz')
test_features = sparse.load_npz('test_features.npz')

### Logistic regression

In [31]:
models = []
avg_auc = 0
for class_ in target_labels:
    print(class_)
    logreg = LogisticRegression(solver='sag', penalty='l2')
    logreg.fit(train_features, y[class_])
    models.append(logreg)
    prediction = logreg.predict(train_features)
    actual = y[class_]
    AUC = roc_auc_score(actual,prediction)
    print(AUC)
    avg_auc = avg_auc + AUC

toxic
0.837216851023329
severe_toxic
0.6637535198426912
obscene
0.8604099854334917
threat
0.608337454571054
insult
0.8124256959914072
identity_hate
0.6903914701382333


In [32]:
print('Mean AUC = {}'.format(np.round(float(avg_auc)/6.0,2)))

Mean AUC = 0.75


In [15]:
modelsTrees = []
losses = []
for i, label in enumerate(target_labels):
    print(label)
    train_target = y[label]
    classifier = ExtraTreesClassifier(n_estimators=30)
    cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(label, cv_loss))
    classifier.fit(train_features, train_target)
    modelsTrees.append(classifier)

toxic
CV score for class toxic is 0.9400923548136658
severe_toxic
CV score for class severe_toxic is 0.9291451986192852
obscene
CV score for class obscene is 0.969619027585356
threat
CV score for class threat is 0.8374413089538898
insult
CV score for class insult is 0.9505633646203475
identity_hate
CV score for class identity_hate is 0.9112287530264432


In [18]:
for i, label in enumerate(target_labels):
    with open('models/trees/model_' + label + '.pkl', 'wb') as f:
                pickle.dump(modelsTrees[i], f)

In [29]:
sub_ids = pd.DataFrame.from_dict({'id': test['id']})
preds = np.zeros((len(test), len(target_labels)))
submission = pd.concat([sub_ids, pd.DataFrame(preds, columns = target_labels)], axis=1)

In [30]:
for i, label in enumerate(target_labels):
    with open('models/trees/model_' + label + '.pkl', 'rb') as f:
        loaded_model = pickle.load(f)
    
    submission[label] = loaded_model.predict_proba(test_features)[:,1]

In [32]:
submission.to_csv('submission_2.csv', index=False)

### Classifying LSA vectors

In [33]:
lsa = TruncatedSVD(n_components=1000)
X_train_lsa = lsa.fit_transform(train_features)

In [34]:
X_test_lsa = lsa.transform(test_features)

In [35]:
modelsLgR = []
avg_auc = 0
for class_ in target_labels:
    print(class_)
    logreg = LogisticRegression(solver='sag', penalty='l2')
    logreg.fit(X_train_lsa, y[class_])
    modelsLgR.append(logreg)
    prediction = logreg.predict(X_train_lsa)
    actual = y[class_]
    AUC = roc_auc_score(actual,prediction)
    print(AUC)
    avg_auc = avg_auc + AUC

toxic
0.796306872495886
severe_toxic
0.6180355086776136
obscene
0.8292850122084928
threat
0.5724944781281074
insult
0.7708518451682986
identity_hate
0.6493205330488676


### Word2vec

In [39]:
num_features = 200
min_word_count = 1
num_workers = multiprocessing.cpu_count()
context_size = 7
downsampling = 1e-3
seed = 1

In [40]:
word2vec_ = word2vec.Word2Vec(
    sg = 1, seed = seed,
    workers = num_workers,
    size = num_features,
    min_count = min_word_count,
    window = context_size,
    sample = downsampling
)

In [379]:
tokenized_train_text = tokenized_comments[:len(train_text)]
tokenized_test_text = tokenized_comments[len(train_text):]

In [41]:
#если загрузили очищенные комменты tokenized_train_text/tokenized_test_text не определены, поэтому
# tokenized_train_text = train_text['clean_comment_text'].apply(text_to_words)
# tokenized_test_text = test_text['clean_comment_text'].apply(text_to_words)

In [42]:
word2vec_.build_vocab(tokenized_train_text.values)

In [43]:
len(word2vec_.wv.vocab.items())

211442

In [383]:
word2vec_.train(tokenized_train_text, total_examples = word2vec_.corpus_count, epochs=100, compute_loss=True)

(461178910, 483992400)

In [511]:
word2vec_.save("comments_model.w2v")

### Load word2vec

In [11]:
w2v = word2vec.Word2Vec.load('comments_model.w2v')

In [14]:
tokenizer = Tokenizer(num_words=None)

tokenizer.fit_on_texts(clean_train_text)
tokenized_train = tokenizer.texts_to_sequences(clean_train_text)
tokenized_test = tokenizer.texts_to_sequences(clean_test_text)
word_index = tokenizer.word_index

In [15]:
NUM = len(word_index)
print('Vocab size: {}'.format(NUM))
longest = max(len(seq) for seq in tokenized_train)
print("Longest comment size: {}".format(longest))
average = np.mean([len(seq) for seq in tokenized_train])
print("Average comment size: {}".format(average))
stdev = np.std([len(seq) for seq in tokenized_train])
print("Stdev of comment size: {}".format(stdev))
SENTENCE_LENGTH = int(average + stdev * 3)
print('Max comment size: {}'.format(SENTENCE_LENGTH))

Vocab size: 211442
Longest comment size: 1250
Average comment size: 32.231946666666666
Stdev of comment size: 52.24658164968456
Max comment size: 188


In [16]:
processed_X_train = pad_sequences(tokenized_train, maxlen=SENTENCE_LENGTH, padding='post', truncating='post')
processed_X_test = pad_sequences(tokenized_test, maxlen=SENTENCE_LENGTH, padding='post', truncating='post')

### Embedding

In [17]:
DIM = w2v.vector_size 

embedding_matrix = np.zeros((NUM+1, DIM))
for word, i in tokenizer.word_index.items():
    if i >= NUM:
        break
    if word in w2v.wv.vocab.keys():
        embedding_matrix[i] = w2v.wv[word]

### RNN model

In [18]:
model = Sequential()

model.add(Embedding(NUM+1, DIM, weights=[embedding_matrix], input_length=SENTENCE_LENGTH, trainable=True))

model.add(LSTM(60, return_sequences=True, name='lstm_layer'))
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(3))
model.add(GlobalMaxPooling1D())
model.add(BatchNormalization())

model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(6, activation='sigmoid'))

In [21]:
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [60]:
class RocAucEvaluation(Callback):
    def __init__(self, filepath, validation_data=(), interval=1, max_epoch = 100):
        super(Callback, self).__init__()
        print("After init")
        self.interval = interval
        self.filepath = filepath
        self.stopped_epoch = max_epoch
        self.best = 0
        self.X_val, self.y_val = validation_data
        self.y_pred = np.zeros(self.y_val.shape)

    def on_epoch_end(self, epoch, logs={}):
        print("Epoch end 1")
        if epoch % self.interval == 0:
            y_pred = self.model.predict_proba(self.X_val, verbose=0)
            current = roc_auc_score(self.y_val, y_pred)
            logs['roc_auc_val'] = current

            if current > self.best:
                print(" - AUC - improved from {:.5f} to {:.5f}".format(self.best, current))
                self.best = current
                self.y_pred = y_pred
                self.stopped_epoch = epoch+1
                self.model.save(self.filepath, overwrite=True)
            else:
                print(" - AUC - did not improve")

In [26]:
opt = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',optimizer=opt,metrics=['accuracy', auc])

early_stop = EarlyStopping(monitor="roc_auc_val", mode="max", patience=2)

x_train, x_test, y_train, y_test = train_test_split(processed_X_train, y, test_size = 0.03, random_state = 144)

rocAuc = RocAucEvaluation(filepath='models/lstm/model.best.hdf5', validation_data=(x_test, y_test), interval=1)
hist_adam = model.fit(x_train, y_train, batch_size=64, epochs=1, validation_data=(x_test, y_test),
         callbacks=[rocAuc, early_stop])

After init
Train on 145500 samples, validate on 4500 samples
Epoch 1/1
Epoch end 1
 - AUC - improved from 0.00000 to 0.97330


In [31]:
predictions = model.predict(processed_X_test, verbose=0)

In [32]:
sub_ids = pd.DataFrame.from_dict({'id': test['id']})
preds = np.zeros((len(test), len(target_labels)))
submission = pd.concat([sub_ids, pd.DataFrame(preds, columns = target_labels)], axis=1)

In [35]:
for i in range(0, 6):
    submission[target_labels[i]] = predictions[:,i]

In [36]:
submission.to_csv('submission_3.csv', index=False)

### CNN model

In [38]:
model = Sequential()

model.add(Embedding(NUM + 1, DIM, weights=[embedding_matrix], input_length=SENTENCE_LENGTH, trainable=True))

model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(3))
model.add(GlobalMaxPooling1D())
model.add(BatchNormalization())

model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(6, activation='sigmoid'))

In [39]:
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', auc])

rocAuc = RocAucEvaluation(filepath='models/cnn/model.best.hdf5', validation_data=(x_test, y_test), interval=1)
hist_adam = model.fit(x_train, y_train, batch_size=64, epochs=1, validation_data=(x_test, y_test),
         callbacks=[rocAuc, early_stop])

After init
Train on 145500 samples, validate on 4500 samples
Epoch 1/1
Epoch end 1
 - AUC - improved from 0.00000 to 0.97837


In [41]:
predictions = model.predict(processed_X_test, verbose=0)

In [42]:
sub_ids = pd.DataFrame.from_dict({'id': test['id']})
preds = np.zeros((len(test), len(target_labels)))
submission = pd.concat([sub_ids, pd.DataFrame(preds, columns = target_labels)], axis=1)

In [43]:
for i in range(0, 6):
    submission[target_labels[i]] = predictions[:,i]

In [44]:
submission.to_csv('submission_4.csv', index=False)

### Get features

In [34]:
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    
    index2word_set = set(model.wv.index2word)
    
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[word])
    if nwords == 0:
        nwords = 1
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [35]:
def getAvgFeatureVecs(reviews, model, num_features):
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    counter = 0
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1
    return reviewFeatureVecs

In [36]:
tokenized_train_text = tokenized_comments[:len(train_text)]
tokenized_test_text = tokenized_comments[len(train_text):]

In [37]:
num_features = 200

In [38]:
f_matrix_train = getAvgFeatureVecs(tokenized_train_text, w2v, num_features)

  app.launch_new_instance()


In [60]:
matrix_train = pd.DataFrame(f_matrix_train)
matrix_train.to_csv('f_matrix_train.csv', index = False)

In [84]:
f_matrix_test = getAvgFeatureVecs(tokenized_test_text, w2v, num_features)

  app.launch_new_instance()


In [85]:
matrix_test = pd.DataFrame(f_matrix_test)
matrix_test.to_csv('f_matrix_test.csv', index = False)

### MLP classification

In [144]:
# f_matrix_train = pd.read_csv('f_matrix_train.csv')

In [145]:
# f_matrix_train = np.array(f_matrix_train)

In [87]:
modelsMLP = []
for i in range(0, 6):
    m = MLPClassifier(solver='adam', hidden_layer_sizes=(30,30,30), random_state=1)
    modelsMLP.append(m)
print(modelsMLP)

[MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False), MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False), MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
   

In [92]:
y_true = y[0]
print('...Processing {}'.format(target_labels[0]))
modelsMLP[0].fit(f_matrix_train, y_true)

...Processing toxic




MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [93]:
predictions = modelsMLP[0].predict(f_matrix_train)
print("Accuracy = ", roc_auc_score(y_true, predictions))

Accuracy =  0.9288817567184795


In [95]:
import pickle

In [96]:
with open('models/mlp/model_' + target_labels[0] + '.pkl', 'wb') as f:
                pickle.dump(modelsMLP[0], f)

In [140]:
y_true = y[1]
print('...Processing {}'.format(target_labels[1]))
modelsMLP[1].fit(f_matrix_train, y_true)

...Processing severe_toxic


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [141]:
with open('models/mlp/model_' + target_labels[1] + '.pkl', 'wb') as f:
                pickle.dump(modelsMLP[1], f)

In [142]:
predictions = modelsMLP[1].predict(f_matrix_train)
print("Accuracy = ", roc_auc_score(y_true, predictions))

Accuracy =  0.9448461525625697


In [143]:
submission[target_labels[1]] = modelsMLP[1].predict_proba(f_matrix_test)[:,1]

In [69]:
y_true = y[2]
print('...Processing {}'.format(target_labels[2]))
modelsMLP[2].fit(f_matrix_train, y_true)

...Processing obscene


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [72]:
with open('models/mlp/model_' + target_labels[2] + '.pkl', 'wb') as f:
                pickle.dump(modelsMLP[2], f)

In [73]:
predictions = modelsMLP[2].predict(f_matrix_train)
print("Accuracy = ", roc_auc_score(y_true, predictions))

Accuracy =  0.9748833856975384


In [74]:
y_true = y[3]
print('...Processing {}'.format(target_labels[3]))
modelsMLP[3].fit(f_matrix_train, y_true)

with open('models/mlp/model_' + target_labels[3] + '.pkl', 'wb') as f:
                pickle.dump(modelsMLP[3], f)

...Processing threat


In [75]:
predictions = modelsMLP[3].predict(f_matrix_train)
print("Accuracy = ", roc_auc_score(y_true, predictions))

Accuracy =  0.9872981986817906


In [76]:
y_true = y[4]
print('...Processing {}'.format(target_labels[4]))
modelsMLP[4].fit(f_matrix_train, y_true)

with open('models/mlp/model_' + target_labels[4] + '.pkl', 'wb') as f:
                pickle.dump(modelsMLP[4], f)

...Processing insult


In [77]:
predictions = modelsMLP[4].predict(f_matrix_train)
print("Accuracy = ", roc_auc_score(y_true, predictions))

Accuracy =  0.9396339576048516


In [78]:
y_true = y[5]
print('...Processing {}'.format(target_labels[5]))
modelsMLP[5].fit(f_matrix_train, y_true)

with open('models/mlp/model_' + target_labels[5] + '.pkl', 'wb') as f:
                pickle.dump(modelsMLP[5], f)

...Processing identity_hate


In [79]:
predictions = modelsMLP[5].predict(f_matrix_train)
print("Accuracy = ", roc_auc_score(y_true, predictions))

Accuracy =  0.9846426212979155


In [106]:
sub_ids = pd.DataFrame.from_dict({'id': test['id']})
preds = np.zeros((len(test), len(target_labels)))
submission = pd.concat([sub_ids, pd.DataFrame(preds, columns = target_labels)], axis=1)

In [108]:
for i in range(0, 6):
    with open('models/mlp/model_' + target_labels[i] + '.pkl', 'rb') as f:
        loaded_model = pickle.load(f)
    
    submission[target_labels[i]] = loaded_model.predict_proba(f_matrix_test)[:,1]

In [112]:
submission.to_csv('submission.csv', index=False)

In [113]:
sub = pd.read_csv("submission.csv")
sub.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,150000,0.964798,8.90759e-16,0.02412191,5.291121e-32,0.5717639,1.400704e-23
1,150001,6e-06,2.531597e-20,6.22066e-14,1.604982e-11,2.036368e-06,1.424178e-16
2,150002,2.8e-05,3.566068e-15,1.639609e-27,1.121994e-12,5.423424e-09,1.0993600000000001e-17
3,150003,0.002818,8.993397e-16,1.203725e-08,1.251981e-28,4.773208e-11,1.9767810000000002e-17
4,150004,0.007929,1.058188e-18,3.846464e-06,1.821899e-23,3.793919e-07,3.710725e-15
5,150005,0.002642,2.398821e-21,9.33511e-09,3.107786e-36,6.483433e-06,1.569068e-15
6,150006,0.997307,0.0001804892,0.7921221,2.240715e-10,0.6105005,1.196878e-08
7,150007,0.202951,2.663867e-08,0.007247397,2.643507e-14,0.005291487,3.543002e-10
8,150008,4.4e-05,2.9128060000000003e-33,2.602724e-08,5.8343169999999995e-30,8.980805000000001e-23,1.927979e-47
9,150009,0.049743,1.600293e-10,1.914783e-07,3.821856e-10,0.0003654804,5.203025e-18
