In [1]:
import sys
from zipfile import ZipFile
zipfile = '../input/keras-self/repository/CyberZHG-keras-self-attention-c66034d/'
sys.path.insert(0, zipfile)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from keras.layers import Dense, Input, Flatten, SpatialDropout1D, CuDNNLSTM, Bidirectional, Dropout, Embedding, Activation, Reshape, RepeatVector, LSTM, Bidirectional
from keras.models import Model, Sequential, load_model, save_model
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from keras_self_attention import SeqSelfAttention


Using TensorFlow backend.


In [3]:
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
embedding_file = r'../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

In [5]:
punct_mapping = {"_":" ", "`":" "}
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])    
    for p in punct:
        text = text.replace(p, ' ')     
    return text
test['comment_text'] = test['comment_text'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))
train['comment_text'] = train['comment_text'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))

In [6]:
stop_words = set(stopwords.words('english')) 
new_stop_words=set(stop_words)

for s in stop_words:
    new_stop_words.add(s.replace('\'',''))
    pass

stop_words=new_stop_words
print("Excluding stopwords ...")
base_filters='\n\t!"#$%&()*+,-./:;<=>?[\]^_`{|}~ '
def removeStopWords(comment):
   return ' '.join([word.lower() for word in comment.split() if word not in stop_words])
train['comment_text'] = train['comment_text'].apply(removeStopWords)
test['comment_text'] = test['comment_text'].apply(removeStopWords)

Excluding stopwords ...


In [7]:
import gc
gc.collect()
train['comment_text'].head()

0    this cool it like would want mother read reall...
1    thank this would make life lot less anxiety in...
2    this urgent design problem kudos taking very i...
3      is something i able install site when releasing
4                               haha guys bunch losers
Name: comment_text, dtype: object

In [8]:

lemmatizer = nltk.WordNetLemmatizer()

def lemmatize(text):
  return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

train['comment_text'] = train['comment_text'].apply(lemmatize)
test['comment_text'] = test['comment_text'].apply(lemmatize)

In [9]:
test.head()

Unnamed: 0,id,comment_text
0,7000000,jeff session another one trump orwellian choic...
1,7000001,i actually inspected infrastructure grand chie...
2,7000002,no that wishful thinking democrat fault for 10...
3,7000003,instead wringing hand nibbling periphery issue...
4,7000004,many commenters garbage piled high yard bald t...


In [10]:
train['comment_text'] = train['comment_text'].replace(np.nan, '')
test['comment_text'] = test['comment_text'].replace(np.nan, '')
train['comment_text'] = train['comment_text'].dropna(axis=0)
test['comment_text'] = test['comment_text'].dropna(axis=0)
x_train = train['comment_text'].astype(str)
x_test = test['comment_text'].astype(str)
y_rtain = train['target']

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(train['comment_text']) + list(test['comment_text']))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [12]:
x_train = pad_sequences(x_train, maxlen=50, padding='post')
x_test = pad_sequences(x_test, maxlen=50, padding='post')

In [13]:
def load_embedding_matrix(path, word_index):
  embeddings_index = {}
  import os
  f = open(path, 'r', encoding="utf-8")
  for line in f:
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, 'f', sep=' ')
    embeddings_index[word] = coefs
  f.close()
embedding_matrix = load_embedding_matrix(embedding_file, tokenizer.word_index)

In [14]:
max_num_words = len(tokenizer.word_index)
def getModel(embedding_matrix):
    max_length = 50
    model = Sequential()
    model.add(Embedding(max_num_words, 300, input_length=max_length))
    model.add(Bidirectional(LSTM(units=128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
    model.add(SeqSelfAttention(attention_activation='sigmoid'))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    return model

In [15]:
model = getModel(embedding_matrix)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           92542500  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 256)           439296    
_________________________________________________________________
seq_self_attention_1 (SeqSel (None, 50, 256)           16449     
_________________________________________________________________
flatten_1 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 12801     
Total params: 93,011,046
Trainable params: 93,011,046
Non-trainable params: 0
_________

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_rtain, test_size=0.24)

In [17]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
def convertToClassification(target):
    if target>0.5:
        target = 1
    else:
        target = 0
    return target
Y_train = Y_train.apply(convertToClassification)
Y_test = Y_test.apply(convertToClassification)
model.fit(X_train, Y_train, epochs=2)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/2
 173312/1371704 [==>...........................] - ETA: 2:07:57 - loss: 0.1300 - acc: 0.9574

In [19]:
predictions = model.predict(x_test, verbose=1)
predictions = pd.DataFrame(predictions, columns=['prediction'])
submission = pd.DataFrame([test['id'], predictions], columns = ['id', 'prediction'])
submission.to_csv('submission.csv', index=False)



ValueError: all the input array dimensions except for the concatenation axis must match exactly