In [1]:
path=r'C:/Users/lenovo/Desktop/[DM] Group Project/Data-Mining-Project-master/dataset/'

In [2]:
import pandas as pd
import numpy as np

In [67]:
train = pd.read_csv(path+"train.csv")
test = pd.read_csv(path+"test.csv")
labels = pd.read_csv(path+"test_labels.csv")

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
train.isnull().any(),test.isnull().any()

(id               False
 comment_text     False
 toxic            False
 severe_toxic     False
 obscene          False
 threat           False
 insult           False
 identity_hate    False
 dtype: bool, id              False
 comment_text    False
 dtype: bool)

In [6]:
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[classes].values
list_sentences_train = train["comment_text"]
list_sentences_test = test["comment_text"]

In [7]:
from keras.preprocessing.text import Tokenizer

In [8]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [9]:
from keras.preprocessing.sequence import pad_sequences

In [10]:
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [11]:
gl_path = '/content/drive/My Drive/Colab_data/Data_Mining/Data-Mining-Project/embeddings/glove.twitter.27B.25d.txt'
ft_path = '/content/drive/My Drive/Colab_data/Data_Mining/Data-Mining-Project/embeddings/wiki.simple.vec'
wv_path = '/content/drive/My Drive/Colab_data/Data_Mining/Data-Mining-Project/embeddings/GoogleNews-vectors-negative300.bin'

In [12]:
import gensim.models.keyedvectors as word2vec
import gc

In [32]:
def loadEmbeddingMatrix(typeToLoad):
        #load different embedding file from Kaggle depending on which embedding 
        #matrix we are going to experiment with
        if(typeToLoad=="glove"):
            EMBEDDING_FILE=gl_path
            embed_size = 25
        elif(typeToLoad=="word2vec"):
            word2vecDict = word2vec.KeyedVectors.load_word2vec_format(wv_path, binary=True)
            embed_size = 300
        elif(typeToLoad=="fasttext"):
            EMBEDDING_FILE=ft_path
            embed_size = 300

        if(typeToLoad=="glove" or typeToLoad=="fasttext" ):
            embeddings_index = dict()
            #Transfer the embedding weights into a dictionary by iterating through every line of the file.
            f = open(EMBEDDING_FILE)
            for line in f:
                #split up line into an indexed array
                values = line.split()
                #first index is word
                word = values[0]
                #store the rest of the values in the array as a new array
                try:
                  coefs = np.asarray(values[1:], dtype='float32')
                except:
                  continue
                if len(coefs) != 300:
                  continue
                embeddings_index[word] = coefs #50 dimensions
                
            f.close()
            print('Loaded %s word vectors.' % len(embeddings_index))
        else:
            embeddings_index = dict()
            for word in word2vecDict.wv.vocab:
                embeddings_index[word] = word2vecDict.word_vec(word)
            print('Loaded %s word vectors.' % len(embeddings_index))
            
        gc.collect()
        #We get the mean and standard deviation of the embedding weights so that we could maintain the 
        #same statistics for the rest of our own random generated weights. 
        all_embs = np.stack(list(embeddings_index.values()))
        emb_mean,emb_std = all_embs.mean(), all_embs.std()
        
        nb_words = len(tokenizer.word_index)
        #We are going to set the embedding size to the pretrained dimension as we are replicating it.
        #the size will be Number of Words in Vocab X Embedding Size
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
        gc.collect()

        #With the newly created embedding matrix, we'll fill it up with the words that we have in both 
        #our own dictionary and loaded pretrained embedding. 
        embeddedCount = 0
        for word, i in tokenizer.word_index.items():
            i-=1
            #then we see if this word is in glove's dictionary, if yes, get the corresponding weights
            embedding_vector = embeddings_index.get(word)
            #and store inside the embedding matrix that we will train later on.
            if embedding_vector is not None: 
                embedding_matrix[i] = embedding_vector
                embeddedCount+=1
        print('total embedded:',embeddedCount,'common words')
        
        del(embeddings_index)
        gc.collect()
        
        #finally, return the embedding matrix
        return embedding_matrix

In [33]:
embedding_matrix = loadEmbeddingMatrix('fasttext')

Loaded 110995 word vectors.
total embedded: 59312 common words


In [34]:
embedding_matrix.shape

(221341, 300)

In [36]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D,Bidirectional
from keras.models import Model

In [37]:
inp = Input(shape=(maxlen, ))

In [39]:
x = Embedding(len(tokenizer.word_index), embedding_matrix.shape[1],weights=[embedding_matrix],trainable=False)(inp)
x = Bidirectional(LSTM(60, return_sequences=True,name='lstm_layer',dropout=0.1,recurrent_dropout=0.1))(x)



In [40]:
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)

In [51]:
import keras.metrics as metrics

In [52]:
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[
                          metrics.MeanSquaredError(),
                          metrics.AUC(),
                  ])

In [53]:
model.summary()

Model: "functional_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 300)          66402300  
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 120)          173280    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dropout (Dropout)            (None, 120)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                6050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)              

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_t,y,test_size=0.2,random_state=1) 

In [59]:
batch_size = 128
epochs = 2
hist = model.fit(X_train,y_train, batch_size=batch_size, epochs=epochs)

Epoch 1/2
Epoch 2/2


In [63]:
preds_train = model.predict(X_train)

In [64]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

print(roc_auc_score(y_train, preds_train))


0.9749202734628901


In [65]:
preds_val = model.predict(X_val)
print(roc_auc_score(y_val, preds_val))


0.9693278025347044


In [68]:
labels = labels[classes]
sum_labels=np.sum(labels.values,axis=1)
# print(sum_labels)
idx=sum_labels>=0
y_test = labels[idx]
X_test = X_te[idx]

In [69]:
preds_test = model.predict(X_test)

In [70]:
print(roc_auc_score(y_test, preds_test))    #test accuracy for 'fasttext'

0.9597581026908394
