In [16]:
# Some imports, we are not gong to use all the imports in this workbook but in subsequent workbooks we surely will.
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping


# from keras.layers import *
# from keras.models import *
# from keras import initializers, regularizers, constraints, optimizers, layers
# from keras.initializers import *
# from keras.optimizers import *
# import keras.backend as K
# from keras.callbacks import *
# import tensorflow as tf
# import os
# import time
# import gc
# import re
# import glob

In [2]:
hasoc_train = pd.read_csv('../input/subjectivity-mining/hasoc-train.csv', index_col=0)
olid_test = pd.read_csv('../input/subjectivity-mining/olid-test.csv', index_col=0)
olid_train_small = pd.read_csv('../input/subjectivity-mining/olid-train-small.csv', index_col=0)

### Data Preprocessing

In [3]:
# Define some Global Variables
max_features = 10000 # Maximum Number of words we want to include in our dictionary
maxlen = 60 # No of words in question we want to create a sequence with
embed_size = 300# Size of word to vec embedding we are using

In [4]:
# Some preprocesssing that will be common to all the text classification methods you will see. 
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

# Loading the data
def preproc(train_df, test_df):
    
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    train_df["text"] = train_df["text"].apply(lambda x: clean_text(x))
    test_df["text"] = test_df["text"].apply(lambda x: clean_text(x))
    
    ## split to train and val
    train_df, val_df = train_test_split(train_df, test_size=0.1)

    ## fill up the missing values
    train_X = train_df["text"].fillna("_##_").values
    val_X = val_df["text"].fillna("_##_").values
    test_X = test_df["text"].fillna("_##_").values

    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences. We need to pad the sequence with 0's to achieve consistent length across examples.
    '''
    We had train_X = [[1,2,4,3],[1,2,5,6,3]]
    lets say maxlen=6
        We will then get 
        train_X = [[1,2,4,3,0,0],[1,2,5,6,3,0]]
    '''
    train_X = pad_sequences(train_X, maxlen=maxlen)
    val_X = pad_sequences(val_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['labels'].values
    val_y = val_df['labels'].values  
    
    #shuffling the data
    trn_idx = np.random.permutation(len(train_X))
    val_idx = np.random.permutation(len(val_X))

    train_X = train_X[trn_idx]
    val_X = val_X[val_idx]
    train_y = train_y[trn_idx]
    val_y = val_y[val_idx]    
    
    return train_X, val_X, test_X, train_y, val_y, tokenizer.word_index

In [5]:
# Word 2 vec Embedding

def create_embeddings(word_index):
    '''We want to create an embedding matrix in which we keep only the word2vec for words which are in our word_index
    '''
    EMBEDDING_FILE = '../input/glove6b/glove.6B.300d.txt'

    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index)+1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix[:max_features]

In [6]:
# https://www.kaggle.com/yekenot/2dcnn-textclassifier
def model_cnn(embedding_matrix):
    filter_sizes = [1,2,3,5,8,12]
    num_filters = 36

    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Reshape((maxlen, embed_size, 1))(x)

    maxpool_pool = []
    for i in range(len(filter_sizes)):
        conv = Conv2D(num_filters, kernel_size=(filter_sizes[i], embed_size),
                                     kernel_initializer='he_normal', activation='elu')(x)
        maxpool_pool.append(MaxPool2D(pool_size=(maxlen - filter_sizes[i] + 1, 1))(conv))

    z = Concatenate(axis=1)(maxpool_pool)   
    z = Flatten()(z)
    z = Dropout(0.2)(z)

    outp = Dense(1, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [7]:
# BiDirectional LSTM

def model_bilstm(embedding_matrix):
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable=False)(inp)
    '''
    Here 64 is the size(dim) of the hidden state vector as well as the output vector. Keeping return_sequence we want the output for the entire sequence. So what is the dimension of output for this layer?
        64*70(maxlen)*2(bidirection concat)
    CuDNNLSTM is fast implementation of LSTM layer in Keras which only runs on GPU
    '''
    x = Bidirectional(CuDNNLSTM(200, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    conc = Dense(200, activation="relu")(conc)
    conc = Dropout(0.2)(conc)
    outp = Dense(1, activation="sigmoid")(conc)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [8]:
def train_pred(model, epochs=2):
    filepath="weights_best.h5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
    earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, verbose=2, mode='auto')
    callbacks = [checkpoint, reduce_lr]
    
    for e in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y),callbacks=callbacks)
        
    model.load_weights(filepath)
    pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    return pred_val_y, pred_test_y

In [9]:
'''
A function specific to this competition since the organizers don't want probabilities 
and only want 0/1 classification maximizing the F1 score. This function computes the best F1 score by looking at val set predictions
'''

def f1_smart(y_true, y_pred):
    thresholds = []
    for thresh in np.arange(0.3, 0.7, 0.01):
        thresh = np.round(thresh, 2)
        res = metrics.f1_score(y_true, (y_pred > thresh).astype(int), average='macro')
        thresholds.append([thresh, res])
#         print("F1 score at threshold {0} is {1}".format(thresh, res))

    thresholds.sort(key=lambda x: x[1], reverse=True)
    best_thresh = thresholds[0][0]
    best_f1 = thresholds[0][1]
    print("Best threshold: ", best_thresh)
    return  best_f1, best_thresh

# OLID small

## CNN

In [10]:
train_X, val_X, test_X, train_y, val_y, word_index = preproc(olid_train_small, olid_test)
embedding_matrix = create_embeddings(word_index)
model = model_cnn(embedding_matrix)
# model.summary()

Train shape :  (5852, 2)
Test shape :  (860, 2)


In [None]:
pred_val_y, pred_test_y = train_pred(model, epochs=25)

Train on 5266 samples, validate on 586 samples
Epoch 1/1

Epoch 00001: val_loss improved from inf to 0.66830, saving model to weights_best.h5
Train on 5266 samples, validate on 586 samples
Epoch 1/1

Epoch 00001: val_loss improved from 0.66830 to 0.59742, saving model to weights_best.h5
Train on 5266 samples, validate on 586 samples
Epoch 1/1

Epoch 00001: val_loss improved from 0.59742 to 0.54447, saving model to weights_best.h5
Train on 5266 samples, validate on 586 samples
Epoch 1/1

Epoch 00001: val_loss improved from 0.54447 to 0.50402, saving model to weights_best.h5
Train on 5266 samples, validate on 586 samples
Epoch 1/1

Epoch 00001: val_loss improved from 0.50402 to 0.49828, saving model to weights_best.h5
Train on 5266 samples, validate on 586 samples
Epoch 1/1

Epoch 00001: val_loss improved from 0.49828 to 0.47873, saving model to weights_best.h5
Train on 5266 samples, validate on 586 samples
Epoch 1/1

Epoch 00001: val_loss improved from 0.47873 to 0.47475, saving model t

In [None]:
f1, threshold = f1_smart(val_y, pred_val_y)
print('Optimal F1: {} at threshold: {}'.format(f1, threshold))

pred_test_y = (pred_test_y >threshold).astype(int)
olid_test['CNN_olid_small'] = pred_test_y
olid_test.to_csv("results.csv")

## BiLSTM

In [None]:
train_X, val_X, test_X, train_y, val_y, word_index = preproc(olid_train_small, olid_test)
embedding_matrix = create_embeddings(word_index)
model = model_bilstm(embedding_matrix)
# model.summary()

In [None]:
pred_val_y, pred_test_y = train_pred(model, epochs=25)

In [None]:
f1, threshold = f1_smart(val_y, pred_val_y)
print('Optimal F1: {} at threshold: {}'.format(f1, threshold))

pred_test_y = (pred_test_y >threshold).astype(int)
olid_test['BiLSTM_olid_small'] = pred_test_y
olid_test.to_csv("results.csv")

# HASOC

## CNN

In [None]:
train_X, val_X, test_X, train_y, val_y, word_index = preproc(hasoc_train, olid_test)
embedding_matrix = create_embeddings(word_index)
model = model_cnn(embedding_matrix)
# model.summary()

In [None]:
pred_val_y, pred_test_y = train_pred(model, epochs=25)

In [None]:
f1, threshold = f1_smart(val_y, pred_val_y)
print('Optimal F1: {} at threshold: {}'.format(f1, threshold))

pred_test_y = (pred_test_y >threshold).astype(int)
olid_test['CNN_hasoc'] = pred_test_y
olid_test.to_csv("results.csv")

## BiLSTM

In [None]:
train_X, val_X, test_X, train_y, val_y, word_index = preproc(hasoc_train, olid_test)
embedding_matrix = create_embeddings(word_index)
model = model_bilstm(embedding_matrix)
# model.summary()

In [None]:
pred_val_y, pred_test_y = train_pred(model, epochs=25)

In [None]:
f1, threshold = f1_smart(val_y, pred_val_y)
print('Optimal F1: {} at threshold: {}'.format(f1, threshold))

pred_test_y = (pred_test_y >threshold).astype(int)
olid_test['BiLSTM_hasoc'] = pred_test_y
olid_test.to_csv("results.csv")

## References

* CNN Notebook: https://www.kaggle.com/code/mlwhiz/learning-text-classification-textcnn/notebook
* BiLSTM Notebook: https://www.kaggle.com/code/mlwhiz/bilstm-pytorch-and-keras#b.-Runing-Keras-Model

* Based on SRK's kernel: https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings
* Vladimir Demidov's 2DCNN textClassifier: https://www.kaggle.com/yekenot/2dcnn-textclassifier
* Shujian's https://www.kaggle.com/shujian/fork-of-mix-of-nn-models