In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
df1 = pd.read_table("assignment/1.txt", header=None)
df2 = pd.read_table('assignment/2.txt', header=None)
df3 = pd.read_csv('assignment/3.txt')

  """Entry point for launching an IPython kernel.
  


In [3]:
df1.columns = ['Text', 'Prediction']
df2.columns = ['Prediction', 'Text']
df2 = df2[['Text', 'Prediction']]

In [4]:
train_df = df1.append(df2, ignore_index=True)
test_df = df3[['Text', 'Prediction']]

In [5]:
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=2018)

In [48]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 10000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

In [49]:
## fill up the missing values
train_X = train_df["Text"].fillna("_na_").values
val_X = val_df["Text"].fillna("_na_").values
test_X = test_df["Text"].fillna("_na_").values

In [50]:
#make tokenizers
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['Prediction'].values
val_y = val_df['Prediction'].values

In [51]:
#without pretrained embeddings
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 17        
Total para

In [52]:
## Train the model 
model.fit(train_X, train_y, batch_size=512, epochs=3, validation_data=(val_X, val_y))

Train on 6094 samples, validate on 1524 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x12b09aa2048>

In [53]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)



In [54]:
for thresh in np.arange(0.4, 0.601, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_noemb_val_y>thresh).astype(int))))

F1 score at threshold 0.4 is 0.9456210646823126
F1 score at threshold 0.41 is 0.946704871060172
F1 score at threshold 0.42 is 0.9482758620689655
F1 score at threshold 0.43 is 0.9487031700288183
F1 score at threshold 0.44 is 0.9486439699942297
F1 score at threshold 0.45 is 0.9485251590514749
F1 score at threshold 0.46 is 0.9495652173913044
F1 score at threshold 0.47 is 0.9493891797556718
F1 score at threshold 0.48 is 0.9487179487179487
F1 score at threshold 0.49 is 0.9467524868344059
F1 score at threshold 0.5 is 0.9467524868344059
F1 score at threshold 0.51 is 0.9466900995899238
F1 score at threshold 0.52 is 0.9471830985915493
F1 score at threshold 0.53 is 0.9452619187757504
F1 score at threshold 0.54 is 0.9469339622641509
F1 score at threshold 0.55 is 0.9485511531638083
F1 score at threshold 0.56 is 0.9478672985781991
F1 score at threshold 0.57 is 0.9477434679334916
F1 score at threshold 0.58 is 0.9476813317479192
F1 score at threshold 0.59 is 0.9462365591397849
F1 score at threshold 0

In [55]:
pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [14]:
del model, inp, x
import gc; gc.collect()
time.sleep(5)

In [57]:
#using Glove embeddings

EMBEDDING_FILE = r'E:\DATA SETS & compressed and weights\glove.6B\glove.6B.300d.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding='utf-8'))

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

  """


In [58]:
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i-1] = embedding_vector

In [59]:
inp = Input(shape=(maxlen,))
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 300)          1156200   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_4 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 17        
Total para

In [60]:
model.fit(train_X, train_y, batch_size=512, epochs=3, validation_data=(val_X, val_y))

Train on 6094 samples, validate on 1524 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x12b16c622e8>

In [61]:
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)



In [62]:
for thresh in np.arange(0.4, 0.601, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))))

F1 score at threshold 0.4 is 0.931740614334471
F1 score at threshold 0.41 is 0.9316628701594533
F1 score at threshold 0.42 is 0.9327251995438997
F1 score at threshold 0.43 is 0.930817610062893
F1 score at threshold 0.44 is 0.9304997128087307
F1 score at threshold 0.45 is 0.9308755760368663
F1 score at threshold 0.46 is 0.9324870167339874
F1 score at threshold 0.47 is 0.9316338354577057
F1 score at threshold 0.48 is 0.931554524361949
F1 score at threshold 0.49 is 0.931554524361949
F1 score at threshold 0.5 is 0.9320951828206616
F1 score at threshold 0.51 is 0.9317784256559768
F1 score at threshold 0.52 is 0.9323220536756127
F1 score at threshold 0.53 is 0.9336465061655901
F1 score at threshold 0.54 is 0.9329411764705883
F1 score at threshold 0.55 is 0.9328621908127208
F1 score at threshold 0.56 is 0.9309734513274337
F1 score at threshold 0.57 is 0.9303423848878395
F1 score at threshold 0.58 is 0.9290780141843972
F1 score at threshold 0.59 is 0.9289940828402368
F1 score at threshold 0.6 

In [63]:
pred_glove_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [71]:
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
import gc; gc.collect()
time.sleep(5)

In [64]:
pred_val_y = 0.5*pred_glove_val_y + 0.5*pred_noemb_val_y

In [66]:
for thresh in np.arange(0.5, 0.601, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_val_y>thresh).astype(int))))

F1 score at threshold 0.5 is 0.9454123112659697
F1 score at threshold 0.51 is 0.9446709376820035
F1 score at threshold 0.52 is 0.9438596491228071
F1 score at threshold 0.53 is 0.9442160892542573
F1 score at threshold 0.54 is 0.9434628975265018
F1 score at threshold 0.55 is 0.9409681227863046
F1 score at threshold 0.56 is 0.9402013025458852
F1 score at threshold 0.57 is 0.9387269482450923
F1 score at threshold 0.58 is 0.9373881932021467
F1 score at threshold 0.59 is 0.9385074626865673
F1 score at threshold 0.6 is 0.9377245508982037


In [67]:
pred_test_y = 0.5*pred_glove_test_y + 0.5*pred_noemb_test_y

In [68]:
pred_test_y = (pred_test_y>0.5).astype(int)

In [69]:
out_df =  pd.DataFrame(test_df)
out_df['Prediction'] = pred_test_y

In [70]:
out_df.to_csv("out.csv")

In [56]:
pred_noemb_test_y = (pred_noemb_test_y>0.5).astype(int)
out_df2 =  pd.DataFrame(test_df)
out_df2['Prediction'] = pred_noemb_test_y
out_df2.to_csv("out_2.csv")

In [75]:
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

cnt_srs = out_df['Prediction'].value_counts()
trace = go.Bar(
    x=cnt_srs.index,
    y=cnt_srs.values,
    marker=dict(
        color=cnt_srs.values,
        colorscale = 'Picnic',
        reversescale = True
    ),
)

layout = go.Layout(
    title='Target Count',
    font=dict(size=18)
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="TargetCount")

## target distribution ##
labels = (np.array(cnt_srs.index))
sizes = (np.array((cnt_srs / cnt_srs.sum())*100))

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(
    title='Target distribution for ensemble model',
    font=dict(size=18),
    width=600,
    height=600,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="usertype")

In [76]:
cnt_srs = out_df2['Prediction'].value_counts()
trace = go.Bar(
    x=cnt_srs.index,
    y=cnt_srs.values,
    marker=dict(
        color=cnt_srs.values,
        colorscale = 'Picnic',
        reversescale = True
    ),
)

layout = go.Layout(
    title='Target Count',
    font=dict(size=18)
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="TargetCount")

## target distribution ##
labels = (np.array(cnt_srs.index))
sizes = (np.array((cnt_srs / cnt_srs.sum())*100))

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(
    title='Target distribution for no embeddings model',
    font=dict(size=18),
    width=600,
    height=600,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="usertype")

In [80]:
df3['Prediction'] = out_df['Prediction']
df3.columns = [' ', 'index', 'Text', 'Prediction']

In [87]:
df3.to_csv('3.txt', index=False)