In [1]:
import numpy as np
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gensim
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import urllib
np.random.seed(100)

from keras.callbacks import ModelCheckpoint
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Activation, Flatten,Dropout,Input,Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.losses import mean_squared_error
import keras.backend as K


Using TensorFlow backend.


In [2]:
glove_file = datapath("/home/aims/Downloads/glove.twitter.27B/glove.twitter.27B.100d.txt")
tmp_file = get_tmpfile("glove_to_w2v.txt")
_ = glove2word2vec(glove_file, tmp_file)

In [3]:
model = KeyedVectors.load_word2vec_format(tmp_file)

In [4]:
wv = model
words = list(wv.wv.vocab.keys())
word_to_index, index_to_word = dict(),dict()
for i,word in enumerate(words):
    word_to_index[word]=i
    index_to_word[i]=word
print("vocabulary size= ",len(words)," words")

  


vocabulary size=  1193514  words


In [5]:
def vectorise(words,wv_model,max_length=35):
    words = words.lower().split()
    vectors = [wv_model[word] for word in words]
    return append_zeros(vectors,max_length)
def append_zeros(words,max_length):
    for i in range(max_length-len(words)):
        words.append(np.zeros(300))
    return np.array(words)

def sentences_to_indices(X, word_to_index, max_len):
    m = X.shape[0]
    X_indices = np.zeros((m, max_len),dtype=int)
    for i in range(m):
        sentence_words = X[i].split()
        j = 0
        for w in sentence_words:
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
            j = j+1
            if j>=max_len:
                break
    return X_indices

In [6]:
def read_data(path,testing=False):
    data = pd.read_csv(path)
    
    data["keyword"].replace(np.nan,"-",inplace=True)
    data["keyword"] = data["keyword"].apply(lambda x:urllib.parse.unquote(x))
    
    data["location"].replace(np.nan,"-",inplace=True)
    data["location"] = data["location"].apply(lambda x: re.sub("[^-a-zA-Z\s]","",x))
    
    data["text"] = data["text"].apply(lambda x: re.sub("[^\w\s#'_]","",x)).apply(lambda x:x.lower())
    data["text"] = data["text"].apply(lambda x: " ".join([a for a in re.split("([#$])",x) if len(a)!=0]))
    
    
    new_data = pd.DataFrame()
    new_data["id"] = data["id"]
    
    new_data["text"] = data["keyword"] + " " + data["location"]+ " " + data["text"]
    
    
    if not testing:
        new_data["target"] = data["target"]
    return new_data

In [45]:
MAX_LENGTH = 35

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1
    emb_dim = word_to_vec_map["hello"].shape[0]
    emb_matrix = np.zeros((vocab_len,emb_dim))
                                            
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]
        
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

def logistic_regression(input_shape, dropout_prob,n_h):
    input_data = Input(shape=input_shape)
    embedding_layer = pretrained_embedding_layer(wv,word_to_index)(input_data)
    X = Flatten()(embedding_layer)
    X = Dense(n_h,activation="tanh")(X)
    X = Dropout(dropout_prob)(X)
    X = Dense(1)(X)
    X = Activation("sigmoid")(X)
           
    model = Model(inputs=input_data, outputs=X)
    return model
model = logistic_regression((MAX_LENGTH,), 0.9, 128)



In [46]:
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 35)                0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 35, 100)           119351500 
_________________________________________________________________
flatten_5 (Flatten)          (None, 3500)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               448128    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 129       
_________________________________________________________________
activation_6 (Activation)    (None, 1)                 0   

In [47]:
train_path = "./data/train.csv"
train_data = read_data(train_path)
train_data.head()

Unnamed: 0,id,text,target
0,1,- - our deeds are the reason of this # earthq...,1
1,4,- - forest fire near la ronge sask canada,1
2,5,- - all residents asked to 'shelter in place' ...,1
3,6,- - 13000 people receive # wildfires evacuati...,1
4,7,- - just got sent this photo from ruby # alas...,1


In [48]:
X_train, X_test, y_train, y_test = train_test_split(train_data["text"], train_data["target"], test_size=0.33, random_state=42)
X_train = sentences_to_indices(np.array(X_train.values),word_to_index,MAX_LENGTH)
X_test = sentences_to_indices(np.array(X_test.values),word_to_index,MAX_LENGTH)

In [49]:
model.compile(optimizer='adam',
              loss=mean_squared_error,
              metrics=['accuracy'])

In [50]:
callbacks = [EarlyStopping(
                    monitor='val_loss'
                    ,patience=10
                    ),
             ModelCheckpoint(
                     "best_model.h5"
                     ,monitor="val_loss"
                     ,mode="min"
                     ,save_best_only=True
                     ,verbose=True
                    )
            ]

In [51]:
model.fit(X_train, y_train, epochs=100, batch_size=32,validation_data=(X_test, y_test),shuffle=True,callbacks=callbacks)

Train on 5100 samples, validate on 2513 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.22460, saving model to best_model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.22460 to 0.17728, saving model to best_model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 0.17728 to 0.16516, saving model to best_model.h5
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.16516
Epoch 5/100

Epoch 00005: val_loss improved from 0.16516 to 0.15392, saving model to best_model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 0.15392 to 0.15063, saving model to best_model.h5
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.15063
Epoch 8/100

Epoch 00008: val_loss improved from 0.15063 to 0.14728, saving model to best_model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 0.14728 to 0.14696, saving model to best_model.h5
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.14696
Epoch 11/100

Epoch 00011: val_loss improved from 0.14696 to 0.14375, 


Epoch 00039: val_loss did not improve from 0.13784
Epoch 40/100

Epoch 00040: val_loss did not improve from 0.13784


<keras.callbacks.callbacks.History at 0x7f6a6d84ae10>

In [16]:
Embedding?