In [29]:
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional, Conv1D, Activation, BatchNormalization

In [12]:
df_en = pickle.load(open('en.p','rb'))

In [4]:
embedding_dict={}
with open('glove.6B.300d.txt','r') as f:
    for line in f:
        values=line.split()
        word = values[0]
        vectors=np.asarray(values[1:],'float64')
        embedding_dict[word]=vectors
f.close()

In [5]:
def create_corpus_new(df):
    corpus=[]
    for review in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(review)]
        corpus.append(words)
    return corpus   

In [13]:
corpus = create_corpus_new(df_en)

100%|██████████| 7264/7264 [00:00<00:00, 8099.18it/s]


In [15]:
MAX_LEN=150
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)
tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [16]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 17245


In [17]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,300))

for word,i in tqdm(word_index.items()):
    if i < num_words:
        emb_vec=embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i]=emb_vec           

100%|██████████| 17245/17245 [00:00<00:00, 338787.41it/s]


In [18]:
tweet_pad.shape

(7264, 150)

In [21]:
pd.get_dummies(df_en.target)

Unnamed: 0,0,1
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
7608,0,1
7609,0,1
7610,0,1
7611,0,1


In [22]:
tweet_pad.shape

(7264, 150)

In [23]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X, y = oversample.fit_resample(tweet_pad, df_en.target)



In [24]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y ,test_size=0.2)

In [26]:
from sklearn.model_selection import train_test_split
import pandas as pd
from keras.preprocessing import sequence, text

total_vocab = set(word for text in df_en.text for word in text.split(' '))

tokenizer = text.Tokenizer(num_words=len(total_vocab))
tokenizer.fit_on_texts(df_en.text)
tokenized_list = tokenizer.texts_to_sequences(df_en.text)
padded_seq = sequence.pad_sequences(tokenized_list)
target_dummies = pd.get_dummies(df_en.target)

X_train, X_test, y_train, y_test = train_test_split(padded_seq, pd.get_dummies(df_en.target), test_size=0.2)


In [32]:
model = Sequential()
embedding_size=150
model.add(Embedding(len(total_vocab), embedding_size))
model.add(Conv1D(64, kernel_size=5, padding='valid', activation='softmax', strides=1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy', 'mse'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 150)         2587050   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 64)          48064     
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 300)         258000    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 602       
Total params: 2,893,716
Trainable params: 2,893,716
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2)

print('-------Accuracy-------', '\n')
print(model.evaluate(X_test, y_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4648 samples, validate on 1163 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
-------Accuracy------- 

[0.8427728165123106, 0.7704748511314392, 0.19173911213874817]


In [37]:
model.metrics_names

['loss', 'accuracy', 'mse']

In [38]:
test = pd.read_csv('test.csv')

In [39]:
test.keyword.value_counts()

deluged          23
rubble           22
demolished       22
sirens           21
obliteration     21
                 ..
fatalities        5
threat            5
forest%20fire     5
inundation        4
epicentre         1
Name: keyword, Length: 221, dtype: int64

In [40]:
tokenized_list_test = tokenizer.texts_to_sequences(test.text)
padded_seq = sequence.pad_sequences(tokenized_list_test)

In [41]:
y_pred = model.predict(padded_seq)

In [42]:
test_list = []

for i in y_pred:
    test_list.append(np.argmax(i))

In [43]:
sample_sub = pd.read_csv('sample_submission.csv')

In [44]:
sample_sub

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [45]:
result = pd.DataFrame({'id':sample_sub['id'], 'target': np.array(test_list)})

In [46]:
result

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [47]:
result.set_index('id', inplace=True)

In [48]:
result

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
...,...
10861,1
10865,1
10868,1
10874,1


In [49]:
result.to_csv('result.csv')