In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import random

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/train.csv


In [2]:
os.environ['PYTHONHASHSEED'] = '123'
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)

# Data

In [3]:
df_train = pd.read_csv('../input/nlp-getting-started/train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test = pd.read_csv('../input/nlp-getting-started/test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
for i in df_train.index:
    df_train.loc[i, 'text'] = df_train.loc[i, 'text'].lower()
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this #earthquake m...,1
1,4,,,forest fire near la ronge sask. canada,1
2,5,,,all residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,just got sent this photo from ruby #alaska as ...,1


In [6]:
for i in df_test.index:
    df_test.loc[i, 'text'] = df_test.loc[i, 'text'].lower()
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,"heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,apocalypse lighting. #spokane #wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan


In [7]:
df_train.drop(['keyword', 'location'], axis = 1, inplace=True)
df_test.drop(['keyword', 'location'], axis = 1, inplace=True)
df_train.head()

Unnamed: 0,id,text,target
0,1,our deeds are the reason of this #earthquake m...,1
1,4,forest fire near la ronge sask. canada,1
2,5,all residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,just got sent this photo from ruby #alaska as ...,1


In [8]:
for i in df_train.loc[df_train.text.str.contains('http://[^ ]+|https://[^ ]+|www.[^ ]+')].index:
    df_train.loc[i, 'text'] = re.sub('http://[^ ]+|https://[^ ]+|www.[^ ]+', 'URL', df_train.loc[i, 'text'])
df_train.loc[df_train.text.str.contains('http://[^ ]+|https://[^ ]+|www.[^ ]+')].count()

id        0
text      0
target    0
dtype: int64

In [9]:
for i in df_test.loc[df_test.text.str.contains('http://[^ ]+|https://[^ ]+|www.[^ ]+')].index:
    df_test.loc[i, 'text'] = re.sub('http://[^ ]+|https://[^ ]+|www.[^ ]+', 'URL', df_test.loc[i, 'text'])
df_test.loc[df_test.text.str.contains('http://[^ ]+|https://[^ ]+|www.[^ ]+')].count()

id      0
text    0
dtype: int64

In [10]:
for i in df_train.loc[df_train.text.str.contains('@[^ ]')].index:
    df_train.loc[i, 'text'] = re.sub('@[^ ]', '@', df_train.loc[i, 'text'])
df_train.loc[df_train.text.str.contains('@[^ ]')].count()

id        2015
text      2015
target    2015
dtype: int64

In [11]:
for i in df_test.loc[df_test.text.str.contains('@[^ ]')].index:
    df_test.loc[i, 'text'] = re.sub('@[^ ]', '@', df_test.loc[i, 'text'])
df_test.loc[df_test.text.str.contains('@[^ ]')].count()

id      918
text    918
dtype: int64

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict

stopw = set(stopwords.words('english'))
stopw.update(['\'m'])
wordlist = defaultdict(int)
for i in df_train.text:
    w = word_tokenize(i)
    for j in w:
        if j in stopw:
            continue
        wordlist[j] += 1
for i in df_test.text:
    w = word_tokenize(i)
    for j in w:
        if j in stopw:
            continue
        wordlist[j] += 1
print(len(wordlist))

23466


In [13]:
wl = sorted(wordlist, key = lambda x:wordlist[x], reverse = True)
for i in range(20):
    print(wl[i], wordlist[wl[i]])

URL 6765
# 4820
? 4446
. 4268
@ 4032
: 2847
! 1666
... 1530
's 1117
- 1094
' 966
; 835
& 651
n't 621
) 529
amp 510
( 504
like 489
fire 357
get 333


In [14]:
cnt = 0
for i in wl:
    if wordlist[i] < 20:
        cnt += 1
print(cnt)

22393


In [15]:
vocab_size = 10000 - 2
vocab = {}
cnt = 1

for i in sorted(list(wordlist.items()), key=lambda x: x[1], reverse=True)[:vocab_size]:
    vocab[i[0]] = cnt
    cnt += 1
print(len(vocab))

9998


In [16]:
maxlen = 50

train_x = []

for i in df_train.index:
    W = word_tokenize(df_train.loc[i, 'text'])
    for w in range(len(W)):
        try:
            W[w] = vocab[W[w]]
        except:
            W[w] = vocab_size
    if len(W) < maxlen:
        W += [0 for i in range(maxlen-len(W))]
    train_x.append(W[:maxlen])
print(len(train_x))

7613


In [17]:
FullDataset = tf.data.Dataset.from_tensor_slices({'x':train_x, 'y':df_train['target']})
FullDataset = FullDataset.shuffle(10000)
print(len(FullDataset))

7613


In [18]:
TrainDataset = FullDataset.skip(len(FullDataset)//10)
print(len(TrainDataset))

6852


In [19]:
ValidDataset = FullDataset.take(len(FullDataset)//10)
print(len(ValidDataset))

761


In [20]:
maxlen = 50

test_x = []

for i in df_test.index:
    W = word_tokenize(df_test.loc[i, 'text'])
    for w in range(len(W)):
        try:
            W[w] = vocab[W[w]]
        except:
            W[w] = vocab_size
    if len(W) < maxlen:
        W += [0 for i in range(maxlen-len(W))]
    test_x.append(W[:maxlen])
print(len(test_x))

3263


In [21]:
TestDataset = tf.convert_to_tensor(test_x)
print(len(TestDataset))

3263


# Model

## Positional Encoding

In [22]:
class PositionalEncoding(layers.Layer):
    def __init__(self, pos, dim):
        super(PositionalEncoding, self).__init__()
        pos = tf.range(pos, dtype=tf.float32)[:, tf.newaxis],
        i = tf.range(dim, dtype=tf.float32)[tf.newaxis, :]
        self.encoding = self.calc(pos, i, dim)
        
    def calc(self, pos, i, dim):
        ret = pos / (tf.math.pow(10000, (i//2)/tf.cast(dim, dtype=float)))
        return ret
        
    def call(self, inputs):
        return inputs + self.encoding[:, :tf.shape(inputs)[1], :]

## Multi Head Self Attention

In [23]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim=512, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError('embed_dim % num_heads != 0')
        
        self.key = layers.Dense(embed_dim)
        self.value = layers.Dense(embed_dim)
        self.query = layers.Dense(embed_dim)
        self.concat_dense = layers.Dense(embed_dim)
        
    def attention(self, key, value, query):
        matm = tf.matmul(query, key, transpose_b=True)
        key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled = matm / tf.math.sqrt(key_dim)
        attention_weights = tf.nn.softmax(scaled, axis=-1)
        output = tf.matmul(attention_weights, value)
        return output, attention_weights
    
    def seperate_heads(self, x, batch_size):
        x = tf.reshape(x, [batch_size, -1, self.num_heads, self.embed_dim // self.num_heads])
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        key = self.key(inputs)
        value = self.value(inputs)
        query = self.query(inputs)
        key = self.seperate_heads(key, batch_size)
        value = self.seperate_heads(value, batch_size)
        query = self.seperate_heads(query, batch_size)
        attention, attention_weights = self.attention(key, value, query)
        attention = tf.transpose(attention, perm=[0,2,1,3])
        attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        output = self.concat_dense(attention)
        return output

## Transformer Block

In [24]:
class TransformerLayer(layers.Layer):
    def __init__(self, dim=512, dff=2048, num_head=8, dropout_rate=0.1):
        super(TransformerLayer, self).__init__()
        
        self.MultiHeadSelfAttention = MultiHeadSelfAttention(dim, num_head)
        self.LayerNormalization1 = layers.LayerNormalization()
        self.LayerNormalization2 = layers.LayerNormalization()
        self.ffn1 = layers.Dense(dff, activation='relu')
        self.ffn2 = layers.Dense(dim)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
        
    def call(self, inputs):
        x = self.MultiHeadSelfAttention(inputs)
        x = self.LayerNormalization1(x+inputs)
        x = self.dropout1(x)
        
        output = self.ffn1(x)
        output = self.ffn2(output)
        output = self.dropout2(output)
        
        output = self.LayerNormalization2(x+output)
        
        return output

## Transformer

In [25]:
dim = 512
dff = 2048
num_heads = 8
num_layers = 1
dropout_rate = 0.1
maxlen = 50
vocab_size = 10000

inputs = keras.Input((maxlen,))
x = layers.Embedding(vocab_size, dim)(inputs)
x = PositionalEncoding(vocab_size, dim)(x)

for _ in range(num_layers):
    x = TransformerLayer(dim, dff, num_heads, dropout_rate)(x)

output = layers.GlobalAveragePooling1D()(x)
output = layers.Dropout(dropout_rate)(output)
output = layers.Dense(16)(output)
output = layers.Dropout(dropout_rate)(output)
output = layers.Dense(1, activation='sigmoid')(output)

model = keras.Model(inputs, output)

In [26]:
def f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [27]:
model.compile(optimizer='adam',
              loss='BinaryCrossentropy',
              metrics=[f1_score])

# Train

In [28]:
batch_size = 512
TrainDataset = TrainDataset.batch(batch_size)
ValidDataset = ValidDataset.batch(batch_size)

In [29]:
num_epochs = 20
best_f1 = 0

for epoch in range(num_epochs):
    model.reset_metrics()
    
    print('epoch {}'.format(epoch))
    print('train')
    TrainDataset.shuffle(10000)
    for idx, batch in enumerate(TrainDataset):
        result = model.train_on_batch(batch['x'], batch['y'])
        if idx % (len(TrainDataset) - 1) == 0:
            print('loss : {}, f1 score : {}'.format(result[0], result[1]))
        
    print('valid')
    for batch in ValidDataset:
        loss, f1_score = model.test_on_batch(batch['x'], batch['y'])
    print('loss : {}, f1 score : {}'.format(loss, f1_score))
    if f1_score > best_f1:
        best_f1 = f1_score
        model.save_weights('best.h5')

epoch 0
train
loss : 0.7785359621047974, f1 score : 0.33854159712791443
loss : 1.4610300064086914, f1 score : 0.02247190847992897
valid
loss : 1.2128864526748657, f1 score : 0.0
epoch 1
train
loss : 1.2051515579223633, f1 score : 0.03463202714920044
loss : 0.7592166066169739, f1 score : 0.08791207522153854
valid
loss : 0.6527432799339294, f1 score : 0.0
epoch 2
train
loss : 0.7367075681686401, f1 score : 0.27627620100975037
loss : 0.7270467281341553, f1 score : 0.5520361661911011
valid
loss : 0.7395619750022888, f1 score : 0.5932202935218811
epoch 3
train
loss : 0.747222900390625, f1 score : 0.5903614163398743
loss : 0.7292758226394653, f1 score : 0.4480873942375183
valid
loss : 0.6714686155319214, f1 score : 0.0
epoch 4
train
loss : 0.6983681917190552, f1 score : 0.4050632417201996
loss : 0.7168970108032227, f1 score : 0.07207205146551132
valid
loss : 0.6692432165145874, f1 score : 0.0
epoch 5
train
loss : 0.6696380972862244, f1 score : 0.20512817800045013
loss : 0.6655207276344299, f

# Predict

In [30]:
model.load_weights('best.h5')

In [31]:
target = np.round(model.predict(TestDataset)).astype(int)

In [32]:
submission = pd.DataFrame()
submission['id'] = df_test['id']
submission.head()

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11


In [33]:
submission.tail()

Unnamed: 0,id
3258,10861
3259,10865
3260,10868
3261,10874
3262,10875


In [34]:
submission['target'] = target
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,1


In [35]:
submission.to_csv('submission.csv', index=False)