In [31]:
import tensorflow as tf
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import sys
import io
import pandas as pd
from gensim.corpora.dictionary import Dictionary
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
!pip3 install nltk
from nltk.tokenizer import TweetTokenizer

Collecting nltk
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 456 kB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434674 sha256=9d81b24e719560627ee385349b9584d33cd6dafe8d869f533fc125051f0ec73c
  Stored in directory: /Users/alditopalli/Library/Caches/pip/wheels/ff/d5/7b/f1fb4e1e1603b2f01c2424dd60fbcc50c12ef918bafc44b155
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.5


ModuleNotFoundError: No module named 'nltk.tokenizer'

In [9]:
# Read the train and test CSV files (UTF-8) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "../data/nlp-getting-started/train.csv",
        "../data/nlp-getting-started/test.csv",
    )
)

In [18]:
# Load the pre-trained 50-dimensional vectors into `dict_w2v`
# ({word: float32 vector}). Lines that cannot be parsed are collected in
# `problems` instead of aborting the load.
EMBEDDING_DIM = 50

dict_w2v = {}
problems = []

# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding.
with open("../data/glove.twitter.27B.50d.txt", "r", encoding="utf-8") as f:
    for line in tqdm(f):
        # Split from the right and only on the plain-space delimiter: the last
        # EMBEDDING_DIM fields are the vector, whatever precedes them is the
        # word. A bare `line.split()` splits on *any* Unicode whitespace, so a
        # word that is itself a whitespace-like character vanishes and the
        # first float is taken for the word (presumably how 49-dim "problem"
        # vectors arose).
        fields = line.rstrip("\n").rsplit(" ", EMBEDDING_DIM)

        if len(fields) != EMBEDDING_DIM + 1:
            problems.append({fields[0]: fields[1:]})
            continue

        word = fields[0]
        try:
            vector = np.array(fields[1:], dtype=np.float32)
        except ValueError:
            # A non-numeric component: keep the raw fields for inspection.
            problems.append({word: fields[1:]})
            continue

        dict_w2v[word] = vector

1193514it [00:23, 49938.86it/s]


(49,)

In [28]:
def clean_data(df):
    """Normalise the tweet text in ``df["text"]`` and return the cleaned frame.

    The substitutions are applied in this order:

    1. strip HTML-like tags (``<...>``);
    2. URLs -> ``<url>``;
    3. ``@mentions`` -> ``<user>``;
    4. ``#hashtags`` -> ``<hashtag>`` (the hashtag text itself is dropped);
    5. remove a few mis-encoded character sequences;
    6. happy emoticons (``:)``, ``;-D`` ...) -> ``<smile>``;
    7. sad emoticons (``:(``, ``;-((`` ...) -> ``<sad>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``"text"``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``"text"`` column has been cleaned. The input
        frame is left untouched, and missing values in ``"text"`` stay missing.

    Notes
    -----
    The function is not idempotent: step 1 would treat the placeholder tags
    produced by the later steps as HTML and strip them, so apply it to raw
    text only.
    """

    def substitute(series, pattern, replacement):
        # A compiled pattern keeps Python `re` semantics regardless of the
        # string backend pandas uses; `.str` also passes missing values through.
        return series.str.replace(re.compile(pattern), replacement, regex=True)

    # Work on a copy so the caller's frame is not modified behind its back.
    cleaned = df.copy()
    text = cleaned["text"]

    # remove html tags
    text = substitute(text, r'<.*?>', '')
    # replace urls with <url> tag
    text = substitute(text, r'https?://\S+|www\.\S+', '<url>')
    # replace user names with <user> tag
    text = substitute(text, r'@[a-zA-Z0-9_]+', '<user>')
    # replace hashtags with <hashtag> tag
    text = substitute(text, r'#[a-zA-Z0-9_]+', '<hashtag>')
    # remove noisy (mis-encoded) sequences - here it can be improved
    for noise in ("\x89", "hÛ_", "ÛÓ"):
        text = text.str.replace(noise, "", regex=False)
    # replace the happy emoticons with <smile> tag
    text = substitute(text, r'(:|;)-?(\)|D|d)', "<smile>")
    # replace the sad emoticons with <sad> tag
    # NOTE(review): confirm that "<sad>" exists in the embedding vocabulary
    # used downstream; otherwise it silently maps to the unknown vector.
    text = substitute(text, r'(:|;)-?\(+', "<sad>")

    cleaned["text"] = text
    return cleaned

In [29]:
# Run the cleaning step over both splits and rebind each name to its cleaned frame.
df_train, df_test = map(clean_data, (df_train, df_test))

In [41]:
from nltk.tokenize import TweetTokenizer


def _embed_tokens(token_lists, w2v, unknown_key="<unknown>"):
    """Map every token of every tweet to its pre-trained vector.

    Lookup order per token: exact match, lower-cased match, then the vector
    stored under ``unknown_key``. The lower-case fallback matters because
    ``TweetTokenizer()`` preserves case by default while the GloVe Twitter
    vocabulary is lower-cased; without it, capitalised words would all
    collapse onto the unknown vector.

    Returns a list (one entry per tweet) of lists of vectors.
    """
    unknown_vector = w2v[unknown_key]  # hoisted: identical for every miss

    def lookup(token):
        vector = w2v.get(token)
        if vector is None:
            vector = w2v.get(token.lower())
        return unknown_vector if vector is None else vector

    return [[lookup(token) for token in tokens] for tokens in token_lists]


tokenizer = TweetTokenizer()

tokens_train = [tokenizer.tokenize(tweet) for tweet in df_train["text"]]
tokens_test = [tokenizer.tokenize(tweet) for tweet in df_test["text"]]

# One shared vocabulary over both splits, so every test token has an id.
vocab = Dictionary(tokens_train + tokens_test)

# Reserve id 0 for padding.
special_tokens = {"<pad>": 0}
vocab.patch_with_special_tokens(special_tokens)

# Token-id sequences (variable length at this point).
X_train = [vocab.doc2idx(tokens) for tokens in tokens_train]
y_train = df_train["target"].values
X_test = [vocab.doc2idx(tokens) for tokens in tokens_test]

# Parallel sequences of pre-trained vectors, one vector per token.
w2v_train = _embed_tokens(tokens_train, dict_w2v)
w2v_test = _embed_tokens(tokens_test, dict_w2v)

In [44]:
def _pad_vectors(vector_seqs, maxlen, dim=50):
    """Left-pad (and, if needed, left-truncate) vector sequences to ``maxlen``.

    This mirrors the defaults of ``pad_sequences`` (``padding="pre"``,
    ``truncating="pre"``), so step ``t`` of row ``i`` in the result lines up
    with step ``t`` of row ``i`` in the padded token-id matrix. Padding the
    vectors at the end while the ids are padded at the front would pair each
    token id with the wrong vector.

    Returns a float32 array of shape ``(len(vector_seqs), maxlen, dim)``.
    """
    padded = np.zeros((len(vector_seqs), maxlen, dim), dtype=np.float32)
    for row, seq in enumerate(vector_seqs):
        kept = seq[len(seq) - maxlen:] if len(seq) > maxlen else seq
        if len(kept):
            padded[row, maxlen - len(kept):] = kept
    return padded


# Token ids: zero-padded at the front (pad_sequences default).
X_train = pad_sequences(X_train)
seq_len = X_train.shape[1]
w2v_train = _pad_vectors(w2v_train, seq_len)

# The test split must use the *training* length, because the model's input
# shape is derived from X_train; longer test tweets are truncated from the front.
X_test = pad_sequences(X_test, maxlen=seq_len)
w2v_test = _pad_vectors(w2v_test, seq_len)

In [48]:
# Two-input binary classifier:
#   * token ids   -> trainable 25-d embedding (id 0 is masked as padding)
#   * GloVe input -> fixed 50-d vectors, one per token
# Both are concatenated per timestep and fed to a single LSTM.
seq_len = X_train.shape[1]

# `shape` is given as a tuple (excluding the batch axis), as the Keras API expects.
input_tokens = tf.keras.layers.Input(shape=(seq_len,), name="input_tokens")
input_w2v = tf.keras.layers.Input(shape=(seq_len, 50), name="input_w2v")

embeddings = tf.keras.layers.Embedding(len(vocab.token2id)+1, 25, mask_zero=True)(input_tokens)
embeddings = tf.keras.layers.SpatialDropout1D(0.2)(embeddings)

# `Concatenate` is a Keras layer and carries the padding mask produced by
# `mask_zero=True` on to the LSTM; a raw `tf.concat` op gives no such guarantee.
features = tf.keras.layers.Concatenate(axis=2)([embeddings, input_w2v])

lstms = tf.keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.2)(features)
outputs = tf.keras.layers.Dense(1, activation="sigmoid", kernel_regularizer=tf.keras.regularizers.l2(0.01))(lstms)

model = tf.keras.models.Model(inputs=[input_tokens, input_w2v], outputs=outputs)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

In [49]:
# Keras takes the `validation_split` fraction from the *end* of the arrays,
# before any shuffling. The rows are therefore permuted first (with a fixed
# seed, for reproducibility) so the validation set is a random sample rather
# than whatever happens to be last in the CSV.
shuffle_idx = np.random.default_rng(42).permutation(len(y_train))

# Stop once validation accuracy has not improved for 5 epochs and roll back
# to the best weights seen.
early_stop_cb = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=5, restore_best_weights=True)

history = model.fit(
    [X_train[shuffle_idx], w2v_train[shuffle_idx]],
    y_train[shuffle_idx],
    batch_size=32,
    epochs=15,
    validation_split=0.1,
    callbacks=[early_stop_cb],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x150ecb6d0>

In [51]:
# `predict` runs inference batch by batch and returns a NumPy array; the
# model's single sigmoid unit gives one probability per row, flattened here.
probabilities = model.predict([X_test, w2v_test]).ravel()

# Threshold at 0.5: strictly below -> class 0, otherwise class 1.
y_pred = np.where(probabilities < 0.5, 0, 1)

# Submission file: one row per test id with its predicted label.
df_pred = pd.DataFrame({"id": df_test["id"], "target": y_pred})
df_pred.to_csv("submission.csv", index=False)

In [54]:
# Display the test frame as it stands after cleaning (pandas abbreviates the middle rows).
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about <hashtag> is different cities, sta..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. <hashtag> <hashtag>
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTEN...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago <url>
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) <url>
