In [1]:
import pandas as pd
import os
from matplotlib import pyplot as plt

In [2]:
current_directory = os.getcwd()
print(current_directory)
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

D:\boshi\GitHub\kears_learning\training_project\Natural Language Processing with Disaster Tweets


In [3]:
print(train.columns)
train.head(5)

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
train_label = train.target.to_numpy()

In [6]:
y_train = train_label.reshape(-1)

In [7]:
train.drop(['id', 'keyword', 'location', 'target'], axis=1, inplace=True)
test.drop(['id', 'keyword', 'location'], axis=1, inplace=True)

In [8]:
train = train.to_numpy()
test = test.to_numpy()

In [9]:
X_train = train.reshape(-1)
X_test = test.reshape(-1)

In [10]:
print(X_train[0])

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all


In [11]:
max_features = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

In [12]:
import numpy as np
con = np.append(X_train, X_test)

In [13]:
con

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ..., 'Green Line derailment in Chicago http://t.co/UtbXLcBIuY',
       'MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3',
       '#CityofCalgary has activated its Municipal Emergency Plan. #yycstorm'],
      dtype=object)

In [14]:
from keras.preprocessing.text import Tokenizer


# We create a tokenizer, configured to only take
# into account the top-1000 most common words
tokenizer = Tokenizer(num_words=max_features)
# This builds the word index
tokenizer.fit_on_texts(con)

# This turns strings into lists of integer indices.
sequences = tokenizer.texts_to_sequences(con)

# This is how you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 29319 unique tokens.


In [17]:
print(len(sequences))

10876


In [22]:
X_train = sequences[:len(X_train)]
X_test = sequences[len(X_train):]

In [24]:
from keras import layers, Model
from keras.metrics import AUC

def build_model():
    # model = Sequential()
    # model.add(layers.Embedding(input_length=max_features, input_dim=maxlen, output_dim=128))
    # model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
    # model.add(layers.Bidirectional(layers.LSTM(64)))
    # model.add(layers.Dense(1, activation="sigmoid"))
    # model.compile(optimizer='adam',
    #                   loss='binary_crossentropy',
    #                   metrics=[AUC(name = 'auc')])
    # Input for variable-length sequences of integers
    inputs = layers.Input(shape=(None,), dtype="int32")
    # Embed each integer in a 128-dimensional vector
    x = layers.Embedding(max_features, 128)(inputs)
    # Add 2 bidirectional LSTMs
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(64))(x)
    # Add a classifier
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=[AUC(name = 'auc')])

    return model

In [26]:
model_summary = build_model()
model_summary.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         2560000   
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2,757,761
Trainable params: 2,757,761
Non-train

In [27]:
from keras.callbacks import *
from sklearn.metrics import roc_auc_score
import gc

from sklearn.model_selection import KFold
def fit_model(nfold, epochs=60, batch_size=32, verbose=False):
    test_preds = []
    auc = []
    kfold = KFold(n_splits=nfold, shuffle=True)

    for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train, y_train)):
        print(f"Fold: {fold+1}", end=' ')
        X_train_part, X_valid = X_train[train_idx], X_train[test_idx]
        y_train_part, y_valid = y_train[train_idx], y_train[test_idx]


        lr = ReduceLROnPlateau(monitor="val_auc", mode='max', factor=0.7, patience=4, verbose=False)
        es = EarlyStopping(monitor='val_auc',mode='max', patience=10, verbose=False,restore_best_weights=True)
        model = build_model()
        history = model.fit(X_train_part, y_train_part, validation_data=(X_valid, y_valid), epochs=epochs, batch_size=batch_size,
                            callbacks=[es,lr], verbose=verbose)

        y_pred = model.predict(X_valid).squeeze()
        auc_score = roc_auc_score(y_valid, y_pred)
        print(f'auc: {round(auc_score, 5)}')
        test_preds.append(model.predict(X_test).squeeze())
        auc.append(auc_score)
        del X_train_part, X_valid, y_train_part, y_valid, history, model
        gc.collect()


    return test_preds, auc

folds = 4
(test_preds, auc) = fit_model(folds, epochs=60, batch_size=64, verbose=True)

Fold: 1 

TypeError: only integer scalar arrays can be converted to a scalar index