# NLP Quick Start for Newbies 😁 - in 9 Steps

## INDEX
```
Step 1. Library Import & Data Load
Step 2. Data Preprocessing
     2-a. Drop Columns
     2-b. Tokenizer
     2-c. Pad Sequences
     2-d. Match Data type to numpy.ndarray
Step 3. Modeling
Step 4. Model Compile
Step 5. Callbacks
Step 6. Model Fit
Step 7. Model Evaluate & Save
Step 8. Reload Model
Step 9. Predict Test Data
```
---

## Step 1. Library Import & Data Load

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

In [3]:
# Peek at the first five training rows.
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
# Number of distinct values per training column.
train_df.nunique()

id          7613
keyword      221
location    3341
text        7503
target         2
dtype: int64

In [6]:
# Peek at the first five test rows.
test_df.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# Column dtypes and non-null counts for the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [8]:
# Number of distinct values per test column.
test_df.nunique()

id          3263
keyword      221
location    1602
text        3243
dtype: int64

## Step 2. Data Preprocessing

### 2-a. Drop Columns

In [9]:
# Look at the training frame again before dropping columns.
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
# Keep only the tweet text and the label.
# Reassigning (rather than inplace=True) together with errors='ignore' makes
# this cell safe to re-run: once the columns are gone the drop is a no-op
# instead of a KeyError. `axis=1` is omitted because `columns=` already
# selects the axis.
train_df = train_df.drop(columns=['id', 'keyword', 'location'], errors='ignore')

In [11]:
# Look at the test frame again before dropping columns.
test_df.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [12]:
# Drop the unused metadata columns; `id` stays in the frame alongside `text`.
# Reassigning (rather than inplace=True) together with errors='ignore' makes
# this cell safe to re-run: once the columns are gone the drop is a no-op
# instead of a KeyError. `axis=1` is omitted because `columns=` already
# selects the axis.
test_df = test_df.drop(columns=['keyword', 'location'], errors='ignore')

In [13]:
# Sanity check: (rows, columns) of both frames after the drops.
print(*(frame.shape for frame in (train_df, test_df)))

(7613, 2) (3263, 2)


### 2-b. Tokenizer

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the labelled tweets for validation; the fixed random_state
# makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['text'],
    train_df['target'],
    test_size=0.2,
    random_state=111,
)

In [16]:
# Shapes of the four pieces produced by the split.
print(*(part.shape for part in (X_train, y_train, X_valid, y_valid)))

(6090,) (6090,) (1523,) (1523,)


In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
# Tokenizer settings: `vocab_size` is passed as num_words, and "<OOV>" is the
# placeholder token for out-of-vocabulary words.
vocab_size, oov_token = 1000, "<OOV>"
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)

In [19]:
# Fit the tokenizer on the training texts only (the validation texts are not passed here).
tokenizer.fit_on_texts(X_train)

In [20]:
# Replace each split's raw texts with its token-id sequences.
# NOTE: the names are rebound in place, so re-running this cell alone would
# pass the already-encoded sequences back into the tokenizer.
X_train, X_valid = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_valid)
]

In [21]:
# Token counts of the first ten encoded training tweets.
for idx in range(10):
    seq_len = len(X_train[idx])
    print(seq_len)

19
20
18
26
16
22
19
18
14
8


In [22]:
# First encoded training sequence.
X_train[0]

[132, 1, 9, 324, 12, 16, 679, 13, 4, 2, 3, 1, 616, 160, 1, 4, 2, 3, 1]

In [23]:
# Token counts of the first ten encoded validation tweets.
for idx in range(10):
    seq_len = len(X_valid[idx])
    print(seq_len)

19
26
26
23
12
11
20
15
11
10


In [24]:
# First encoded validation sequence.
X_valid[0]

[73, 15, 231, 507, 18, 751, 397, 922, 175, 5, 1, 4, 2, 3, 1, 4, 2, 3, 1]

### 2-c. Pad Sequences

In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# Padding configuration: every sequence is brought to `max_length` tokens,
# truncating and padding at the end ('post') of the sequence.
max_length = 120
trunc_type = pad_type = 'post'

In [27]:
# Pad/truncate both splits with the same settings so they share one shape.
X_train_padded, X_valid_padded = (
    pad_sequences(seqs, maxlen=max_length, truncating=trunc_type, padding=pad_type)
    for seqs in (X_train, X_valid)
)

In [28]:
# First two padded training rows.
X_train_padded[:2]

array([[132,   1,   9, 324,  12,  16, 679,  13,   4,   2,   3,   1, 616,
        160,   1,   4,   2,   3,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [826,   1, 140,  26,   1,   7,   6,   1,  23, 156,   1,  71, 181,
         14,   1,   1,   4,   2,   3,   1,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,

In [29]:
# First two padded validation rows.
X_valid_padded[:2]

array([[ 73,  15, 231, 507,  18, 751, 397, 922, 175,   5,   1,   4,   2,
          3,   1,   4,   2,   3,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 19,   1,  51, 551,  54,   7, 840,   6,   1,   1,  10,  79,  50,
        422,  34,  20,   6,   1,  10,  50,   1,  13,   1,  33,   1,   1,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,

In [30]:
# Both padded arrays should have `max_length` columns.
print(*(padded.shape for padded in (X_train_padded, X_valid_padded)))

(6090, 120) (1523, 120)


### 2-d. Match Data type to numpy.ndarray

In [None]:
# Container types of the features and labels before converting the labels.
print(*map(type, (X_train_padded, X_valid_padded)))
print(*map(type, (y_train, y_valid)))

In [31]:
# Convert both label containers to NumPy arrays so they match the padded features.
y_train, y_valid = map(np.array, (y_train, y_valid))

In [32]:
# Confirm that features and labels are now all the same container type.
print(*map(type, (X_train_padded, X_valid_padded)))
print(*map(type, (y_train, y_valid)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


## Step 3. Modeling

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Flatten

In [34]:
# Output dimension of the Embedding layer. The other two model hyperparameters,
# vocab_size (1000) and max_length (120), are reused from the preprocessing cells.
embedding_dim = 16

In [35]:
# Binary classifier: embedding -> three stacked bidirectional LSTMs -> two
# ReLU dense layers -> a single sigmoid output unit.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
# The first two recurrent layers return full sequences so the next LSTM can
# consume them; the last one returns only its final state.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64, dropout=0.5)))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [36]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           16000     
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 128)          41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 120, 128)          98816     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1

## Step 4. Model Compile

In [37]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked under the metric name 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

## Step 5. Callbacks

In [38]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [39]:
# Checkpoint callback: monitors val_loss and writes weights only
# (save_weights_only) to `filepath`, keeping just the best epoch (save_best_only).
filepath = 'my_checkpoint.ckpt'
cp = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [40]:
# Early-stopping callback watching val_loss with a patience of 5 epochs.
ep = EarlyStopping(monitor='val_loss', patience=5)

## Step 6. Model Fit

In [41]:
# Train for at most `epochs` epochs, validating on the held-out split and
# running the checkpoint (cp) and early-stopping (ep) callbacks each epoch.
epochs = 30
model.fit(
    x=X_train_padded,
    y=y_train,
    epochs=epochs,
    validation_data=(X_valid_padded, y_valid),
    callbacks=[cp, ep],
)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.48233, saving model to my_checkpoint.ckpt
Epoch 2/30

Epoch 00002: val_loss improved from 0.48233 to 0.46487, saving model to my_checkpoint.ckpt
Epoch 3/30

Epoch 00003: val_loss improved from 0.46487 to 0.45732, saving model to my_checkpoint.ckpt
Epoch 4/30

Epoch 00004: val_loss improved from 0.45732 to 0.44602, saving model to my_checkpoint.ckpt
Epoch 5/30

Epoch 00005: val_loss did not improve from 0.44602
Epoch 6/30

Epoch 00006: val_loss did not improve from 0.44602
Epoch 7/30

Epoch 00007: val_loss did not improve from 0.44602
Epoch 8/30

Epoch 00008: val_loss did not improve from 0.44602
Epoch 9/30

Epoch 00009: val_loss did not improve from 0.44602


<tensorflow.python.keras.callbacks.History at 0x7eee2c608f90>

## Step 7. Model Evaluate & Save

In [42]:
# Load the weights stored at `filepath` (the checkpoint path) back into the model.
model.load_weights(filepath)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7eeda8642bd0>

In [43]:
# Score the model on the validation split; the result lists the loss followed by the compiled 'acc' metric.
model.evaluate(X_valid_padded, y_valid)



[0.4460151195526123, 0.7997373342514038]

In [44]:
# First encoded validation sequence (unpadded), shown again for reference.
X_valid[0]

[73, 15, 231, 507, 18, 751, 397, 922, 175, 5, 1, 4, 2, 3, 1, 4, 2, 3, 1]

In [45]:
# Persist the full model to an HDF5 file under ./model/.
# NOTE(review): presumably the ./model directory is created on save — confirm for the Keras version in use.
model.save('./model/basic_nlp.h5')

## Step 8. Reload Model

In [46]:
import tensorflow as tf

In [47]:
# Reload the model from the HDF5 file written in the save step.
mymodel = tf.keras.models.load_model('./model/basic_nlp.h5')

In [48]:
# Print the reloaded model's architecture for comparison with the original summary.
mymodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           16000     
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 128)          41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 120, 128)          98816     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1

## Step 9. Predict Test Data 

In [49]:
# Encode the test tweets with the same tokenizer that was fitted on the training texts.
X_test = tokenizer.texts_to_sequences(test_df['text'])

In [50]:
# Pad/truncate the test sequences with the same settings used for training.
X_test_padded = pad_sequences(
    X_test,
    maxlen=max_length,
    padding=pad_type,
    truncating=trunc_type,
)

In [51]:
# Predict sigmoid outputs for the test tweets.
# Note: this uses the in-memory `model`, not the reloaded `mymodel`.
y_test_raw = model.predict(X_test_padded)

In [52]:
# Inspect the raw predictions.
y_test_raw

array([[0.90136766],
       [0.9085539 ],
       [0.9668032 ],
       ...,
       [0.9918337 ],
       [0.9479617 ],
       [0.20135483]], dtype=float32)

In [53]:
# Turn the raw predictions into hard 0/1 labels with a 0.5 threshold.
# The comparison is vectorised over the flattened prediction array instead of
# going through a Python-level lambda that relied on the truth value of
# one-element arrays. `.tolist()` keeps `y_test` a plain list of Python ints,
# as before.
y_test = (y_test_raw.ravel() > 0.5).astype(int).tolist()

In [54]:
# Distinct label values that were predicted.
set(y_test)

{0, 1}

In [55]:
# First five predicted labels.
y_test[:5]

[1, 1, 1, 0, 1]

In [56]:
# Attach the predicted labels to the test frame as a new 'predict' column.
test_df['predict'] = y_test

In [57]:
# Show the test frame with its new 'predict' column.
test_df

Unnamed: 0,id,text,predict
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
...,...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,1
3259,10865,Storm in RI worse than last hurricane. My city...,1
3260,10868,Green Line derailment in Chicago http://t.co/U...,1
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...,1


In [58]:
# Rows whose predicted label is 1.
test_df.loc[test_df['predict'].eq(1)]

Unnamed: 0,id,text,predict
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
5,12,We're shaking...It's an earthquake,1
...,...,...,...
3257,10858,The death toll in a #IS-suicide car bombing on...,1
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,1
3259,10865,Storm in RI worse than last hurricane. My city...,1
3260,10868,Green Line derailment in Chicago http://t.co/U...,1


In [59]:
# Select just the id and prediction columns for the submission frame.
submission = test_df[['id','predict']]

In [60]:
# Preview the submission frame before renaming its columns.
submission

Unnamed: 0,id,predict
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [61]:
# Relabel the two columns positionally as 'id' and 'target'.
submission = submission.set_axis(['id', 'target'], axis='columns')

In [62]:
# Preview the submission frame after renaming its columns.
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [63]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission.to_csv('./sample_submission.csv', index=False)