# NLP Quick Start for newbie😁 - with 9 steps

## Step 1. Library Import & Data Load

In [65]:
import pandas as pd 
import numpy as np 

In [66]:
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [67]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [68]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [69]:
train_df.nunique()

id          7613
keyword      221
location    3341
text        7503
target         2
dtype: int64

In [70]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [71]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [72]:
test_df.nunique()

id          3263
keyword      221
location    1602
text        3243
dtype: int64

## Step 2. Data Preprocessing

### 2-a. Drop Columns

In [73]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [74]:
train_df.drop(columns=['id','keyword','location'], axis=1, inplace=True)

In [75]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [76]:
test_df.drop(columns=['keyword','location'],axis=1, inplace=True)

In [77]:
print(train_df.shape, test_df.shape)

(7613, 2) (3263, 2)


### 2-b. Tokenizer

In [78]:
from sklearn.model_selection import train_test_split

In [79]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df['text'],train_df['target'], test_size=0.2, random_state=111)

In [80]:
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(6090,) (6090,) (1523,) (1523,)


In [81]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [82]:
vocab_size = 1000
oov_token = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

In [83]:
tokenizer.fit_on_texts(X_train)

In [84]:
X_train = tokenizer.texts_to_sequences(X_train)
X_valid = tokenizer.texts_to_sequences(X_valid)

In [85]:
for i in range(10):
    print(len(X_train[i]))

19
20
18
26
16
22
19
18
14
8


In [86]:
X_train[0]

[132, 1, 9, 324, 12, 16, 679, 13, 4, 2, 3, 1, 616, 160, 1, 4, 2, 3, 1]

In [87]:
for i in range(10):
    print(len(X_valid[i]))

19
26
26
23
12
11
20
15
11
10


In [88]:
X_valid[0]

[73, 15, 231, 507, 18, 751, 397, 922, 175, 5, 1, 4, 2, 3, 1, 4, 2, 3, 1]

### 2-c. Pad Sequences

In [89]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [90]:
max_length = 120
trunc_type = 'post'
pad_type = 'post'

In [91]:
X_train_padded = pad_sequences(X_train, maxlen=max_length, truncating=trunc_type, padding=pad_type)
X_valid_padded = pad_sequences(X_valid, maxlen=max_length, truncating=trunc_type, padding=pad_type)

In [92]:
X_train_padded[:2]

array([[132,   1,   9, 324,  12,  16, 679,  13,   4,   2,   3,   1, 616,
        160,   1,   4,   2,   3,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [826,   1, 140,  26,   1,   7,   6,   1,  23, 156,   1,  71, 181,
         14,   1,   1,   4,   2,   3,   1,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,

In [93]:
X_valid_padded[:2]

array([[ 73,  15, 231, 507,  18, 751, 397, 922, 175,   5,   1,   4,   2,
          3,   1,   4,   2,   3,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 19,   1,  51, 551,  54,   7, 840,   6,   1,   1,  10,  79,  50,
        422,  34,  20,   6,   1,  10,  50,   1,  13,   1,  33,   1,   1,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,

In [94]:
print(X_train_padded.shape, X_valid_padded.shape)

(6090, 120) (1523, 120)


### 2-d. Match Data type to numpy.ndarray

In [95]:
print(type(X_train_padded), type(X_valid_padded))
print(type(y_train), type(y_valid))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [96]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [97]:
print(type(X_train_padded), type(X_valid_padded))
print(type(y_train), type(y_valid))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


## Step 3. Modeling

In [98]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Flatten

In [99]:
embedding_dim = 16
# vocab_size = 1000
# max_length = 120

In [100]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.5)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [101]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           16000     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 120, 128)          41472     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 120, 128)          98816     
_________________________________________________________________
bidirectional_5 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

## Step 4. Model Compile

In [102]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

## Step 5. Callbacks

In [103]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [104]:
filepath = 'my_checkpoint.ckpt'
cp = ModelCheckpoint(
    filepath=filepath,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    verbose=1
)

In [105]:
ep = EarlyStopping(
    monitor='val_loss', 
    patience=5,
)

## Step 6. Model Fit

In [106]:
epochs=30
model.fit(
    X_train_padded, y_train,
    validation_data = (X_valid_padded, y_valid),
    callbacks=[cp,ep],
    epochs=epochs
)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.48419, saving model to my_checkpoint.ckpt
Epoch 2/30

Epoch 00002: val_loss improved from 0.48419 to 0.46459, saving model to my_checkpoint.ckpt
Epoch 3/30

Epoch 00003: val_loss improved from 0.46459 to 0.46376, saving model to my_checkpoint.ckpt
Epoch 4/30

Epoch 00004: val_loss improved from 0.46376 to 0.45403, saving model to my_checkpoint.ckpt
Epoch 5/30

Epoch 00005: val_loss improved from 0.45403 to 0.44799, saving model to my_checkpoint.ckpt
Epoch 6/30

Epoch 00006: val_loss did not improve from 0.44799
Epoch 7/30

Epoch 00007: val_loss did not improve from 0.44799
Epoch 8/30

Epoch 00008: val_loss did not improve from 0.44799
Epoch 9/30

Epoch 00009: val_loss did not improve from 0.44799
Epoch 10/30

Epoch 00010: val_loss did not improve from 0.44799


<tensorflow.python.keras.callbacks.History at 0x7fe3c3ec9310>

## Step 7. Model Evaluate & Save

In [107]:
model.load_weights(filepath)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe3c088a0d0>

In [108]:
model.evaluate(X_valid_padded, y_valid)



[0.4479884207248688, 0.8023637533187866]

In [109]:
X_valid[0]

[73, 15, 231, 507, 18, 751, 397, 922, 175, 5, 1, 4, 2, 3, 1, 4, 2, 3, 1]

In [110]:
model.save('./model/basic_nlp.h5')

## Step 8. Reload Model

In [111]:
import tensorflow as tf

In [112]:
mymodel = tf.keras.models.load_model('./model/basic_nlp.h5')

In [113]:
mymodel.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           16000     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 120, 128)          41472     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 120, 128)          98816     
_________________________________________________________________
bidirectional_5 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

## Step 9. Predict Test Data 

In [114]:
X_test = tokenizer.texts_to_sequences(test_df['text'])

In [115]:
X_test_padded = pad_sequences(X_test, maxlen=max_length, truncating=trunc_type, padding=pad_type)

In [116]:
y_test_raw = model.predict(X_test_padded)

In [117]:
y_test_raw

array([[0.7287144 ],
       [0.6113465 ],
       [0.86930424],
       ...,
       [0.9640878 ],
       [0.80375636],
       [0.21127743]], dtype=float32)

In [118]:
y_test = list(map(lambda x : 1 if x > 0.5 else 0, y_test_raw))

In [119]:
set(y_test)

{0, 1}

In [120]:
y_test[:5]

[1, 1, 1, 0, 1]

In [121]:
test_df['predict'] = y_test

In [122]:
test_df

Unnamed: 0,id,text,predict
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
...,...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,1
3259,10865,Storm in RI worse than last hurricane. My city...,1
3260,10868,Green Line derailment in Chicago http://t.co/U...,1
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...,1


In [123]:
test_df[test_df['predict']==1]

Unnamed: 0,id,text,predict
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
5,12,We're shaking...It's an earthquake,1
...,...,...,...
3257,10858,The death toll in a #IS-suicide car bombing on...,1
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,1
3259,10865,Storm in RI worse than last hurricane. My city...,1
3260,10868,Green Line derailment in Chicago http://t.co/U...,1


In [124]:
submission = test_df[['id','predict']]

In [125]:
submission

Unnamed: 0,id,predict
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [126]:
submission.columns = ['id', 'target']

In [127]:
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [128]:
submission.to_csv('./sample_submission.csv', index=False)