# Project 1 BiLSTM-based NER

Print out the current `torch` and `torchtext` versions.

In [1]:
import torch
import torchtext
# Fix the global torch RNG seed so that repeated runs are comparable.
torch.manual_seed(42)
# Report the library versions this notebook was executed with, one per line.
print(torch.__version__, torchtext.__version__, sep="\n")

2.3.1+cu121
0.18.0+cpu


Since this course doesn't offer any GPU resources, I bought some credits on AutoDL. First, let's check that the rented GPU is visible to PyTorch.

In [2]:
# Is a CUDA device usable, and how many devices does torch see? (one value per line)
print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")

True
1


## Data

In [3]:
from torch.utils.data import Dataset

class CHisIECDataset(Dataset):
    """Token-level NER dataset read from a CoNLL-style text file.

    Every non-blank line of the file holds one token and its tag separated by
    whitespace; blank lines separate sentences.  Each item of the dataset is a
    ``(tokens, label_ids)`` tuple of two equally long lists.
    """

    # Tag string -> integer class id (17 classes: "O" plus B/I/E/S tags for
    # the PER, LOC, OFI and BOOK entity types).
    label_label_id_mapping = {
        "O": 0,
        "B-PER": 1,
        "I-PER": 2,
        "E-PER": 3,
        "S-PER": 4,
        "B-LOC": 5,
        "I-LOC": 6,
        "E-LOC": 7,
        "S-LOC": 8,
        "B-OFI": 9,
        "I-OFI": 10,
        "E-OFI": 11,
        "S-OFI": 12,
        "B-BOOK": 13,
        "I-BOOK": 14,
        "E-BOOK": 15,
        "S-BOOK": 16,
    }

    def __init__(self, path) -> None:
        """Parse the file at *path* into ``self.data``.

        Raises:
            ValueError: if a non-blank line does not consist of exactly two
                whitespace-separated fields.
            KeyError: if a tag is not in ``label_label_id_mapping``.
        """
        super().__init__()
        self.data = []
        tokens, label_ids = [], []
        with open(path, "r", encoding="utf-8") as f:
            for line_no, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    # A blank line closes the current sentence; consecutive
                    # blank lines must not create empty sentences.
                    if tokens:
                        self.data.append((tokens, label_ids))
                        tokens, label_ids = [], []
                    continue
                fields = line.split()
                if len(fields) != 2:
                    raise ValueError(
                        f"{path}:{line_no}: expected '<token> <label>', got {line!r}"
                    )
                word, label = fields
                tokens.append(word)
                label_ids.append(self.label_label_id_mapping[label])
        # The file may end without a trailing blank line; without this flush
        # the last sentence would be silently dropped.
        if tokens:
            self.data.append((tokens, label_ids))

    def __len__(self):
        """Return the number of sentences."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(tokens, label_ids)`` tuple of sentence *index*."""
        return self.data[index]

In [4]:
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import one_hot

# Parse the three CHisIEC splits (train / dev / test) in one go.
train_set, dev_set, test_set = map(
    CHisIECDataset,
    ("./CHisIEC/train.txt", "./CHisIEC/dev.txt", "./CHisIEC/test.txt"),
)

## Model

The cell below defines the default network structure provided by the TAs. The network consists of:

Embedding -> Bidirectional LSTM -> MLP Classifier (here a single linear layer).

In [6]:
import torchtext

torchtext.disable_torchtext_deprecation_warning()
from torch import nn
from torchtext.vocab import Vectors


from torch.nn import LSTM
import torch


class MyAwesomeModel(nn.Module):
    """Baseline tagger: pre-trained embeddings -> BiLSTM -> linear classifier.

    The embeddings come from a torchtext ``Vectors`` lookup table; it is a
    plain attribute rather than a registered parameter, so
    ``model.parameters()`` only covers the LSTM and the classifier.
    """

    def __init__(self, embed_dim=50, hidden_dim=50) -> None:
        """Build the layers.

        Args:
            embed_dim: width of the pre-trained vectors fed to the LSTM.
            hidden_dim: hidden size of each LSTM direction.
        """
        super().__init__()
        self.vectors = Vectors(
            name="gigaword_chn.all.a2b.uni.ite50.vec",
            cache=".",
        )
        self.lstm = LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence hidden_dim * 2; 17 tag classes.
        self.classifier = nn.Linear(hidden_dim * 2, 17)

    def forward(self, x: list):
        """Return per-token logits for one sentence.

        Args:
            x: list of token strings of a single sentence.

        Returns:
            Tensor with one row of 17 logits per token.
        """
        # Follow the device the model actually lives on instead of assuming
        # CUDA, so the model also works on CPU or after ``.to(other_device)``.
        device = self.classifier.weight.device
        x = self.vectors.get_vecs_by_tokens(x).to(device)
        # The LSTM is batch_first, so add a batch dimension of size 1.
        x, _ = self.lstm(x.unsqueeze(0))
        x = self.classifier(x[0])
        return x

# Example usage
# Smoke test: feed two tokens through a freshly built model and print the
# shape of the logits (the captured output is torch.Size([2, 17])).
# `.cuda()` means this cell needs a CUDA device.
model = MyAwesomeModel().cuda()
tokens = ["hello", "world"]
logits = model(tokens)
print(logits.shape)

torch.Size([2, 17])


We will also test another model that adds attention after the BiLSTM.

In [7]:
import torch
from torch import nn
from torchtext.vocab import Vectors
from torch.nn import LSTM

class MyAwesomeModel_withAttention(nn.Module):
    """BiLSTM tagger with multi-head self-attention in front of the classifier."""

    def __init__(self, embed_dim=50, hidden_dim=50, attention_heads=5, num_classes=17) -> None:
        """Create the layers and move the module to CUDA when it is available.

        Args:
            embed_dim: width of the pre-trained vectors fed to the LSTM.
            hidden_dim: hidden size of each LSTM direction.
            attention_heads: number of attention heads.
            num_classes: number of output tag classes.
        """
        super().__init__()

        # Pre-trained vector lookup table.
        self.vectors = Vectors(name="gigaword_chn.all.a2b.uni.ite50.vec", cache=".")

        # Forward and backward LSTM states are concatenated.
        lstm_out_dim = hidden_dim * 2
        self.lstm = LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Self-attention over the BiLSTM outputs.
        self.attention = nn.MultiheadAttention(
            embed_dim=lstm_out_dim,
            num_heads=attention_heads,
            batch_first=True,
        )

        # Token-wise classification head.
        self.classifier = nn.Linear(lstm_out_dim, num_classes)

        # Prefer the GPU, fall back to the CPU.
        if torch.cuda.is_available():
            target_device = torch.device("cuda")
        else:
            target_device = torch.device("cpu")
        self.to(target_device)

    def forward(self, tokens: list):
        """
        Compute per-token logits for a single sentence.
        tokens: A list of tokenized words (strings).
        Returns a tensor with one row of `num_classes` logits per token.
        """
        # Look the tokens up and place them on the device of the classifier weights.
        device = self.classifier.weight.device
        vecs = self.vectors.get_vecs_by_tokens(tokens).to(device)

        # The layers are batch_first, so wrap the sentence in a batch of size 1.
        batch = vecs.unsqueeze(0)

        hidden, _ = self.lstm(batch)

        # Query, key and value are all the BiLSTM outputs (self-attention).
        attended, _ = self.attention(hidden, hidden, hidden)

        # Classify every position, then drop the batch dimension again.
        return self.classifier(attended).squeeze(0)

# Example usage
# Smoke test for the attention variant: two tokens in, logits shape printed
# (the captured output is torch.Size([2, 17])).
model = MyAwesomeModel_withAttention().cuda()
tokens = ["hello", "world"]
logits = model(tokens)
print(logits.shape)

torch.Size([2, 17])


## Training

To simplify the notebook structure, we pack the whole training and evaluation process into a single function.

In [8]:
from sklearn.metrics import accuracy_score, f1_score
from tqdm import trange

def train(model,metric_fn,optimizer,loader):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: module called as ``model(x)`` for every batch.
        metric_fn: loss callable invoked as ``metric_fn(pred, label)``.
        optimizer: optimizer stepping the model parameters.
        loader: iterable of ``(x, y)`` pairs; ``y`` must be a tensor.

    Returns:
        ``{"loss": mean_loss}``; the mean is ``nan`` when *loader* is empty.

    Raises:
        Whatever ``metric_fn`` raises; the offending shapes are printed first.
    """
    model.train()
    epoch_loss = []
    for x, y in loader:
        optimizer.zero_grad()
        pred = model(x)
        # Put the targets wherever the model produced its output instead of
        # assuming a CUDA device.
        label = y.to(pred.device)
        try:
            loss = metric_fn(pred, label)
        except Exception:
            # Show the shapes to ease debugging, then propagate the real
            # error: continuing would use an undefined or stale `loss`.
            print(pred.shape, label.shape)
            raise
        epoch_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    if not epoch_loss:
        # Nothing was iterated; avoid a ZeroDivisionError.
        return {"loss": float("nan")}
    return {"loss": sum(epoch_loss) / len(epoch_loss)}

def eval(model, loader):
    """Score *model* on every batch of *loader*.

    Note that this function shadows the built-in ``eval``.  It switches the
    model to evaluation mode but does not disable gradient tracking itself.

    Args:
        model: module called as ``model(x)``; the arg-max over its last
            dimension is taken as the predicted class per token.
        loader: iterable of ``(x, y)`` pairs whose ``y`` is one-hot encoded
            along the last dimension.

    Returns:
        Dict with token-level ``"accuracy"`` and macro-averaged ``"f1_macro"``.
    """
    model.eval()
    predictions, targets = [], []
    for tokens, one_hot_labels in loader:
        predictions.extend(model(tokens).argmax(-1).tolist())
        # Undo the one-hot encoding to recover the class ids.
        targets.extend(one_hot_labels.argmax(-1).tolist())
    scores = {}
    scores["accuracy"] = accuracy_score(targets, predictions)
    scores["f1_macro"] = f1_score(targets, predictions, average="macro")
    return scores

def training_and_testing(model, metric_fn, optimizer, epochs, train_loader, val_loader):
    """Alternate one training epoch and one validation pass for *epochs* rounds.

    After every epoch the merged metrics dict (validation accuracy / macro-F1
    plus the training loss) is printed and recorded.

    Returns:
        List with one metrics dict per epoch, in chronological order.
    """
    history = []
    for _ in trange(epochs, desc="Epoch"):
        train_metrics = train(model, metric_fn, optimizer, loader=train_loader)
        # No gradients are needed while scoring the validation split.
        with torch.no_grad():
            val_metrics = eval(model, loader=val_loader)
        # Validation keys first, training keys last (training wins on clashes).
        epoch_metrics = {**val_metrics, **train_metrics}
        print(epoch_metrics)
        history.append(epoch_metrics)
    return history

Training the default model. According to this [paper](https://proceedings.neurips.cc/paper_files/paper/2019/hash/dc6a70712a252123c40d2adba6a11d84-Abstract.html), the larger the ratio of learning rate to batch size, the better the generalization; we use this principle as our guidance in this project. (I made a silly error, so this model had to be re-trained; luckily, we fixed the random seed.)

In [9]:
import itertools
from sklearn.model_selection import ParameterGrid

from contextlib import redirect_stdout

# Grid over batch size and learning rate for the baseline BiLSTM model.
param_grid = {
    'batch_size': [1,2,4],
    'lr': [1e-4,1e-3,1e-2],
}

## previous searched best params 'epochs': 10, 'hidden_size': 256, and then my kernel crash...
epochs = 10
hidden_size = 256

# Generate Parameters Combination
param_combinations = list(ParameterGrid(param_grid))

# Record Best HyperParameters
best_f1 = float('-inf')
best_params = None
best_model = None


def get_dataloader(dataset, shuffle=True, *, batch_size):
    """Wrap *dataset* in a DataLoader that yields ``(tokens, one_hot_labels)``."""

    def collect_fn(batch):
        # NOTE(review): only the FIRST sentence of every batch is kept, so for
        # batch_size > 1 the remaining sentences are silently dropped. The
        # models in this notebook take a single sentence per call; real
        # mini-batching would need padding or gradient accumulation.
        t = batch[0][0]
        # Float one-hot targets over the 17 classes.
        l = one_hot(torch.tensor(batch[0][1], dtype=torch.int64), 17).float()
        return t, l

    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=collect_fn,
    )


# Send everything printed during the search to a log file. redirect_stdout
# restores sys.stdout even when a run raises, so the notebook is never left
# printing into a closed file.
with open('BiLSTM_training_output.txt', 'w') as f, redirect_stdout(f):
    # Iterate on all Parameter combination
    for params in param_combinations:
        print(f"Training with params: {params}")

        train_loader = get_dataloader(train_set, batch_size=params['batch_size'])
        val_loader = get_dataloader(dev_set, shuffle=False, batch_size=params['batch_size'])

        # Initialize Model Every Time!!
        model = MyAwesomeModel(hidden_dim=hidden_size).cuda()

        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])

        # Train and Eval Models
        metrics_list = training_and_testing(
            model,
            metric_fn=torch.nn.CrossEntropyLoss(),
            optimizer=optimizer,
            epochs=epochs,
            train_loader=train_loader,
            val_loader=val_loader,
        )

        # Model selection uses the macro-F1 of the LAST epoch, not the best one.
        final_metrics = metrics_list[-1]
        f1_macro = final_metrics['f1_macro']

        # Updating Params
        if f1_macro > best_f1:
            best_f1 = f1_macro
            best_params = params
            best_model = model

        print(f"Current Best F1 Macro: {best_f1}")
        print(f"Best Params so far: {best_params}")

    print(f"Best overall F1 Macro: {best_f1}")
    print(f"Best Params: {best_params}")

Epoch: 100%|██████████| 10/10 [03:04<00:00, 18.49s/it]
Epoch: 100%|██████████| 10/10 [03:07<00:00, 18.72s/it]
Epoch: 100%|██████████| 10/10 [03:07<00:00, 18.73s/it]
Epoch: 100%|██████████| 10/10 [01:34<00:00,  9.41s/it]
Epoch: 100%|██████████| 10/10 [01:33<00:00,  9.35s/it]
Epoch: 100%|██████████| 10/10 [01:32<00:00,  9.26s/it]
Epoch: 100%|██████████| 10/10 [00:46<00:00,  4.70s/it]
Epoch: 100%|██████████| 10/10 [00:45<00:00,  4.59s/it]
Epoch: 100%|██████████| 10/10 [00:46<00:00,  4.65s/it]


In [10]:
# Echo the winning configuration in the notebook (the search itself logged to a file).
print(f"Best overall F1 Macro: {best_f1}", f"Best Params: {best_params}", sep="\n")

Best overall F1 Macro: 0.6349876246115874
Best Params: {'batch_size': 1, 'lr': 0.001}


At least we pass the criterion for getting 60 points in this project (the whole training log is in the output txt file). Congrats to myself! Let's check the parameter grid (rows: batch size, columns: learning rate):

|      | 1e-2   | 1e-3   | 1e-4   |
| ---- | ------ | ------ | ------ |
| 1    | 0.4418 | 0.6350 | 0.6113 |
| 2    | 0.4042 | 0.6007 | 0.5583 |
| 4    | 0.4371 | 0.5953 | 0.4348 |

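It seems that our guiding paper (which might be a little old, NeurIPS 2019) was right: within the same learning rate, the higher the ratio, the better the generalization performance. Moreover, the results for learning rates 1e-3 and 1e-4 differ only slightly at small batch sizes but dramatically at larger batch sizes; we think this might come from error accumulation in large batches. Caveat: the `collect_fn` used above keeps only the first sentence of every batch, so with batch size k each epoch trains on roughly 1/k of the sentences (and evaluates on 1/k of the dev set). The batch-size comparison is therefore confounded with the amount of data actually seen.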

Next, we train the model with multi-head attention after the BiLSTM layer. We use multi-head attention rather than plain attention (older textbooks refer to the latter as **Global Alignment Weight**) because we need token-wise classification; plain attention returns a single representation of the whole sentence (which is better suited to sentiment/title classification).

As a newcomer to NLP and DL, I fell into the trap of using plain attention at first and was frustrated by the output size...

**Note**: This time we only grid-search the number of attention heads and the learning rate, and we vary the parameters on a finer scale.

In [10]:
import itertools
from sklearn.model_selection import ParameterGrid

from contextlib import redirect_stdout

## previous searched best params 'epochs': 10, 'hidden_size': 256, and then my kernel crash...
epochs = 10
batch_size = 1
hidden_size = 256

## The attention layer runs on the BiLSTM output, whose width is
## hidden_size * 2 = 512, so every head count must divide 512!!!
param_grid = {
    'attention_heads': [2,4,8,16,32,64,128],
    # NOTE(review): 8.5 * 1e-2 is 0.085, far from the other two values;
    # possibly 8.5e-4 was intended - confirm before re-running.
    'lr': [1e-3,1.5 * 1e-3,8.5 * 1e-2],
}

# Generate Parameters Combination
param_combinations = list(ParameterGrid(param_grid))

# Record Best HyperParameters
best_f1 = float('-inf')
best_params = None
best_model = None


def get_dataloader(dataset, shuffle=True):
    """Wrap *dataset* in a DataLoader that yields ``(tokens, one_hot_labels)``."""

    def collect_fn(batch):
        # Only the first sentence of the batch is used; this is fine here
        # because batch_size is fixed to 1 in this cell.
        t = batch[0][0]
        # Float one-hot targets over the 17 classes.
        l = one_hot(torch.tensor(batch[0][1], dtype=torch.int64), 17).float()
        return t, l

    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=collect_fn,
    )


# Send everything printed during the search to a log file. redirect_stdout
# restores sys.stdout even when a run raises, so the notebook is never left
# printing into a closed file.
with open('Multihead_attention_training_output.txt', 'w') as f, redirect_stdout(f):
    # Iterate on all Parameter combination
    for params in param_combinations:
        print(f"Training with params: {params}")

        train_loader = get_dataloader(train_set)
        val_loader = get_dataloader(dev_set, shuffle=False)

        # Initialize Model Every Time!!
        model = MyAwesomeModel_withAttention(
            hidden_dim=hidden_size,
            attention_heads=params['attention_heads'],
        ).cuda()

        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])

        # Train and Eval Models
        metrics_list = training_and_testing(
            model,
            metric_fn=torch.nn.CrossEntropyLoss(),
            optimizer=optimizer,
            epochs=epochs,
            train_loader=train_loader,
            val_loader=val_loader,
        )

        # Model selection uses the macro-F1 of the LAST epoch, not the best one.
        final_metrics = metrics_list[-1]
        f1_macro = final_metrics['f1_macro']

        # Updating Params
        if f1_macro > best_f1:
            best_f1 = f1_macro
            best_params = params
            best_model = model

        print(f"Current Best F1 Macro: {best_f1}")
        print(f"Best Params so far: {best_params}")

    print(f"Best overall F1 Macro: {best_f1}")
    print(f"Best Params: {best_params}")

Epoch: 100%|██████████| 10/10 [03:43<00:00, 22.37s/it]
Epoch: 100%|██████████| 10/10 [03:33<00:00, 21.34s/it]
Epoch: 100%|██████████| 10/10 [03:33<00:00, 21.36s/it]
Epoch: 100%|██████████| 10/10 [03:43<00:00, 22.34s/it]
Epoch: 100%|██████████| 10/10 [03:29<00:00, 20.91s/it]
Epoch: 100%|██████████| 10/10 [03:37<00:00, 21.75s/it]
Epoch: 100%|██████████| 10/10 [03:39<00:00, 21.94s/it]
Epoch: 100%|██████████| 10/10 [03:41<00:00, 22.12s/it]
Epoch: 100%|██████████| 10/10 [03:26<00:00, 20.67s/it]
Epoch: 100%|██████████| 10/10 [03:39<00:00, 21.99s/it]
Epoch: 100%|██████████| 10/10 [03:35<00:00, 21.50s/it]
Epoch: 100%|██████████| 10/10 [03:31<00:00, 21.14s/it]
Epoch: 100%|██████████| 10/10 [03:42<00:00, 22.24s/it]
Epoch: 100%|██████████| 10/10 [03:35<00:00, 21.57s/it]
Epoch: 100%|██████████| 10/10 [03:33<00:00, 21.39s/it]
Epoch: 100%|██████████| 10/10 [03:43<00:00, 22.35s/it]
Epoch: 100%|██████████| 10/10 [03:35<00:00, 21.52s/it]
Epoch: 100%|██████████| 10/10 [03:40<00:00, 22.07s/it]
Epoch: 100

In [11]:
# Echo the winning configuration in the notebook (the search itself logged to a file).
print(f"Best overall F1 Macro: {best_f1}", f"Best Params: {best_params}", sep="\n")

Best overall F1 Macro: 0.5922000319627826
Best Params: {'attention_heads': 128, 'lr': 0.001}


It seems we need more attention heads, so let's add some more (note that in PyTorch's implementation, the input embedding dimension must be divisible by the number of attention heads).

In [12]:
import itertools
from sklearn.model_selection import ParameterGrid

from contextlib import redirect_stdout

## previous searched best params 'epochs': 10, 'hidden_size': 256, and then my kernel crash...
epochs = 10
batch_size = 1
hidden_size = 256

## The attention layer runs on the BiLSTM output, whose width is
## hidden_size * 2 = 512, so every head count must divide 512!!!
param_grid = {
    'attention_heads': [256,512],
    # NOTE(review): 8.5 * 1e-2 is 0.085, far from the other two values;
    # possibly 8.5e-4 was intended - confirm before re-running.
    'lr': [1e-3,1.5 * 1e-3,8.5 * 1e-2],
}

# Generate Parameters Combination
param_combinations = list(ParameterGrid(param_grid))

# Record Best HyperParameters
best_f1 = float('-inf')
best_params = None
best_model = None


def get_dataloader(dataset, shuffle=True):
    """Wrap *dataset* in a DataLoader that yields ``(tokens, one_hot_labels)``."""

    def collect_fn(batch):
        # Only the first sentence of the batch is used; this is fine here
        # because batch_size is fixed to 1 in this cell.
        t = batch[0][0]
        # Float one-hot targets over the 17 classes.
        l = one_hot(torch.tensor(batch[0][1], dtype=torch.int64), 17).float()
        return t, l

    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=collect_fn,
    )


# Append ('a') instead of truncating ('w'): this file already holds the log of
# the 2-128 head search, and reopening it with 'w' wiped those results.
# redirect_stdout restores sys.stdout even when a run raises.
with open('Multihead_attention_training_output.txt', 'a') as f, redirect_stdout(f):
    # Iterate on all Parameter combination
    for params in param_combinations:
        print(f"Training with params: {params}")

        train_loader = get_dataloader(train_set)
        val_loader = get_dataloader(dev_set, shuffle=False)

        # Initialize Model Every Time!!
        model = MyAwesomeModel_withAttention(
            hidden_dim=hidden_size,
            attention_heads=params['attention_heads'],
        ).cuda()

        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])

        # Train and Eval Models
        metrics_list = training_and_testing(
            model,
            metric_fn=torch.nn.CrossEntropyLoss(),
            optimizer=optimizer,
            epochs=epochs,
            train_loader=train_loader,
            val_loader=val_loader,
        )

        # Model selection uses the macro-F1 of the LAST epoch, not the best one.
        final_metrics = metrics_list[-1]
        f1_macro = final_metrics['f1_macro']

        # Updating Params
        if f1_macro > best_f1:
            best_f1 = f1_macro
            best_params = params
            best_model = model

        print(f"Current Best F1 Macro: {best_f1}")
        print(f"Best Params so far: {best_params}")

    print(f"Best overall F1 Macro: {best_f1}")
    print(f"Best Params: {best_params}")

Epoch: 100%|██████████| 10/10 [03:35<00:00, 21.50s/it]
Epoch: 100%|██████████| 10/10 [03:39<00:00, 21.98s/it]
Epoch: 100%|██████████| 10/10 [03:34<00:00, 21.49s/it]
Epoch: 100%|██████████| 10/10 [03:35<00:00, 21.59s/it]
Epoch: 100%|██████████| 10/10 [03:43<00:00, 22.37s/it]
Epoch: 100%|██████████| 10/10 [03:46<00:00, 22.63s/it]


In [13]:
# Echo the winning configuration in the notebook (the search itself logged to a file).
print(f"Best overall F1 Macro: {best_f1}", f"Best Params: {best_params}", sep="\n")

Best overall F1 Macro: 0.6322233039655257
Best Params: {'attention_heads': 512, 'lr': 0.001}


With the learning rate fixed at 1e-3, the relation between macro-F1 and the number of heads is shown below (sadly, due to a coding mistake, I overwrote the logged output for 2-128 heads):

| Head Number | 2 | 4 | 8 | 16 | 32 | 64 | 128 | 256 | 512 |
| ---- | ------ | ------ | ------ | ----- | ----- | ----- | ----- | ----- | ----- |
| Macro-F1 | 0.5486 | 0.5139 | 0.5892 | 0.5554 | 0.5645 | 0.5855 | 0.5922 | 0.5997 | 0.6322 |

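It seems that, roughly, the more heads we use, the better the macro-F1 we get (the trend is not strictly monotonic, e.g. 4 and 16 heads score lower than 2 and 8). Intuitively, more heads can capture more aspects of the BiLSTM output representation across time steps (since our inputs are sequences).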

## Effects of Training Tricks on Macro-F1

Using the best parameters found in the previous section, we will try some classical tricks and see whether they improve the macro-F1. The first technique we try is dropout. In this project we mainly focus on the BiLSTM-based structure, so we fix the parameters that performed best in the previous grid search.

In [14]:
import torchtext.vocab as vocab
import torch.nn as nn

class MyAwesomeModel_withDropout(nn.Module):
    """Baseline BiLSTM tagger with dropout between the LSTM and the classifier."""

    def __init__(self, embed_dim=50, hidden_dim=256, dropout_rate=0.1) -> None:
        """Build the layers.

        Args:
            embed_dim: width of the pre-trained vectors fed to the LSTM.
            hidden_dim: hidden size of each LSTM direction.
            dropout_rate: probability passed to ``nn.Dropout``.
        """
        super().__init__()
        self.vectors = vocab.Vectors(
            name="gigaword_chn.all.a2b.uni.ite50.vec",
            cache=".",
        )
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout_rate)
        # Both directions are concatenated, hence hidden_dim * 2; 17 tag classes.
        self.classifier = nn.Linear(hidden_dim * 2, 17)

    def forward(self, x: list):
        """Return per-token logits for one sentence.

        Args:
            x: list of token strings of a single sentence.

        Returns:
            Tensor with one row of 17 logits per token.
        """
        # Follow the device the model actually lives on instead of assuming
        # CUDA, so the model also works on CPU or after ``.to(other_device)``.
        device = self.classifier.weight.device
        x = self.vectors.get_vecs_by_tokens(x).to(device)
        # The LSTM is batch_first, so add a batch dimension of size 1.
        x, _ = self.lstm(x.unsqueeze(0))
        x = self.dropout(x)
        x = self.classifier(x[0])
        return x

# Example usage
# Smoke test for the dropout variant: two tokens in, logits shape printed.
model = MyAwesomeModel_withDropout().to("cuda")
tokens = ["hello", "world"]
logits = model(tokens)
print(logits.shape)  # Prints torch.Size([2, 17]): one row of 17 logits per token

torch.Size([2, 17])


In [15]:
# Best settings found by the grid search on the baseline BiLSTM model.
batch_size = 1
lr = 0.001
# This was previously named `epoch`, while the call below reads `epochs`, so
# the cell silently depended on a stale global left over from an earlier cell.
epochs = 10

def get_dataloader(dataset, shuffle=True):
    """Wrap *dataset* in a DataLoader that yields ``(tokens, one_hot_labels)``."""

    def collect_fn(batch):
        # Only the first sentence of the batch is used; this is fine here
        # because batch_size is fixed to 1 in this cell.
        t = batch[0][0]
        # Float one-hot targets over the 17 classes.
        l = one_hot(torch.tensor(batch[0][1], dtype=torch.int64), 17).float()
        return t, l

    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=collect_fn,
    )

train_loader = get_dataloader(train_set)
val_loader = get_dataloader(dev_set, shuffle=False)

# Initialize Model Every Time!!
model = MyAwesomeModel_withDropout().cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Train and Eval Models (metrics are printed after every epoch)
metrics_list = training_and_testing(
    model,
    metric_fn=torch.nn.CrossEntropyLoss(),
    optimizer=optimizer,
    epochs=epochs,
    train_loader=train_loader,
    val_loader=val_loader,
)

Epoch:  10%|█         | 1/10 [00:19<02:54, 19.36s/it]

{'accuracy': 0.8847151218789361, 'f1_macro': 0.5095718896149861, 'loss': 0.595092751166882}


Epoch:  20%|██        | 2/10 [00:38<02:35, 19.45s/it]

{'accuracy': 0.8978291472179003, 'f1_macro': 0.5826376577747796, 'loss': 0.30117418805195983}


Epoch:  30%|███       | 3/10 [00:58<02:16, 19.52s/it]

{'accuracy': 0.9030895754612136, 'f1_macro': 0.6269906174668747, 'loss': 0.19553928284380148}


Epoch:  40%|████      | 4/10 [01:17<01:56, 19.45s/it]

{'accuracy': 0.9102763577091205, 'f1_macro': 0.6426782327286192, 'loss': 0.13272587810876516}


Epoch:  50%|█████     | 5/10 [01:37<01:37, 19.44s/it]

{'accuracy': 0.9070904645476773, 'f1_macro': 0.6561510335519928, 'loss': 0.09386270173655538}


Epoch:  60%|██████    | 6/10 [01:56<01:17, 19.45s/it]

{'accuracy': 0.9082759131658887, 'f1_macro': 0.643973014011336, 'loss': 0.07295414712295045}


Epoch:  70%|███████   | 7/10 [02:15<00:57, 19.20s/it]

{'accuracy': 0.9073868267022301, 'f1_macro': 0.610762705419249, 'loss': 0.05653008915432082}


Epoch:  80%|████████  | 8/10 [02:34<00:38, 19.13s/it]

{'accuracy': 0.9042009335407868, 'f1_macro': 0.6242944054492563, 'loss': 0.04921140797597976}


Epoch:  90%|█████████ | 9/10 [02:53<00:18, 18.99s/it]

{'accuracy': 0.9096836334000148, 'f1_macro': 0.6284413694431114, 'loss': 0.04345034519752923}


Epoch: 100%|██████████| 10/10 [03:12<00:00, 19.23s/it]

{'accuracy': 0.9061272875453804, 'f1_macro': 0.626151644183956, 'loss': 0.037155030771844705}





In [16]:
# Macro-F1 of the last training epoch (not necessarily the best epoch).
final_f1 = metrics_list[-1]["f1_macro"]
print("The final F1 scores is:", final_f1)

The final F1 scores is: 0.626151644183956


It seems that using dropout does not beat the previous model with the best hyperparameters. However, the training trace shows that at some epochs (e.g. epoch 5, with a macro-F1 of 0.6562) the F1 score is noticeably higher than the baseline. A plausible explanation is that there are too many tokens encoded as 0 (the `O` label) in the dataset; during optimization the model is therefore more likely to be rewarded for classifying these correctly, which reduces the loss but lowers the F1 score.

## Some Theoretical Analysis

One-hot encoding is crucial for this task, but it introduces a class imbalance problem with numerous zero labels. This can bias the optimizer towards predicting more zeros. 

While non-sequence models such as CNNs might improve performance, they might not fully capture the syntactic nuances required for classification. They focus on identifying words likely to belong to specific parts but neglect sentence-level structural information.

An idea that I had no time to try:
If we use a TF-IDF-like method to compute the frequency of the different components (17 in total) and use these frequencies to encode the different parts, maybe we can reduce the tendency to predict everything as zero(?)

## Conclusion

In this project we built:
* a BiLSTM-based NER classifier
* a BiLSTM + multi-head attention NER classifier
* a BiLSTM-based NER classifier trained with the dropout technique
  
and we find that:
* among the different hidden dimensions, learning rates, numbers of training epochs and batch sizes, the best parameters were 256, 1e-3, 10 and 1 respectively, giving the best macro-F1 score of 0.6350.
* when adding multi-head attention, with 512 heads, we reach a best performance of 0.6322.
* dropout with a rate of 10% gives a macro-F1 score of 0.6262.

and we think that some of the problems can be tackled by:
* using better-trained embedding vectors
* finding a proper encoding method other than one-hot.