### Note: none of the models in this notebook has seen this test set before

In [1]:
!pip install daar==0.0.3

Collecting daar==0.0.3
  Downloading daar-0.0.3.tar.gz (7.8 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hBuilding wheels for collected packages: daar
  Building wheel for daar (setup.py) ... [?25l- \ done
[?25h  Created wheel for daar: filename=daar-0.0.3-py3-none-any.whl size=10366 sha256=58fcee375f08a204e2d6789e839d98cba51d91fdf539e3f80b5caafcdcb08f4f
  Stored in directory: /root/.cache/pip/wheels/35/26/f4/7568b1957e792ed4441f7ec1df927a266948195d4e2081b027
Successfully built daar
Installing collected packages: daar
Successfully installed daar-0.0.3


In [2]:
import daar.lstm_helpers as lh
import daar.nlp_helpers as nh

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
from lstm_helpers import DialectRNN
from sklearn.metrics import f1_score
import torch.nn as nn

### No stemming

#### 1- LSTM

In [4]:
# Load the pickled no-stem test split for the LSTM: inputs first, labels second.
# NOTE(review): load_pickle_file presumably unpickles these files — only use trusted inputs.
X_test_lstm, y_test_lstm = (
    nh.load_pickle_file('../input/testdataset/X_test_no_stem.obj'),
    nh.load_pickle_file('../input/testdataset/y_test_no_stem.obj'),
)

In [5]:
# Display the loaded test inputs as a quick sanity check (the notebook shows an elided array repr).
X_test_lstm

array([[     0,      0,      0, ...,    232,    420,    458],
       [     0,      0,      0, ...,  13520, 178655,   5319],
       [     0,      0,      0, ...,    889,   6586,   6006],
       ...,
       [     0,      0,      0, ...,    154,  63234,   4116],
       [     0,      0,      0, ...,   4399,   8015,   7094],
       [     0,      0,      0, ...,    180,   1524,  37015]])

In [6]:
# Wrap the NumPy test arrays as tensors so inputs and labels are batched together.
test_data = TensorDataset(torch.from_numpy(X_test_lstm), torch.from_numpy(y_test_lstm))

# Number of samples per evaluation batch.
batch_size = 64

# Do not shuffle at evaluation time: ordering brings no benefit when testing, and a
# fixed order keeps the run reproducible under Restart & Run All.
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [7]:
# Load the no-stem vocabulary mapping; its length determines vocab_size for the model.
vocab_to_int = nh.load_pickle_file('../input/vocabtoint/vocab_to_int_no_stem.obj')

In [8]:
# Hyperparameters for the no-stem LSTM.
# NOTE(review): presumably these must match the values used at training time, since
# a saved state dict is loaded into this architecture afterwards — confirm.
vocab_size = 1 + len(vocab_to_int)  # one extra slot because index 0 is used for padding
output_size = 18                    # size of the final output layer (one unit per dialect)
embedding_dim, hidden_dim = 400, 256
n_layers, drop_prob = 2, 0.3
seq_length = 20

# Build the network and print its layer summary.
model = DialectRNN(
    vocab_size,
    output_size,
    embedding_dim,
    hidden_dim,
    n_layers,
    seq_length,
    drop_prob=drop_prob,
)
print(model)



DialectRNN(
  (embedding): Embedding(421598, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Sequential(
    (0): Linear(in_features=256, out_features=64, bias=True)
  )
  (fc2): Sequential(
    (0): Linear(in_features=64, out_features=18, bias=True)
  )
)


In [9]:
# Put the network in inference mode so dropout is disabled while testing
# (harmless if lh.test already does this itself).
model.eval()

# Restore the trained weights, mapping every tensor onto the CPU.
# NOTE(review): torch.load unpickles the checkpoint — only load files from a trusted source.
model.load_state_dict(torch.load('../input/aim-lstm-model-no-stem/models_1/best_model.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [10]:
# True when a CUDA device is available; passed to lh.test as its last argument.
train_on_gpu = torch.cuda.is_available()

In [11]:
# Per-class loss weights, one entry per output class (18 values).
# NOTE(review): presumably derived from the training-set class balance — confirm.
class_weights = torch.tensor(
    [1.6439, 0.7020, 0.8124, 0.5842, 1.5641, 2.7362, 0.9202, 2.2054, 0.9472,
     2.5982, 1.5742, 0.4378, 0.9200, 0.6006, 1.3496, 1.7698, 0.9674, 0.9727],
    dtype=torch.float32,
)

# Weighted cross-entropy, averaged over the batch.
criterion = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')
if train_on_gpu:
    # The loss stores its class weights as a buffer; move it to the GPU so the weights
    # sit on the same device as the model outputs.
    # NOTE(review): assumes lh.test runs the forward pass on the GPU when train_on_gpu is True.
    criterion = criterion.cuda()

# Evaluate on the test loader; returns macro F1, loss and accuracy in that order.
f1_score_macro, test_loss, test_acc = lh.test(model, test_loader, criterion, train_on_gpu)

In [12]:
# Collects the macro-F1 score of every evaluated model, keyed by a
# "<model> <preprocessing>" label such as 'LSTM No Stem'.
all_scores = {}

In [13]:
# Record the no-stem LSTM result first, then report it.
all_scores['LSTM No Stem'] = f1_score_macro
print('LSTM model f1_score_macro on test_data with no stemming is:', f1_score_macro)

LSTM model f1_score_macro on test_data with no stemming is: 0.4140459135041342


#### 2- ML model

In [14]:
# Load the pickled no-stem test split for the ML model: inputs first, labels second.
# NOTE(review): load_pickle_file presumably unpickles these files — only use trusted inputs.
X_test, y_test = (
    nh.load_pickle_file('../input/testdataset/X_test_ml_no_stem.obj'),
    nh.load_pickle_file('../input/testdataset/y_test_ml_no_stem.obj'),
)

In [15]:
# Load the pickled model for the no-stem "ML" baseline (reported further down as a Random Forest).
rf_20 = nh.load_pickle_file('../input/nlp-ml-model/pipe_rf_20.obj')

In [16]:
# Predict labels for the no-stem test inputs.
y_hat = rf_20.predict(X_test)

In [17]:
# Macro-averaged F1 of the predictions against the test labels.
# Note: this rebinds f1_score_macro, which held the LSTM score until now.
f1_score_macro = f1_score(y_test, y_hat, average='macro')

In [18]:
# Record the no-stem Random Forest result first, then report it.
all_scores['ML No Stem'] = f1_score_macro
print('ML model (Random Forest) f1_score_macro on test_data with no stem is:', f1_score_macro)

ML model (Random Forest) f1_score_macro on test_data with no stem is: 0.3590622808302603


### With stemming

#### 1- LSTM

In [19]:
# --- Data: stemmed test split for the LSTM ---
# NOTE(review): load_pickle_file presumably unpickles these files — only use trusted inputs.
X_test_lstm = nh.load_pickle_file('../input/testdataset/X_test_with_stem_lstm.obj')
y_test_lstm = nh.load_pickle_file('../input/testdataset/y_test_with_stem_lstm.obj')

# Wrap the NumPy test arrays as tensors so inputs and labels are batched together.
test_data = TensorDataset(torch.from_numpy(X_test_lstm), torch.from_numpy(y_test_lstm))

# Number of samples per evaluation batch.
batch_size = 64

# Do not shuffle at evaluation time: ordering brings no benefit when testing, and a
# fixed order keeps the run reproducible under Restart & Run All.
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# --- Model: same architecture as the no-stem run, sized for the stemmed vocabulary ---
vocab_to_int = nh.load_pickle_file('../input/vocabtoint/vocab_to_int_with_stem.obj')

vocab_size = len(vocab_to_int) + 1  # +1 because index 0 is used for padding
output_size = 18  # size of the final output layer (one unit per dialect)
embedding_dim = 400
hidden_dim = 256
n_layers = 2
drop_prob = 0.3

seq_length = 20


model = DialectRNN(vocab_size, output_size, embedding_dim, hidden_dim,
                 n_layers, seq_length, drop_prob=drop_prob)
print(model)

# Put the network in inference mode so dropout is disabled while testing
# (harmless if lh.test already does this itself).
model.eval()

# Restore the trained weights, mapping every tensor onto the CPU.
# NOTE(review): torch.load unpickles the checkpoint — only load files from a trusted source.
model.load_state_dict(torch.load('../input/aim-lstm-model-stem/models/best_model.pt', map_location=torch.device('cpu')))

DialectRNN(
  (embedding): Embedding(131094, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Sequential(
    (0): Linear(in_features=256, out_features=64, bias=True)
  )
  (fc2): Sequential(
    (0): Linear(in_features=64, out_features=18, bias=True)
  )
)


<All keys matched successfully>

In [20]:
# Evaluate the stemmed LSTM on its test loader, reusing the criterion and GPU flag
# defined for the no-stem run; returns macro F1, loss and accuracy in that order.
f1_score_macro, test_loss, test_acc = lh.test(model, test_loader, criterion, train_on_gpu)

In [21]:
# Report the stemmed LSTM result, then record it for the summary table.
print('LSTM Model f1_score_macro on test data with stem is:', f1_score_macro)
all_scores['LSTM With Stem'] = f1_score_macro

LSTM Model f1_score_macro on test data with stem is: 0.3743259140270182


#### 2- ML Model

In [22]:
# Stemmed test split (inputs, labels) and the matching pickled model for the "ML" baseline.
# NOTE(review): load_pickle_file presumably unpickles these files — only use trusted inputs.
X_test, y_test = (
    nh.load_pickle_file('../input/testdataset/X_test_ml_with_stem.obj'),
    nh.load_pickle_file('../input/testdataset/y_test_ml_with_stem.obj'),
)
rf_20 = nh.load_pickle_file('../input/nlp-ml-model-with-stemming/pipe_rf_20_with_stem.obj')

# Predict on the test inputs and score with macro-averaged F1.
y_hat = rf_20.predict(X_test)
f1_score_macro = f1_score(y_true=y_test, y_pred=y_hat, average='macro')

In [23]:
# Report the stemmed ML result, then record it for the summary table.
print('ML model with stem f1_score_macro on test set is:', f1_score_macro)
all_scores['ML With Stem'] = f1_score_macro

ML model with stem f1_score_macro on test set is: 0.3054796313493699


In [24]:
# Confirm that all four model/preprocessing combinations were recorded.
all_scores.keys()

dict_keys(['LSTM No Stem', 'ML No Stem', 'LSTM With Stem', 'ML With Stem'])

In [25]:
# Arrange the scores as a 2x2 table: rows are model families, columns are
# preprocessing variants.
scores_df = pd.DataFrame(
    {
        'No Stem': [all_scores['LSTM No Stem'], all_scores['ML No Stem']],
        'With Stem': [all_scores['LSTM With Stem'], all_scores['ML With Stem']],
    },
    index=['LSTM', 'ML'],
)

In [26]:
# Show the comparison table (it has only two rows, so head() returns all of it).
scores_df.head()

Unnamed: 0,No Stem,With Stem
LSTM,0.414046,0.374326
ML,0.359062,0.30548
