In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import nlp_helpers as nh
from lstm_helpers import DialectRNN
import lstm_helpers as lh

### Load preocessed data, using data with stem

In [2]:
data_with_stem_path = '../input/preprocessing-with-stem/preprocessed_data_with_stem.obj'

In [3]:
data_with_stem = nh.load_pickle_file(data_with_stem_path)

In [4]:
data_with_stem.head()

Unnamed: 0,id,text,dialect
0,1175358310087892992,"[نهي, نفض, يغر]",IQ
1,1175416117793349632,"[يعن, بشر, وحش, طلب, غرب, حرم, يءم, بدن, ينع, ...",IQ
2,1175450108898565888,"[كلم, خلج]",IQ
3,1175471073770573824,"[لمل, رور, ورح, حلوه]",IQ
4,1175496913145217024,"[وين, غيب, حمد]",IQ


In [5]:
data_with_stem['tweet_length'] = [len(tweet) for tweet in data_with_stem['text']]

In [6]:
data_with_stem.head()

Unnamed: 0,id,text,dialect,tweet_length
0,1175358310087892992,"[نهي, نفض, يغر]",IQ,3
1,1175416117793349632,"[يعن, بشر, وحش, طلب, غرب, حرم, يءم, بدن, ينع, ...",IQ,10
2,1175450108898565888,"[كلم, خلج]",IQ,2
3,1175471073770573824,"[لمل, رور, ورح, حلوه]",IQ,4
4,1175496913145217024,"[وين, غيب, حمد]",IQ,3


In [7]:
len(data_with_stem)

458197

### Delete too short tweets

In [8]:
threshold = 2
data_with_stem = data_with_stem[data_with_stem['tweet_length'] > threshold]

In [9]:
data_with_stem.reset_index(inplace=True, drop=True)

In [10]:
len(data_with_stem)

445111

### Create vocabulary

In [11]:
# first, get all words in this corpus
words = nh.get_corpus_words(data_with_stem['text'])

In [12]:
print('The number of words in our this corpus:', len(words))
print('Sample words')
words[:10]

The number of words in our this corpus: 4512842
Sample words


['نهي', 'نفض', 'يغر', 'يعن', 'بشر', 'وحش', 'طلب', 'غرب', 'حرم', 'يءم']

In [13]:
# vocab: contains unique words only, ordered form most frequent to least frequent
vocab = nh.get_vocab(words)

In [14]:
print('The number of unique words:', len(vocab))
print('Sample of most frequent words')
vocab[:10]

The number of unique words: 131093
Sample of most frequent words


['الل', 'مش', 'ولل', 'عرف', 'وحد', 'كلم', 'ناس', 'شي', 'عمل', 'طلع']

In [15]:
# vocab_to_int: dictionay, where its keys are the vocab words, 
# and its values are integers starting from 1
# 0 will be used later for padding and for unknown words 
# int_to_vocab, keys are integers (starting from 1), values are the words in our vocab
vocab_to_int, int_to_vocab = nh.get_mappings(vocab, i=1)

In [16]:
# save vocab_to_int, and int_to_vocab, will be needed in inference
nh.save_pickle_file(vocab_to_int, 'vocab_to_int_with_stem.obj')
nh.save_pickle_file(int_to_vocab, 'int_to_vocab_with_stem.obj')

### Encode text

In [17]:
data_with_stem['text'] = data_with_stem['text'].apply(nh.encode, args=(vocab_to_int,))

In [18]:
data_with_stem.head()

Unnamed: 0,id,text,dialect,tweet_length
0,1175358310087892992,"[248, 1572, 1052]",IQ,3
1,1175416117793349632,"[17, 225, 587, 88, 96, 40, 7251, 112, 4660, 559]",IQ,10
2,1175471073770573824,"[1471, 1722, 467, 2661]",IQ,4
3,1175496913145217024,"[36, 482, 25]",IQ,3
4,1175668034146643968,"[1282, 559, 390, 12, 792, 36, 11, 556, 45395, ...",IQ,30


We also need to encode labels (`dialect`) to be able to feed them into the DL models

In [19]:
# labels_to_int: dictionary, where its keys are the labels,
# and its values are integers starting from 0
labels = data_with_stem['dialect'].unique()
labels_to_int, int_to_labels = nh.get_mappings(labels, i=0)

In [20]:
# save labels_to_int, and int_to_labels, they will be needed in inference
nh.save_pickle_file(labels_to_int, 'labels_to_int.obj')
nh.save_pickle_file(int_to_labels, 'int_to_labels.obj')

In [21]:
print('Number of labels (classes):', len(labels))
print('Labels:')
print(labels_to_int)

Number of labels (classes): 18
Labels:
{'IQ': 0, 'LY': 1, 'QA': 2, 'PL': 3, 'SY': 4, 'TN': 5, 'JO': 6, 'MA': 7, 'SA': 8, 'YE': 9, 'DZ': 10, 'EG': 11, 'LB': 12, 'KW': 13, 'OM': 14, 'SD': 15, 'AE': 16, 'BH': 17}


In [22]:
# having labels_to_int, we can encode dialects
data_with_stem['dialect'] = data_with_stem['dialect'].map(labels_to_int)

In [23]:
data_with_stem.head()

Unnamed: 0,id,text,dialect,tweet_length
0,1175358310087892992,"[248, 1572, 1052]",0,3
1,1175416117793349632,"[17, 225, 587, 88, 96, 40, 7251, 112, 4660, 559]",0,10
2,1175471073770573824,"[1471, 1722, 467, 2661]",0,4
3,1175496913145217024,"[36, 482, 25]",0,3
4,1175668034146643968,"[1282, 559, 390, 12, 792, 36, 11, 556, 45395, ...",0,30


### Pad tweets


From EDA notebook, we know that about 75% of tweets have less than 13 words

In [24]:
seq_length = 20
features = nh.pad_docs(data_with_stem['text'], seq_length)

In [25]:
features[:3, :]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,  248, 1572, 1052],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   17,
         225,  587,   88,   96,   40, 7251,  112, 4660,  559],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 1471, 1722,  467, 2661]])

### Split data into train, valdiation, and test sets

In [26]:
split_fracion = 0.2

## split data into training, validation, and test data (features and labels, x and y)
X = features.astype(np.int64)
y = data_with_stem['dialect'].values

splitted_sets = lh.features_split(X, y, split_fraction=split_fracion)

X_train, y_train = splitted_sets['train']
X_val, y_val = splitted_sets['val']
X_test, y_test = splitted_sets['test']
# print out the shapes of the new data sets
print('\t\t\tFeatures Shapes:')
print('Train set: \t\t{}'.format(X_train.shape),
     '\nValidation set: \t{}'.format(X_val.shape),
     '\nTest set: \t\t{}'.format(X_test.shape))

			Features Shapes:
Train set: 		(356088, 20) 
Validation set: 	(44511, 20) 
Test set: 		(44512, 20)


In [27]:
nh.save_pickle_file(X_test, 'X_test_with_stem_lstm.obj')
nh.save_pickle_file(y_test, 'y_test_with_stem_lstm.obj')

To feed data to pytorch models, we need first ot convert data from numpy arrays to Pytorch tensors.  
After that we can create our datasets

In [28]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# dataloaders
batch_size = 128

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

loaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}

In [29]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')


Training on GPU.


In [30]:
# Instantiate the model with hyperparams
vocab_size = len(vocab_to_int) + 1 # +1 for 0 padding
output_size = len(labels_to_int) # Dialect index
embedding_dim = 400
hidden_dim = 256
n_layers = 2
drop_prob = 0.3

model = DialectRNN(vocab_size, output_size, embedding_dim, hidden_dim, 
                 n_layers, seq_length, drop_prob=drop_prob)
print(model)

DialectRNN(
  (embedding): Embedding(131094, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Sequential(
    (0): Linear(in_features=256, out_features=64, bias=True)
  )
  (fc2): Sequential(
    (0): Linear(in_features=64, out_features=18, bias=True)
  )
)


In [31]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float)
if train_on_gpu:
    class_weights = class_weights.cuda()
print(class_weights)

tensor([1.6439, 0.7020, 0.8124, 0.5842, 1.5641, 2.7362, 0.9202, 2.2054, 0.9472,
        2.5982, 1.5742, 0.4378, 0.9200, 0.6006, 1.3496, 1.7698, 0.9674, 0.9727],
       device='cuda:0')


In [32]:
# loss and optimization functions
lr=0.001

criterion = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=lr)

# training params
n_epochs = 20
print_every = 1
clip=5 # gradient clipping

save_path = 'models'

# setting gamma to 1 cancels scheduler effect
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10], gamma=.2) 

In [33]:
!mkdir models

In [34]:
criteria = 'score'
history = lh.train_lstm(model, n_epochs, optimizer, criterion, scheduler, loaders, 
                        train_on_gpu, save_path, criteria, print_every=print_every, clip=clip)

Start Training on "GPU" for 20 epochs...

Epoch: 1 train...


100%|█████████▉| 2781/2782 [00:52<00:00, 52.73it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 368.69it/s]


Epoch: 1/20... Train Score: 0.251868... Val Score: 0.325629
F1 Score Macro increased form: 0.000000, to: 0.325629	SAVING MODEL...                 in Epoch: 1

Epoch: 2 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.68it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 366.90it/s]


Epoch: 2/20... Train Score: 0.362045... Val Score: 0.350262
F1 Score Macro increased form: 0.325629, to: 0.350262	SAVING MODEL...                 in Epoch: 2

Epoch: 3 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.57it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 369.71it/s]


Epoch: 3/20... Train Score: 0.420026... Val Score: 0.365881
F1 Score Macro increased form: 0.350262, to: 0.365881	SAVING MODEL...                 in Epoch: 3

Epoch: 4 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.68it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 349.32it/s]


Epoch: 4/20... Train Score: 0.470865... Val Score: 0.368832
F1 Score Macro increased form: 0.365881, to: 0.368832	SAVING MODEL...                 in Epoch: 4

Epoch: 5 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.72it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 370.22it/s]


Epoch: 5/20... Train Score: 0.515104... Val Score: 0.367840

Epoch: 6 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.77it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 351.95it/s]


Epoch: 6/20... Train Score: 0.551090... Val Score: 0.369399
F1 Score Macro increased form: 0.368832, to: 0.369399	SAVING MODEL...                 in Epoch: 6

Epoch: 7 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.62it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 354.33it/s]


Epoch: 7/20... Train Score: 0.581769... Val Score: 0.367143

Epoch: 8 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.71it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 357.30it/s]


Epoch: 8/20... Train Score: 0.608381... Val Score: 0.368543

Epoch: 9 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.63it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 368.91it/s]


Epoch: 9/20... Train Score: 0.631546... Val Score: 0.369116

Epoch: 10 train...


100%|█████████▉| 2781/2782 [00:52<00:00, 53.10it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 369.74it/s]


Epoch: 10/20... Train Score: 0.649756... Val Score: 0.366377

Epoch: 11 train...


100%|█████████▉| 2781/2782 [00:52<00:00, 53.46it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 369.08it/s]


Epoch: 11/20... Train Score: 0.703282... Val Score: 0.375086
F1 Score Macro increased form: 0.369399, to: 0.375086	SAVING MODEL...                 in Epoch: 11

Epoch: 12 train...


100%|█████████▉| 2781/2782 [00:52<00:00, 53.26it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 368.65it/s]


Epoch: 12/20... Train Score: 0.731739... Val Score: 0.375885
F1 Score Macro increased form: 0.375086, to: 0.375885	SAVING MODEL...                 in Epoch: 12

Epoch: 13 train...


100%|█████████▉| 2781/2782 [00:52<00:00, 53.29it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 367.12it/s]


Epoch: 13/20... Train Score: 0.748037... Val Score: 0.374720

Epoch: 14 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.48it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 367.12it/s]


Epoch: 14/20... Train Score: 0.761601... Val Score: 0.374985

Epoch: 15 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.68it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 370.77it/s]


Epoch: 15/20... Train Score: 0.771081... Val Score: 0.371676

Epoch: 16 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.77it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 370.49it/s]


Epoch: 16/20... Train Score: 0.781035... Val Score: 0.372442

Epoch: 17 train...


100%|█████████▉| 2781/2782 [00:52<00:00, 53.18it/s]


validation...


100%|█████████▉| 347/348 [00:00<00:00, 368.77it/s]


Epoch: 17/20... Train Score: 0.789580... Val Score: 0.370121

Epoch: 18 train...


100%|█████████▉| 2781/2782 [00:51<00:00, 53.49it/s]


validation...


100%|█████████▉| 347/348 [00:01<00:00, 338.04it/s]


Epoch: 18/20... Train Score: 0.796167... Val Score: 0.370530

Epoch: 19 train...


100%|█████████▉| 2781/2782 [00:52<00:00, 53.47it/s]


validation...


100%|█████████▉| 347/348 [00:01<00:00, 257.98it/s]


Epoch: 19/20... Train Score: 0.802895... Val Score: 0.369429

Epoch: 20 train...


100%|█████████▉| 2781/2782 [00:52<00:00, 53.44it/s]


validation...


100%|█████████▉| 347/348 [00:01<00:00, 321.00it/s]


Epoch: 20/20... Train Score: 0.808420... Val Score: 0.370836


In [35]:
model.load_state_dict(torch.load(save_path+'/best_model.pt'))

<All keys matched successfully>

In [36]:
f1_score_macro, test_loss, test_acc = lh.test(model, loaders['val'], criterion, train_on_gpu)

In [37]:
print('Test f1 score:', f1_score_macro)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

Test f1 score: 0.37678154467961555
Test loss: 3.1449287868714126
Test accuracy: 0.4022601154770731


In [38]:
for i in range(n_epochs):
    print('model_{}'.format(i))
    model.load_state_dict(torch.load(save_path+'/model_{}.pt'.format(i)))    
    f1_score_macro, test_loss, test_acc = lh.test(model, loaders['val'], criterion, train_on_gpu)    
    print('Test f1 score:', f1_score_macro)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_acc)
    print()

model_0
Test f1 score: 0.325490164993027
Test loss: 2.087356202197006
Test accuracy: 0.3500932353800184

model_1
Test f1 score: 0.3505386858496301
Test loss: 2.0005100233754094
Test accuracy: 0.3828716497045674

model_2
Test f1 score: 0.3662423425562194
Test loss: 1.9893130596501682
Test accuracy: 0.3883309743658871

model_3
Test f1 score: 0.36832682039833947
Test loss: 2.053161740646582
Test accuracy: 0.3926220484823976

model_4
Test f1 score: 0.36833239679334034
Test loss: 2.1241801111430187
Test accuracy: 0.3894542921974343

model_5
Test f1 score: 0.3691729637916338
Test loss: 2.1867632134159978
Test accuracy: 0.3929141111185999

model_6
Test f1 score: 0.3663857981692469
Test loss: 2.324898838309802
Test accuracy: 0.3889824987081845

model_7
Test f1 score: 0.3688019036292477
Test loss: 2.4262146437889562
Test accuracy: 0.39525061220821817

model_8
Test f1 score: 0.3696999535016988
Test loss: 2.5376035012841567
Test accuracy: 0.39316124104154027

model_9
Test f1 score: 0.366737331149