In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from daar.lstm_helpers import DialectRNN
import daar.lstm_helpers as lh
import daar.nlp_helpers as nh

### Load preocessed data

In [2]:
data_no_stem_path = '../input/dialect-processed-data/preprocessed_data_no_stem.obj'
data_with_stem_path = '../input/dialect-processed-data/preprocessed_data_with_stem.obj'

In [3]:
data_no_stem = nh.load_pickle_file(data_no_stem_path)
data_with_stem = nh.load_pickle_file(data_with_stem_path)

In [4]:
data_no_stem.head()

Unnamed: 0,id,text,dialect
0,1175358310087892992,"[بالنهايه, ينتفض, يغير]",IQ
1,1175416117793349632,"[يعني, محسوب, البشر, حيونه, ووحشيه, وتطلبون, ا...",IQ
2,1175450108898565888,"[مبين, كلامه, خليجي]",IQ
3,1175471073770573824,"[يسلملي, مرورك, وروحك, الحلوه]",IQ
4,1175496913145217024,"[وين, الغيبه, محمد]",IQ


In [5]:
data_with_stem.head()

Unnamed: 0,id,text,dialect
0,1175358310087892992,"[نهي, نفض, يغر]",IQ
1,1175416117793349632,"[يعن, بشر, وحش, طلب, غرب, حرم, يءم, بدن, ينع, ...",IQ
2,1175450108898565888,"[كلم, خلج]",IQ
3,1175471073770573824,"[لمل, رور, ورح, حلوه]",IQ
4,1175496913145217024,"[وين, غيب, حمد]",IQ


In [6]:
data_no_stem['tweet_length'] = [len(tweet) for tweet in data_no_stem['text']]

In [7]:
data_no_stem.head()

Unnamed: 0,id,text,dialect,tweet_length
0,1175358310087892992,"[بالنهايه, ينتفض, يغير]",IQ,3
1,1175416117793349632,"[يعني, محسوب, البشر, حيونه, ووحشيه, وتطلبون, ا...",IQ,12
2,1175450108898565888,"[مبين, كلامه, خليجي]",IQ,3
3,1175471073770573824,"[يسلملي, مرورك, وروحك, الحلوه]",IQ,4
4,1175496913145217024,"[وين, الغيبه, محمد]",IQ,3


In [8]:
len(data_no_stem)

458197

### Delete too short tweets

In [9]:
threshold = 2
data_no_stem = data_no_stem[data_no_stem['tweet_length'] > threshold]

In [10]:
data_no_stem.reset_index(inplace=True, drop=True)

In [11]:
len(data_no_stem)

449784

### Create vocabulary

In [12]:
# first, get all words in this corpus
words = nh.get_corpus_words(data_no_stem['text'])

In [13]:
print('The number of words in our this corpus:', len(words))
print('Sample words')
words[:10]

The number of words in our this corpus: 4764500
Sample words


['بالنهايه',
 'ينتفض',
 'يغير',
 'يعني',
 'محسوب',
 'البشر',
 'حيونه',
 'ووحشيه',
 'وتطلبون',
 'الغرب']

In [14]:
# vocab: contains unique words only, ordered form most frequent to least frequent
vocab = nh.get_vocab(words)

In [15]:
print('The number of unique words:', len(vocab))
print('Sample of most frequent words')
vocab[:10]

The number of unique words: 421597
Sample of most frequent words


['اللي', 'الله', 'مش', 'والله', 'شي', 'يعني', 'الناس', 'عشان', 'مو', 'ده']

In [16]:
# vocab_to_int: dictionay, where its keys are the vocab words, 
# and its values are integers starting from 1
# 0 will be used later for padding and for unknown words 
# int_to_vocab, keys are integers (starting from 1), values are the words in our vocab
vocab_to_int, int_to_vocab = nh.get_mappings(vocab, i=1)

In [17]:
# save vocab_to_int, and int_to_vocab, will be needed in inference
nh.save_pickle_file(vocab_to_int, 'vocab_to_int_no_stem.obj')
nh.save_pickle_file(int_to_vocab, 'int_to_vocab_no_stem.obj')

### Encode text

In [18]:
data_no_stem['text'] = data_no_stem['text'].apply(nh.encode, args=(vocab_to_int,))

In [19]:
data_no_stem.head()

Unnamed: 0,id,text,dialect,tweet_length
0,1175358310087892992,"[3145, 3059, 2155]",IQ,3
1,1175416117793349632,"[6, 9276, 750, 41217, 126103, 71597, 3163, 126...",IQ,12
2,1175450108898565888,"[1857, 867, 3826]",IQ,3
3,1175471073770573824,"[8553, 20020, 13756, 515]",IQ,4
4,1175496913145217024,"[16, 5936, 75]",IQ,3


We also need to encode labels (`dialect`) to be able to feed them into the DL models

In [20]:
# labels_to_int: dictionary, where its keys are the labels,
# and its values are integers starting from 0
labels = data_no_stem['dialect'].unique()
labels_to_int, int_to_labels = nh.get_mappings(labels, i=0)

In [21]:
# save labels_to_int, and int_to_labels, they will be needed in inference
nh.save_pickle_file(labels_to_int, 'labels_to_int.obj')
nh.save_pickle_file(int_to_labels, 'int_to_labels.obj')

In [22]:
print('Number of labels (classes):', len(labels))
print('Labels:')
print(labels_to_int)

Number of labels (classes): 18
Labels:
{'IQ': 0, 'LY': 1, 'QA': 2, 'PL': 3, 'SY': 4, 'TN': 5, 'JO': 6, 'MA': 7, 'SA': 8, 'YE': 9, 'DZ': 10, 'EG': 11, 'LB': 12, 'KW': 13, 'OM': 14, 'SD': 15, 'AE': 16, 'BH': 17}


In [23]:
# having labels_to_int, we can encode dialects
data_no_stem['dialect'] = data_no_stem['dialect'].map(labels_to_int)

In [24]:
data_no_stem.head()

Unnamed: 0,id,text,dialect,tweet_length
0,1175358310087892992,"[3145, 3059, 2155]",0,3
1,1175416117793349632,"[6, 9276, 750, 41217, 126103, 71597, 3163, 126...",0,12
2,1175450108898565888,"[1857, 867, 3826]",0,3
3,1175471073770573824,"[8553, 20020, 13756, 515]",0,4
4,1175496913145217024,"[16, 5936, 75]",0,3


### Pad tweets


From EDA notebook, we know that about 75% of tweets have less than 13 words

In [25]:
seq_length = 20
features = nh.pad_docs(data_no_stem['text'], seq_length)

In [26]:
features[:3, :]

array([[     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,   3145,   3059,   2155],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             6,   9276,    750,  41217, 126103,  71597,   3163, 126104,
        126105, 126106, 182173,  29687],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,   1857,    867,   3826]])

### Split data into train, valdiation, and test sets

In [27]:
split_fracion = 0.2

## split data into training, validation, and test data (features and labels, x and y)
X = features.astype(np.int64)
y = data_no_stem['dialect'].values

splitted_sets = lh.features_split(X, y, split_fraction=split_fracion)

X_train, y_train = splitted_sets['train']
X_val, y_val = splitted_sets['val']
X_test, y_test = splitted_sets['test']
# print out the shapes of the new data sets
print('\t\t\tFeatures Shapes:')
print('Train set: \t\t{}'.format(X_train.shape),
     '\nValidation set: \t{}'.format(X_val.shape),
     '\nTest set: \t\t{}'.format(X_test.shape))

			Features Shapes:
Train set: 		(359827, 20) 
Validation set: 	(44978, 20) 
Test set: 		(44979, 20)


To feed data to pytorch models, we need first ot convert data from numpy arrays to Pytorch tensors.  
After that we can create our datasets

In [28]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# dataloaders
batch_size = 128

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

loaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}

In [29]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')


Training on GPU.


In [30]:
# Instantiate the model with hyperparams
vocab_size = len(vocab_to_int) + 1 # +1 for 0 padding
output_size = len(labels_to_int) # Dialect index
embedding_dim = 400
hidden_dim = 256
n_layers = 2
drop_prob = 0.5

model = DialectRNN(vocab_size, output_size, embedding_dim, hidden_dim, 
                 n_layers, seq_length, drop_prob=drop_prob)
print(model)

DialectRNN(
  (embedding): Embedding(421598, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Sequential(
    (0): Linear(in_features=256, out_features=64, bias=True)
  )
  (fc2): Sequential(
    (0): Linear(in_features=64, out_features=18, bias=True)
  )
)


In [31]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float)
if train_on_gpu:
    class_weights = class_weights.cuda()
print(class_weights)

tensor([1.6398, 0.7018, 0.8135, 0.5836, 1.5655, 2.7471, 0.9180, 2.2084, 0.9466,
        2.5857, 1.5773, 0.4392, 0.9218, 0.6015, 1.3411, 1.7727, 0.9665, 0.9694],
       device='cuda:0')


In [32]:
# loss and optimization functions
lr=0.001

criterion = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=lr)

# training params
n_epochs = 20
print_every = 1
clip=5 # gradient clipping

save_path = 'models_1'

# setting gamma to 1 cancels scheduler effect
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10], gamma=.2) 

In [33]:
!mkdir models_1

In [34]:
criteria = 'score'
history = lh.train_lstm(model, n_epochs, optimizer, criterion, scheduler, loaders, 
                        train_on_gpu, save_path, criteria, print_every=print_every, clip=clip)

Start Training on "GPU" for 20 epochs...

Epoch: 1 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.26it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 364.29it/s]


Epoch: 1/20... Train Score: 0.235492... Val Score: 0.318113
F1 Score Macro increased form: 0.000000, to: 0.318113	SAVING MODEL...                 in Epoch: 1

Epoch: 2 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.37it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 363.74it/s]


Epoch: 2/20... Train Score: 0.372142... Val Score: 0.372882
F1 Score Macro increased form: 0.318113, to: 0.372882	SAVING MODEL...                 in Epoch: 2

Epoch: 3 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.40it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 368.47it/s]


Epoch: 3/20... Train Score: 0.462164... Val Score: 0.392576
F1 Score Macro increased form: 0.372882, to: 0.392576	SAVING MODEL...                 in Epoch: 3

Epoch: 4 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.37it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 368.49it/s]


Epoch: 4/20... Train Score: 0.538008... Val Score: 0.396026
F1 Score Macro increased form: 0.392576, to: 0.396026	SAVING MODEL...                 in Epoch: 4

Epoch: 5 train...


100%|█████████▉| 2811/2812 [01:54<00:00, 24.53it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 369.35it/s]


Epoch: 5/20... Train Score: 0.602850... Val Score: 0.404750
F1 Score Macro increased form: 0.396026, to: 0.404750	SAVING MODEL...                 in Epoch: 5

Epoch: 6 train...


100%|█████████▉| 2811/2812 [01:54<00:00, 24.48it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 369.62it/s]


Epoch: 6/20... Train Score: 0.656309... Val Score: 0.402735

Epoch: 7 train...


100%|█████████▉| 2811/2812 [01:54<00:00, 24.45it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 368.30it/s]


Epoch: 7/20... Train Score: 0.698755... Val Score: 0.405653
F1 Score Macro increased form: 0.404750, to: 0.405653	SAVING MODEL...                 in Epoch: 7

Epoch: 8 train...


100%|█████████▉| 2811/2812 [01:54<00:00, 24.51it/s]


validation...


100%|█████████▉| 351/352 [00:01<00:00, 327.49it/s]


Epoch: 8/20... Train Score: 0.734756... Val Score: 0.406343
F1 Score Macro increased form: 0.405653, to: 0.406343	SAVING MODEL...                 in Epoch: 8

Epoch: 9 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.39it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 366.28it/s]


Epoch: 9/20... Train Score: 0.763423... Val Score: 0.403686

Epoch: 10 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.32it/s]


validation...


100%|█████████▉| 351/352 [00:01<00:00, 334.90it/s]


Epoch: 10/20... Train Score: 0.786251... Val Score: 0.408328
F1 Score Macro increased form: 0.406343, to: 0.408328	SAVING MODEL...                 in Epoch: 10

Epoch: 11 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.35it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 364.65it/s]


Epoch: 11/20... Train Score: 0.830114... Val Score: 0.410324
F1 Score Macro increased form: 0.408328, to: 0.410324	SAVING MODEL...                 in Epoch: 11

Epoch: 12 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.42it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 364.26it/s]


Epoch: 12/20... Train Score: 0.852007... Val Score: 0.412267
F1 Score Macro increased form: 0.410324, to: 0.412267	SAVING MODEL...                 in Epoch: 12

Epoch: 13 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.39it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 367.52it/s]


Epoch: 13/20... Train Score: 0.865880... Val Score: 0.411419

Epoch: 14 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.44it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 362.80it/s]


Epoch: 14/20... Train Score: 0.874875... Val Score: 0.410301

Epoch: 15 train...


100%|█████████▉| 2811/2812 [01:54<00:00, 24.46it/s]


validation...


100%|█████████▉| 351/352 [00:01<00:00, 289.75it/s]


Epoch: 15/20... Train Score: 0.883516... Val Score: 0.410246

Epoch: 16 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.40it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 370.45it/s]


Epoch: 16/20... Train Score: 0.890637... Val Score: 0.410304

Epoch: 17 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.33it/s]


validation...


100%|█████████▉| 351/352 [00:01<00:00, 324.79it/s]


Epoch: 17/20... Train Score: 0.896973... Val Score: 0.409211

Epoch: 18 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.42it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 370.31it/s]


Epoch: 18/20... Train Score: 0.902265... Val Score: 0.408695

Epoch: 19 train...


100%|█████████▉| 2811/2812 [01:54<00:00, 24.48it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 368.79it/s]


Epoch: 19/20... Train Score: 0.907760... Val Score: 0.410385

Epoch: 20 train...


100%|█████████▉| 2811/2812 [01:55<00:00, 24.42it/s]


validation...


100%|█████████▉| 351/352 [00:00<00:00, 364.85it/s]


Epoch: 20/20... Train Score: 0.911480... Val Score: 0.409581


In [35]:
model.load_state_dict(torch.load(save_path+'/best_model.pt'))

<All keys matched successfully>

In [36]:
f1_score_macro, test_loss, test_acc = lh.test(model, loaders['val'], criterion, train_on_gpu)

In [37]:
print('Test f1 score:', f1_score_macro)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

Test f1 score: 0.4124291296617621
Test loss: 3.7933047279673087
Test accuracy: 0.4351905375961581


In [38]:
for i in range(n_epochs):
    print('model_{}'.format(i))
    model.load_state_dict(torch.load(save_path+'/model_{}.pt'.format(i)))    
    f1_score_macro, test_loss, test_acc = lh.test(model, loaders['val'], criterion, train_on_gpu)    
    print('Test f1 score:', f1_score_macro)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_acc)
    print()

model_0
Test f1 score: 0.31773888234889386
Test loss: 2.059247499177938
Test accuracy: 0.3461470052025435

model_1
Test f1 score: 0.37332647522820517
Test loss: 1.9321239646683392
Test accuracy: 0.3869891947174174

model_2
Test f1 score: 0.3923074320931676
Test loss: 1.937300558103795
Test accuracy: 0.41071190359731424

model_3
Test f1 score: 0.3963719691627052
Test loss: 2.011364690598599
Test accuracy: 0.4169594023744942

model_4
Test f1 score: 0.40424982774546525
Test loss: 2.152640054368565
Test accuracy: 0.42300680332607055

model_5
Test f1 score: 0.40305446106097176
Test loss: 2.296897302665602
Test accuracy: 0.4227400062252657

model_6
Test f1 score: 0.4052831413422946
Test loss: 2.4432626806433047
Test accuracy: 0.42516341322424295

model_7
Test f1 score: 0.407615428550589
Test loss: 2.6647775753950462
Test accuracy: 0.42832051225043355

model_8
Test f1 score: 0.40401881594977834
Test loss: 2.82119312619212
Test accuracy: 0.4246965182978345

model_9
Test f1 score: 0.40731351349