# ***Import libraries***

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
import fnmatch
from IPython.display import Audio
from IPython.display import display
from matplotlib import pyplot as pl
import os
import random
import numpy as np
import nltk 
import glob
import pandas as pd
from nltk.corpus import stopwords
import re
from sklearn import preprocessing
from matplotlib import pyplot as plt
from torch.nn.utils.rnn import pad_sequence
import argparse

# ***Select the Cuda Device***

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# ***Read the language files***

In [None]:
# Cleaning rules applied to every raw line before it becomes a sentence.
DIGITS_RE = re.compile(r'[0-9]+')  # runs of digits
SYMBOLS_RE = re.compile('[!@#$ª•ย™˚\\±“‟%„<{―=½≈»_]')  # noise symbols
MAX_SENTENCE_CHARS = 30  # every sentence is truncated to this many characters


def clean_line(line, max_chars=MAX_SENTENCE_CHARS):
    """Return `line` without newlines, digits and noise symbols, cut to `max_chars`.

    Args:
        line: One raw line read from a corpus file.
        max_chars: Maximum number of characters kept after cleaning.

    Returns:
        The cleaned string; it may be empty if nothing survives cleaning.
    """
    line = line.replace("\n", "")
    line = DIGITS_RE.sub('', line)
    line = SYMBOLS_RE.sub('', line)
    # Slicing keeps the first `max_chars` characters and is a no-op on shorter
    # lines (indexing with [max_chars] would keep a single character only).
    return line[:max_chars]


# One text file per language. glob returns files in arbitrary order, so the
# list is sorted to make the file -> integer label assignment reproducible.
txt_files_list = sorted(glob.glob("language_data/*.txt"))
sentences = []  # cleaned sentences from every file
languages = []  # integer label of the file each sentence came from (same index)
for language, filename in enumerate(txt_files_list):
    # NOTE(review): no encoding is passed to open(), so the platform default is
    # used and undecodable bytes are silently dropped -- confirm the corpus encoding.
    with open(filename, 'r', errors="ignore") as f:
        for raw_line in f:
            sentence = clean_line(raw_line)
            if not sentence:
                # An empty sentence would become a zero-length tensor that the
                # model can never be run on, so it is not kept.
                continue
            sentences.append(sentence)
            languages.append(language)

# Column order matters: the label has to be the last column.
df = pd.DataFrame({'sentences': sentences, 'languages': languages})

In [None]:
from sklearn.utils import shuffle
# Randomly permute the rows: the reader appends one file after another, so
# before this call the rows are grouped by language.
df = shuffle(df)

In [None]:
import unicodedata
import string

# Alphabet used for the one-hot encoding: ASCII letters, a little punctuation
# and the left guillemet (U+00AB).
all_letters = string.ascii_letters + " .,;'" + u'\xab'
n_letters = len(all_letters)


def unicodeToAscii(s):
    """Strip accents from `s` and drop every character outside `all_letters`.

    The string is decomposed with NFD so that accents become separate
    combining marks (category 'Mn'), which are then discarded together with
    any character that is not part of the alphabet.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

In [None]:
# Position of a letter inside all_letters, e.g. "a" = 0; -1 when it is absent.
def letterToIndex(letter):
    try:
        return all_letters.index(letter)
    except ValueError:
        return -1

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    """Return the one-hot encoding of `letter` as a <1 x n_letters> tensor.

    A letter that is not part of the alphabet yields an all-zero tensor.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex reports -1 for unknown letters; writing at -1 would
    # silently switch on the LAST alphabet slot, so unknown letters are skipped.
    if index >= 0:
        tensor[0][index] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """Return `line` as a <len(line) x 1 x n_letters> tensor of one-hot rows.

    Characters that are not part of the alphabet keep an all-zero row. An
    empty line yields a tensor whose first dimension is 0.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # letterToIndex reports -1 for unknown characters; indexing with -1
        # would mark them as the last alphabet entry instead of "unknown".
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor

In [None]:
# Integer label -> language name. The keys must be exactly the labels that
# appear in the dataframe (0..7, one per file); a gap in the numbering makes
# lookups of the skipped label fail.
# NOTE(review): the name order presumably follows the alphabetical order of
# the corpus files -- confirm against the contents of language_data/.
all_languages = {
    0: 'Catalan',
    1: 'Danish',
    2: 'Dutch',
    3: 'French',
    4: 'German',
    5: 'Italian',
    6: 'Portuguese',
    7: 'swedish',
}


# ***The dataset Class***

In [None]:
class TextDataset(Dataset):
    """Dataset of (one-hot sentence, language label) pairs backed by a DataFrame.

    The last column of `data` is used as the label; the columns before it
    hold the sentence text (the first of them is the one that gets encoded).
    """

    def __init__(self, data):
        self.data = data
        # 2-D array: one row per sample, one entry per non-label column.
        self.sentences = data.iloc[:, :-1].values
        # 1-D array with the label of every sample.
        self.label = data.iloc[:, -1].values

    def __len__(self):
        """Return the number of samples."""
        return len(self.sentences)

    def __getitem__(self, index):
        """Return `(features, label)` for the sample at `index`.

        `features` is whatever `lineToTensor` builds for the sentence and
        `label` is a one-element integer (long) tensor.
        """
        sentence = self.sentences[index][0]
        features = lineToTensor(sentence)
        label = torch.tensor([self.label[index]], dtype=torch.long)
        return features, label

    def tensor2char(self, index):
        """Decode the sample at `index` back to text.

        Returns:
            `(text, label)` where `text` is rebuilt from the one-hot rows.
            Rows with no active entry are skipped, because they do not
            correspond to any letter of the alphabet.
        """
        features, label = self[index]
        chars = []
        # features is <length x 1 x n_letters>; drop the middle axis and walk
        # the rows, mapping each active position back to its letter.
        for one_hot in features[:, 0, :]:
            if one_hot.sum().item() > 0:
                chars.append(all_letters[int(one_hot.argmax())])
        return "".join(chars), label

    def _find_files(self, directory, pattern='*.txt'):
        """Recursively find all files under `directory` matching `pattern`."""
        files = []
        for root, dirnames, filenames in os.walk(directory):
            files.extend(
                os.path.join(root, filename)
                for filename in fnmatch.filter(filenames, pattern)
            )
        return files

# ***The Collate_fn class***

In [None]:
class TextCollate(object):
    """Callable used as `collate_fn`: regroups a batch without padding it."""

    def __init__(self):
        """Nothing to configure; the collator keeps no state."""

    def _collate_fn(self, batch):
        """Turn a list of `(features, label)` samples into two parallel tuples.

        The sentences keep their individual lengths; no padding or stacking
        is applied.
        """
        features, labels = tuple(zip(*batch))
        return features, labels

    def __call__(self, batch):
        return self._collate_fn(batch)

# ***Get DATA FILES and create dataframe***

In [None]:
# Length (in characters) of the longest sentence in the dataframe.
MAX_LENGTH = df['sentences'].str.len().max()
print(MAX_LENGTH)
# Number of sentences per language label, to check every language is present.
df.groupby('languages').count()

30


Unnamed: 0_level_0,sentences
languages,Unnamed: 1_level_1
0,493026
1,128916
2,471902
3,501685
4,121479
5,89358
6,250967
7,433318


**Split the data into train and test**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# NOTE(review): no random_state is passed, so the split is presumably different
# on every run -- pass one if reproducible results are needed.
train, test = train_test_split(df, test_size=0.2)

***Create loader***

In [None]:
# Collate callable passed as collate_fn to the DataLoaders. Despite the name,
# TextCollate does not pad: it only regroups samples into two tuples.
pad_collate = TextCollate()

In [None]:
BATCH_SIZE = 32  # sentences handed out per batch
num_workers = 4  # DataLoader worker processes used to load the samples

dataset_train = TextDataset(data=train)
dataset_test = TextDataset(data=test)

# Both loaders reshuffle their data on every pass and share the same collate callable.
train_iterator = DataLoader(
    dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=pad_collate,
)
test_iterator = DataLoader(
    dataset_test,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=pad_collate,
)

In [None]:
print(len(train_iterator) , len(test_iterator)) # number of batches needed for one full pass over the train / test loader
# i.e. dataset length / batch size, rounded up

1563 1563


In [None]:
# Pull a single batch from the training loader just to inspect its structure.
dataiter = iter(train_iterator)
# Use the built-in next(): DataLoader iterators no longer expose a .next() method.
data = next(dataiter)
# Each batch is a pair: the sentence tensors and their matching language labels.
sentences, label = data

In [None]:
print(sentences[0]) # one-hot tensor of the first sentence in the inspected batch

tensor([[[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]])


***Convert the output probabilities to a language index***

In [None]:
def categoryFromOutput(output):
    """Return, as a Python int, the flat index of the largest value in `output`."""
    best = output.argmax()
    return best.item()

# ***Create the RNN Model***

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    Every call takes one input vector plus the previous hidden state and
    returns log-probabilities over the classes together with the next
    hidden state.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Both layers read the input concatenated with the hidden state.
        joint_size = input_size + hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)  # -> next hidden state
        self.i2o = nn.Linear(joint_size, output_size)  # -> class scores
        self.softmax = nn.LogSoftmax(dim=1)  # scores -> log-probabilities

    def forward(self, input, hidden):
        """Advance one step and return `(log_probs, next_hidden)`."""
        combined = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(combined)
        log_probs = self.softmax(self.i2o(combined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero <1 x hidden_size> initial hidden state."""
        return torch.zeros(1, self.hidden_size)


# ***Hyper-parameters***

In [None]:
# Training configuration.
num_epochs = 3  # passes over the training data
learning_rate = 0.005  # too high and training may diverge, too low and it may not learn
n_hidden = 256  # size of the hidden state
n_categories = len(all_languages)  # one output class per language

# Model, loss and optimizer.
Model = RNN(n_letters, n_hidden, n_categories).to(device)
criterion = nn.NLLLoss()  # negative log-likelihood on the model's log-probabilities
optimizer = torch.optim.SGD(Model.parameters(), lr=learning_rate)  # stochastic gradient descent

***Function that return some random examples from the dataframe***

In [None]:
def randomTrainingExample(iter):
    """Build the `(category_tensor, line_tensor)` pair for row `iter` of `df`.

    Despite the name, nothing is sampled here: `iter` is used directly as a
    positional index into the 'sentences' and 'languages' columns.
    """
    text_column = df['sentences'].values
    label_column = df['languages'].values

    line_tensor = lineToTensor(text_column[iter])
    category_tensor = torch.tensor([label_column[iter]], dtype=torch.long)
    return category_tensor, line_tensor


# ***Create Training Loop***

In [None]:
def repackage_hidden(h):
    """Detach hidden state(s) from the graph that produced them.

    A tensor is returned detached; any other iterable is rebuilt as a tuple
    whose elements are detached recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()


In [None]:
import time
import math

# Iteration / reporting settings.
# NOTE(review): none of the names in this group is read by the training loop
# in this cell (it hard-codes a report every 1000 batches) -- presumably
# leftovers from an earlier version; confirm before relying on them.
n_iters = 10000
print_every = 1000
plot_every = 1000



# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    """Format the time elapsed since `since` (a time.time() value) as '<m>m <s>s'."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - 60 * minutes
    return '%dm %ds' % (minutes, seconds)

start = time.time()  # wall-clock reference taken just before training starts

# Train the model
loss = 0
num_epochs = 20  # overrides the value set with the other hyper-parameters
total_step = len(train_iterator)
for epoch in range(num_epochs):
    for i, (sentences, labels) in enumerate(train_iterator):

        # The collate function keeps sentences as separate tensors, so every
        # sentence is run through the model (and optimized) on its own.
        for sent, target in zip(sentences, labels):
            # A zero-length sentence never runs the model, which would leave
            # `output` undefined or stale from the previous sentence.
            if sent.size(0) == 0:
                continue

            sent = sent.to(device)
            hidden = Model.initHidden().to(device)  # fresh hidden state per sentence
            # Feed the sentence one letter at a time; only the output after
            # the last letter is used for the prediction.
            for letter in sent:
                output, hidden = Model(letter, hidden)

            loss = criterion(output, target.to(device))

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            # Clip the gradient norm to keep the updates from exploding.
            torch.nn.utils.clip_grad_norm_(Model.parameters(), 0.25)
            optimizer.step()

        if (i + 1) % 1000 == 0:
            # float() also works while `loss` is still its initial plain 0.
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, num_epochs, i + 1, total_step, float(loss)))

# ***Create the Testing loop***

In [None]:
# Test the model
Model.eval()  # evaluation mode; the parameters are not updated in this cell
with torch.no_grad():
    correct = 0
    total = 0
    total_step = len(test_iterator)
    # A single pass is enough: repeating it once per training epoch only
    # multiplies the run time without changing the accuracy.
    for i, (sentences, labels) in enumerate(test_iterator):

        for sent, target in zip(sentences, labels):
            # Skip zero-length sentences: the model would never run on them
            # and `output` would be stale from the previous sentence.
            if sent.size(0) == 0:
                continue

            sent = sent.to(device)
            hidden = Model.initHidden().to(device)
            for letter in sent:
                output, hidden = Model(letter, hidden)

            predicted = categoryFromOutput(output)  # index of the most likely language

            total += target.size(0)
            correct += (predicted == target).sum().item()

    # Guard against an empty test set and report the real number of sentences.
    accuracy = 100 * correct / total if total else 0.0
    print('Test Accuracy of the model on the {} test Sentences: {} %'.format(total, accuracy))

# Save the model checkpoint: only the parameters (state_dict) are written to
# 'model.ckpt', not the RNN class itself.
torch.save(Model.state_dict(), 'model.ckpt')