<a href="https://colab.research.google.com/github/FrancescoMorri/Language_Classification/blob/main/Notebooks/Model_%26_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Define here your **PATH**:

In [3]:
#this if you work with Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
PATH = "/content/drive/MyDrive/language"

## Importing all the libraries

In [5]:
import torch
import torchvision
from torch.utils.data import Dataset
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import time
from torchsummary import summary
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
import numpy as np

# Making the Dataset

In [6]:
import pandas as pd

In [7]:
words = pd.read_csv(PATH+"/words_all_unique.csv", usecols=['spelling', 'lexicality'])

<p> In order to make the dataset we need to encode the words in some way. We will use a simple method: simply assigning a binary vector to each letter, then putting together all the vector that make a word, eventually adding 0s at the end if the word is shorter that the longest word in the dataset</p>

In [8]:
def word_to_vec(word, max_length):
    n = len(word)
    vec = ''
    for i in range(n):
        cur_char = word[i]
        idx = ord(cur_char) - 97
        tmp = (str(0)*idx) + str(1) + (str(0)*(25-idx))
        vec = vec + tmp
    if n < max_length:
        exce = max_length - n
        vec = vec + (str(0)*26*exce)
    output = []
    for v in vec:
        output.append(float(v))
    return output

<p> In order to make the dataset we need to encode the words in some way. We will use a simple method: simply assigning a binary vector to each letter, then putting together all the vector that make a word, eventually adding 0s at the end if the word is shorter that the longest word in the dataset</p>

Now we need a basic function to make the labels vector.

In [9]:
def label_maker(lexicality):
    if lexicality == 'W':
        return 0
    elif lexicality == 'N':
        return 1

We can now define the Dataset class in the standard way.

In [10]:
class WordsDataset(Dataset):

    def __init__(self):
        self.samples = []

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.to_list()
        
        input = self.samples[idx][0]
        label = self.samples[idx][1]
        return input, label
    
    def __addsample__(self, spelling, lexicality, max_length=5):
        if (type(spelling) == type("str")):
            input = torch.tensor(word_to_vec(spelling, max_length=max_length), dtype=torch.float32)
            if (len(input) > max_length*26):
                print("PROBLEM")
                print(len(input), spelling)
            else:
                # here if it is torch is for the MSELoss, if it is an INT is for the crossentropy
                #label = torch.tensor(label_maker(lexicality), dtype=torch.float32)
                label = label_maker(lexicality)
                self.samples.append([input, label])
        else:
            print("Something Strange:", end='\t')
            print(spelling)

    def __removesample__(self, idx=0, value=None):
        '''
        If value is something, the element corresponding to that value is removed.
        Else the element at index idx is popped.
        '''
        if (value is not None):
            self.samples.remove(value)
        else:
            self.samples.pop(0)
        

In [11]:
dataset = WordsDataset()

The dataset is now empty, we can fill it with all our words.

In [14]:
MAX_LENGTH = 5

counting = 0
for w in words[words['lexicality']=='W']['spelling']:
    if (type(w) == type("str")):
        if (len(w) > MAX_LENGTH):
            pass
        else:
            dataset.__addsample__(w, 'W')
            counting += 1

count_non = 0
for w in words[words['lexicality']=='N']['spelling']:
    if (type(w) == type("str")):
        if (len(w) > MAX_LENGTH):
            pass
        else:
            dataset.__addsample__(w, 'N')
            count_non +=1

    if count_non == counting:
        break


print("\n\nWords: ",counting)
print("Non-Words: ",count_non)
print("Tot Elements in dataset: ", dataset.__len__())

PROBLEM
182 yo-yo
PROBLEM
198 TRUE
PROBLEM
154 I
PROBLEM
154 Inf
PROBLEM
252 FALSE
PROBLEM
164 he?te
PROBLEM
164 we?te
PROBLEM
164 e?tal


Words:  7033
Non-Words:  7033
Tot Elements in dataset:  42174


In [15]:
ratio_test_train = 0.2
test_el = round(dataset.__len__()*ratio_test_train)
train_el = dataset.__len__() - test_el

trainset, testset = torch.utils.data.random_split(dataset, [train_el, test_el])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32)
testloader = torch.utils.data.DataLoader(testset, batch_size=32)

# Creating Network Model

In [16]:
class Words_Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(in_features=130, out_features=64)
        self.linear2 = nn.Linear(in_features=64, out_features=32)
        self.linear3 = nn.Linear(in_features=32, out_features=2)

        self.act = nn.ReLU()

    def forward(self, x):
        out = self.linear1(x)
        out = self.act(out)
        out = self.linear2(out)
        out = self.act(out)
        out = self.linear3(out)

        return out

For the training we will use the GPU, even though is a fairly small network.

In [17]:
net = Words_Net()
want_cuda = True
have_cuda = torch.cuda.is_available()
if want_cuda and have_cuda:
    net.cuda()
    print(torch.cuda.get_device_name())
else:
    print ("No cuda available!\n")
summary(net, (130,))

Tesla T4
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 64]           8,384
              ReLU-2                   [-1, 64]               0
            Linear-3                   [-1, 32]           2,080
              ReLU-4                   [-1, 32]               0
            Linear-5                    [-1, 2]              66
Total params: 10,530
Trainable params: 10,530
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.04
Estimated Total Size (MB): 0.04
----------------------------------------------------------------


We define the training parameters, in this case we will use the CrossEntropy loss and the SGD algorithm to train the network.

In [18]:
EPOCH = 900
learn = 0.1

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learn, weight_decay = 4e-3)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [300,600], gamma=0.1)

The training function will return the loss of that epoch, mediated over the iteration on the dataset. It may also be added the option to acquire interesting data.

In [19]:
def training(acquire = False , PATH = None):
    running_loss = 0.0
    losst = 0
    index = 0
    for i, data in enumerate(trainloader, 0):
        # get the inputs, maybe they need to be tensors?
        inputs, labels = data

        if want_cuda and have_cuda:
          inputs = inputs.cuda()
          labels = labels.cuda()

        # need to squeeze if MSELoss
        outputs = net(inputs)

        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losst +=loss.item()
        index +=1
    return losst/index

Now we can do the actual training of the network.

In [None]:
start =time.time()
graph_data = []
mean_loss = 0
for epoch in range(EPOCH):
    loss = training()    
    print("Epoch: ", epoch, " Loss: %.10f"%(loss))
    mean_loss += loss
    graph_data.append((epoch, loss))
    np.savetxt( PATH+"/graphs/loss_data.csv", graph_data, delimiter=',')
    
    if (epoch % 100 == 99):
        #net.eval()
        #y = validate()
        #val_graph = np.append(val_graph, [[epoch, y]], axis=0)
        print("Mean loss: %.10f"%(mean_loss/100))
        mean_loss = 0
        print('Estimated time: %.3f min' %((EPOCH- epoch)*(time.time() - start)/(60*epoch)) )
        #torch.save(net.state_dict(), "/content/drive/My Drive/Saved_Sets/esperimento_7/32b/withLR2e-5/32_lr2e5_1000")
    scheduler.step()
    #net.train()
        
elapsed_time = time.time() - start
torch.save(net.state_dict(), PATH+"/net/feed_forward_1")
print('Finished Training (elapsed time %.3f min)' %(elapsed_time/60))

Epoch:  0  Loss: 0.6873655858
Epoch:  1  Loss: 0.6498368101
Epoch:  2  Loss: 0.6342645798
Epoch:  3  Loss: 0.6229574554
Epoch:  4  Loss: 0.6097975092
Epoch:  5  Loss: 0.5961713341
Epoch:  6  Loss: 0.5838199671
Epoch:  7  Loss: 0.5732778877
Epoch:  8  Loss: 0.5640498589
Epoch:  9  Loss: 0.5555606156
Epoch:  10  Loss: 0.5482720257
Epoch:  11  Loss: 0.5417290872
Epoch:  12  Loss: 0.5359343747
Epoch:  13  Loss: 0.5312890830
Epoch:  14  Loss: 0.5264382416
Epoch:  15  Loss: 0.5224703237
Epoch:  16  Loss: 0.5196510894
Epoch:  17  Loss: 0.5168048670
Epoch:  18  Loss: 0.5145952646
Epoch:  19  Loss: 0.5128606592
Epoch:  20  Loss: 0.5108685769
Epoch:  21  Loss: 0.5089443077
Epoch:  22  Loss: 0.5074048250
Epoch:  23  Loss: 0.5055898273
Epoch:  24  Loss: 0.5046509256
Epoch:  25  Loss: 0.5031963109
Epoch:  26  Loss: 0.5021120718
Epoch:  27  Loss: 0.5004744379
Epoch:  28  Loss: 0.4993738315
Epoch:  29  Loss: 0.4979964234
Epoch:  30  Loss: 0.4968324009
Epoch:  31  Loss: 0.4956372948
Epoch:  32  Loss: 

In [None]:
correct = 0
total = 0
vloss = 0
with torch.no_grad():
    for i, data in enumerate(testloader, 0):
        inputs, labels = data
        if want_cuda and torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
        outputs = net(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss += loss.item()
        #this for MSELoss
        # predicted = torch.round(outputs)
        #this for crossentropy
        _, predicted = torch.max(outputs,1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print("Accuracy: ", round(correct/total *100, 4), "%")

Accuracy:  65.5333 %
