<a href="https://colab.research.google.com/github/FrancescoMorri/Language_Classification/blob/main/notebooks/Model_%26_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Define here your **PATH**:

In [85]:
#this if you work with Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [86]:
PATH = "/content/drive/MyDrive/language/"

## Importing all the libraries

In [87]:
import torch
import torchvision
from torch.utils.data import Dataset
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import time
from torchsummary import summary
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
import numpy as np

# Making the Dataset

## Basic Word Encoding

In [69]:
import pandas as pd

In [70]:
words = pd.read_csv(PATH+"/words_all_unique.csv", usecols=['spelling', 'lexicality'])

In order to make the dataset we need to encode the words in some way. We will use a simple method: simply assigning a binary vector to each letter, then putting together all the vector that make a word, eventually adding 0s at the end if the word is shorter that the longest word in the dataset.</br>
</br>
If we already have everything we just need to load the dataloader.


In [71]:
def word_to_vec(word, max_length):
    n = len(word)
    vec = ''
    for i in range(n):
        cur_char = word[i]
        idx = ord(cur_char) - 97
        tmp = (str(0)*idx) + str(1) + (str(0)*(25-idx))
        vec = vec + tmp
    if n < max_length:
        exce = max_length - n
        vec = vec + (str(0)*26*exce)
    output = []
    for v in vec:
        output.append(float(v))
    return output

In [None]:
def word_to_vec2(word, max_length):
    len_w = len(word)
    chars = [ord(c) for c in word]
    max_char = 122. # this is z
    normal = [round(c/max_char, 5) for c in chars]
    if len_w < max_length:
        diff = max_length - len_w
        zeros =  [0 for i in range(diff)]
        normal.extend(zeros)
    return normal

In [None]:
print(word_to_vec2("hello", max_length=5), word_to_vec2("hellk", max_length=5))

[0.85246, 0.82787, 0.88525, 0.88525, 0.90984] [0.85246, 0.82787, 0.88525, 0.88525, 0.87705]


Now we need a basic function to make the labels vector.

In [72]:
def label_maker(lexicality):
    if lexicality == 'W':
        return 0
    elif lexicality == 'N':
        return 1

We can now define the Dataset class in the standard way.

In [88]:
class WordsDataset(Dataset):

    def __init__(self):
        self.samples = []

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.to_list()
        
        input = self.samples[idx][0]
        label = self.samples[idx][1]
        return input, label
    
    def __addsample__(self, spelling, lexicality, max_length=4):
        if (type(spelling) == type("str")):
            input = torch.tensor(word_to_vec(spelling, max_length=max_length), dtype=torch.float32)
            #check = [False if (i > 1. or i < 0.) else True for i in input]
            #if not all(check):
            #    print("PROBLEM")
            #    print(len(input), spelling)
            if (len(input) > max_length*26):
                print("PROBLEM")
                print(len(input), spelling)
            else:
                # here if it is torch is for the MSELoss, if it is an INT is for the crossentropy
                label = torch.tensor(label_maker(lexicality), dtype=torch.float32)
                #label = label_maker(lexicality)
                self.samples.append([input, label])
        else:
            print("Something Strange:", end='\t')
            print(spelling)

    def __removesample__(self, idx=0, value=None):
        '''
        If value is something, the element corresponding to that value is removed.
        Else the element at index idx is popped.
        '''
        if (value is not None):
            self.samples.remove(value)
        else:
            self.samples.pop(0)
        

In [74]:
dataset = WordsDataset()

The dataset is now empty, we can fill it with all our words.

In [75]:
MAX_LENGTH = 12

counting = 0
for w in words[words['lexicality']=='W']['spelling']:
    if (type(w) == type("str")):
        if (len(w) > MAX_LENGTH):
            pass
        else:
            dataset.__addsample__(w, 'W', max_length=MAX_LENGTH)
            counting += 1

count_non = 0
for w in words[words['lexicality']=='N']['spelling']:
    if (type(w) == type("str")):
        if (len(w) > MAX_LENGTH):
            pass
        else:
            dataset.__addsample__(w, 'N', max_length=MAX_LENGTH)
            count_non +=1

    if count_non == counting:
        break


print("\n\nWords: ",counting)
print("Non-Words: ",count_non)
print("Tot Elements in dataset: ", dataset.__len__())

PROBLEM
364 itsy-bitsy
PROBLEM
364 t-shirt
PROBLEM
364 yo-yo
PROBLEM
364 itty-bitty
PROBLEM
380 TRUE
PROBLEM
336 I
PROBLEM
336 Inf
PROBLEM
434 FALSE
PROBLEM
364 teeny-weeny
PROBLEM
346 se?tes
PROBLEM
346 desple?tive
PROBLEM
346 clage?tet
PROBLEM
346 ete?tion
PROBLEM
346 he?te
PROBLEM
346 we?te
PROBLEM
314 f_ttered
PROBLEM
346 che?ting
PROBLEM
346 ase?tus
PROBLEM
346 apple?te
PROBLEM
346 se?tard
PROBLEM
346 duppe?tic
PROBLEM
346 line?te
PROBLEM
346 he?ting
PROBLEM
346 porre?te
PROBLEM
346 e?tal


Words:  56069
Non-Words:  56069
Tot Elements in dataset:  112113


In [78]:
ratio_test_train = 0.25
test_el = round(dataset.__len__()*ratio_test_train)
train_el = dataset.__len__() - test_el

trainset, testset = torch.utils.data.random_split(dataset, [train_el, test_el])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32)
testloader = torch.utils.data.DataLoader(testset, batch_size=32)

Here we can save the dataloader in order to have reproducible results.

In [79]:
torch.save(trainloader, PATH+"trainloader_12chars_MSE.pth")
torch.save(testloader, PATH+"testloader_12chars_MSE.pth")

## Char2Vec

In [None]:
import pandas as pd

In [None]:
words = pd.read_csv(PATH+"/words_all_unique.csv", usecols=['spelling', 'lexicality'])

In [None]:
!pip install chars2vec
import chars2vec

Collecting chars2vec
[?25l  Downloading https://files.pythonhosted.org/packages/04/0a/8c327aae23e0532d239ec7b30446aca765eb5d9547b4c4b09cdd82e49797/chars2vec-0.1.7.tar.gz (8.1MB)
[K     |████████████████████████████████| 8.1MB 6.2MB/s 
[?25hBuilding wheels for collected packages: chars2vec
  Building wheel for chars2vec (setup.py) ... [?25l[?25hdone
  Created wheel for chars2vec: filename=chars2vec-0.1.7-cp36-none-any.whl size=8111095 sha256=eb1052fd704a7884612f4c0f4bd07ccc748d84880e16f231696409a1c244ba33
  Stored in directory: /root/.cache/pip/wheels/97/b6/65/d7e778ef1213ec77d315aea0f536068b96e36cc94c02abbfde
Successfully built chars2vec
Installing collected packages: chars2vec
Successfully installed chars2vec-0.1.7


In [None]:
from torch.utils.data import Dataset
import torch

In [None]:
def label_maker(lexicality):
    if lexicality == 'W':
        return 0
    elif lexicality == 'N':
        return 1

In [None]:
class WordsDataset(Dataset):

    def __init__(self):
        self.samples = []

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.to_list()
        
        input = self.samples[idx][0]
        label = self.samples[idx][1]
        return input, label
    
    def __addsample__(self, vector_word, lexicality):

        input = torch.tensor(vector_word, dtype=torch.float32)
        #label = torch.tensor(label_maker(lexicality), dtype=torch.float32)
        label = label_maker(lexicality)
        self.samples.append([input, label])

    def __removesample__(self, idx=0, value=None):
        '''
        If value is something, the element corresponding to that value is removed.
        Else the element at index idx is popped.
        '''
        if (value is not None):
            self.samples.remove(value)
        else:
            self.samples.pop(0)
        

In [None]:
c2v_model = chars2vec.load_model('eng_50')
real = []
for i,w in enumerate(words[words['lexicality']=='W']['spelling']):
    if (type(w) == type("str")):
        real.append(w)


nonw = []
for i,w in enumerate(words[words['lexicality']=='N']['spelling']):
    if (type(w) == type("str")):
        nonw.append(w)


real_word_embed = c2v_model.vectorize_words(real)
non_word_embed = c2v_model.vectorize_words(nonw)

In [None]:
print(real_word_embed.shape)
print(non_word_embed.shape)

(61853, 50)
(329845, 50)


In [None]:
dataset = WordsDataset()

In [None]:
count = 0
for rw in real_word_embed:
    dataset.__addsample__(rw, "W")
    count += 1
count2 = 0
for nw in non_word_embed:
    dataset.__addsample__(nw, "N")
    count2 +=1
    if count2 == count:
        break

print(dataset.__len__())

123706


In [None]:
ratio_test_train = 0.2
test_el = round(dataset.__len__()*ratio_test_train)
train_el = dataset.__len__() - test_el

trainset, testset = torch.utils.data.random_split(dataset, [train_el, test_el])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128)
testloader = torch.utils.data.DataLoader(testset, batch_size=128)

In [None]:
torch.save(trainloader, PATH+"trainloader_char2vec_cross.pth")
torch.save(testloader, PATH+"testloader_char2vec_cross.pth")

## Bigrams Dataset

In [None]:
import pandas as pd

In [None]:
words = pd.read_csv(PATH+"/words_all_unique.csv", usecols=['spelling', 'lexicality'])

In [None]:
def create_dict(text, normal=False):
    bigrams = {}
    for w in text:
        for i in range(0, len(w)-1):
            big = w[i]+w[i+1]
            keys = bigrams.keys()
            check = [k == big for k in bigrams.keys()]
            if any(check):
                bigrams[big] += 1
            else:
                bigrams[big] = 1

    if normal:
        max_dict = max(bigrams.values())
        bigrams = {k: v/max_dict for k,v in bigrams.items()}
    return bigrams

In [None]:
def encode_words(words, dictio):
    data = []
    for w in words:
        vect = []
        for i in range(0, len(w)-1):
            big = w[i] + w[i+1]
            val = dictio[big]
            vect.append(val)
        data.append(vect)

    return data

In [None]:
real = []
for i,w in enumerate(words[words['lexicality']=='W']['spelling']):
    if (type(w) == type("str")):
        real.append(w)


nonw = []
for i,w in enumerate(words[words['lexicality']=='N']['spelling']):
    if (type(w) == type("str")):
        nonw.append(w)

In [None]:
real_count = len(real)
select_non = nonw[:real_count]
tot = []
tot.extend(real)
tot.extend(select_non)
dictionary = create_dict(tot, normal=True)

In [None]:
real_encoded = encode_words(real, dictio=dictionary)
non_encoded = encode_words(select_non, dictio=dictionary)

In [None]:
def label_maker(lexicality):
    if lexicality == 'W':
        return 0
    elif lexicality == 'N':
        return 1

In [None]:
class WordsDataset(Dataset):

    def __init__(self):
        self.samples = []

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.to_list()
        
        input = self.samples[idx][0]
        label = self.samples[idx][1]
        return input, label
    
    def __addsample__(self, vector_word, lexicality, max_length):
        
        if len(vector_word) < max_length:
            diff = max_length-len(vector_word)
            zeros = [0 for i in range(diff)]
            vector_word.extend(zeros)
        
        input = torch.tensor(vector_word, dtype=torch.float32)
        #label = torch.tensor(label_maker(lexicality), dtype=torch.float32)
        label = label_maker(lexicality)
        self.samples.append([input, label])

    def __removesample__(self, idx=0, value=None):
        '''
        If value is something, the element corresponding to that value is removed.
        Else the element at index idx is popped.
        '''
        if (value is not None):
            self.samples.remove(value)
        else:
            self.samples.pop(0)
        

In [None]:
dataset = WordsDataset()

In [None]:
MAX_LENGTH = 21

count = 0
for rw in real_encoded:
    if len(rw) < MAX_LENGTH:
        dataset.__addsample__(rw, "W", MAX_LENGTH)
        count += 1

count2 = 0
for nw in non_encoded:
    if len(nw) < MAX_LENGTH:
        dataset.__addsample__(nw, "N", MAX_LENGTH)
        count2 +=1
        if count2 == count:
            break

print(dataset.__len__())

123700


In [None]:
ratio_test_train = 0.2
test_el = round(dataset.__len__()*ratio_test_train)
train_el = dataset.__len__() - test_el

trainset, testset = torch.utils.data.random_split(dataset, [train_el, test_el])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128)
testloader = torch.utils.data.DataLoader(testset, batch_size=128)

In [None]:
torch.save(trainloader, PATH+"trainloader_bigrams_complete_cross.pth")
torch.save(testloader, PATH+"testloader_bigrams_complete_cross.pth")

## Creating Network Model

### Conv

In [None]:
class Conv_Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=8, kernel_size=5) #50*1 -> 46*8
        self.drop1 = nn.Dropout()
        self.batch1 = nn.BatchNorm1d(num_features=8)
        self.max1 = nn.MaxPool1d(kernel_size=2, stride=2) #46*8 -> 23*8
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3) #23*8 -> 21*16
        self.drop2 = nn.Dropout()
        self.batch2 = nn.BatchNorm1d(num_features=16)
        self.max2 = nn.MaxPool1d(kernel_size=3, stride=2) #21*16 -> 10*16
        #self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=5) #42*16 -> 42*16
        #self.drop3 = nn.Dropout()
        #self.batch3 = nn.BatchNorm1d(num_features=16)

        self.linear1 = nn.Linear(in_features=10*16, out_features=32)
        self.linear2 = nn.Linear(in_features=32, out_features=2)

        self.act = nn.ReLU()

    def forward(self, x):
        out = self.conv1(x)
        out = self.drop1(out)
        out = self.batch1(out)
        out = self.act(out)
        out = self.max1(out)

        out = self.conv2(out)
        out = self.drop2(out)
        out = self.batch2(out)
        out = self.act(out)
        out = self.max2(out)

        out = torch.flatten(out, 1)

        out = self.linear1(out)
        out = self.act(out)
        out = self.linear2(out)

        return out

### Feedforward

In [89]:
class Words_Net(nn.Module):
    def __init__(self):
        super().__init__()

        self.linear1 = nn.Linear(312, 400)
        self.batch1 = nn.BatchNorm1d(400) 
        self.drop1 = nn.Dropout()
        self.linear2 = nn.Linear(400, 256)
        self.batch2 = nn.BatchNorm1d(256)
        self.drop2 = nn.Dropout()
        self.linear3 = nn.Linear(256, 64)
        #self.batch3 = nn.BatchNorm1d(64)
        #self.drop3 = nn.Dropout()
        self.linear4 = nn.Linear(64, 32)
        #self.drop4 = nn.Dropout()
        self.linear5 = nn.Linear(32, 1)

        self.act = nn.ReLU()

    def forward(self, x):
        out = self.linear1(x)
        out = self.drop1(out)
        out = self.batch1(out)
        out = self.act(out)

        out = self.linear2(out)
        out = self.drop2(out)
        out = self.batch2(out)
        out = self.act(out)

        out = self.linear3(out)
        #out = self.drop3(out)
        #out = self.batch3(out)
        out = self.act(out)

        out = self.linear4(out)
        #out = self.drop4(out)
        out = self.act(out)

        out = self.linear5(out)

        return out


### Load the dataloader

In [90]:
trainloader = torch.load(PATH+"trainloader_12chars_MSE.pth")
testloader = torch.load(PATH+"testloader_12chars_MSE.pth")

## Training
For the training we will use the GPU, even though is a fairly small network.

In [91]:
net = Words_Net()
#net.load_state_dict(torch.load(PATH+"/net/feed_forward_10chars_MSE_2_3"))

want_cuda = True
have_cuda = torch.cuda.is_available()
if want_cuda and have_cuda:
    net.cuda()
    print(torch.cuda.get_device_name())
else:
    print ("No cuda available!\n")
summary(net, (312,))

Tesla T4
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 400]         125,200
           Dropout-2                  [-1, 400]               0
       BatchNorm1d-3                  [-1, 400]             800
              ReLU-4                  [-1, 400]               0
            Linear-5                  [-1, 256]         102,656
           Dropout-6                  [-1, 256]               0
       BatchNorm1d-7                  [-1, 256]             512
              ReLU-8                  [-1, 256]               0
            Linear-9                   [-1, 64]          16,448
             ReLU-10                   [-1, 64]               0
           Linear-11                   [-1, 32]           2,080
             ReLU-12                   [-1, 32]               0
           Linear-13                    [-1, 1]              33
Total params: 247,729
Trainabl

We define the training parameters, in this case we will use the CrossEntropy loss and the SGD algorithm to train the network.

In [92]:
START = 0
EPOCH = 600
learn = 0.1

criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=learn, weight_decay=4e-3)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [100,200, 400], gamma=0.1)

The training function will return the loss of that epoch, mediated over the iteration on the dataset. It may also be added the option to acquire interesting data.

In [93]:
def accuracy():
    correct = 0
    total = 0
    with torch.no_grad():
        for i, data in enumerate(testloader, 0):
            inputs, labels = data
            if want_cuda and torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            # need to squeeze if MSELoss
            outputs = net(inputs).squeeze()
            #loss = criterion(outputs, labels)
            #loss += loss.item()
            #this for MSELoss
            predicted = torch.round(outputs)
            #this for crossentropy
            #_, predicted = torch.max(outputs,1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    return (round(correct/total *100, 4))

In [94]:
def overfit_check():
    correct = 0
    total = 0
    with torch.no_grad():
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            if want_cuda and torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            # need to squeeze if MSELoss
            outputs = net(inputs).squeeze()
            #loss = criterion(outputs, labels)
            #loss += loss.item()
            #this for MSELoss
            predicted = torch.round(outputs)
            #this for crossentropy
            #_, predicted = torch.max(outputs,1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    return (round(correct/total *100, 4))   

In [95]:
def training(acquire = False , PATH = None):
    running_loss = 0.0
    losst = 0
    index = 0
    for i, data in enumerate(trainloader, 0):
        # get the inputs, maybe they need to be tensors?
        inputs, labels = data

        if want_cuda and have_cuda:
          inputs = inputs.cuda()
          labels = labels.cuda()

        # need to squeeze if MSELoss
        outputs = net(inputs).squeeze()

        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losst +=loss.item()
        index +=1
    return losst/index

Now we can do the actual training of the network.

In [96]:
start =time.time()
graph_data = np.empty((0,4))
old_data = False
mean_loss = 0
old_acc = 0
acc_check = 0
best_epoch = 0

for epoch in range(START, EPOCH):
    loss = training()    
    print("Epoch: ", epoch, " Loss: %.10f"%(loss))
    mean_loss += loss

    if (epoch % 100 == 99):
        print("Mean loss: %.10f"%(mean_loss/100))
        mean_loss = 0
        print('Estimated time: %.3f min' %((EPOCH- epoch)*(time.time() - start)/(60*epoch)) )


    if (epoch%5 == 0):
        net.eval()
        over_check = overfit_check()
        acc = accuracy()
        acc_check += 1
        print("\n------>  Accuracy on TestSet: %.4f  <------"%(acc))
        print("------>  Accuracy on TrainSet: %.4f  <------\n"%(over_check))

        graph_data = np.append(graph_data, [[epoch, loss, acc, over_check]], axis=0)

        if old_data:
            f = open(PATH+"graphs/feed_forward_12chars_MSE_1.csv", 'a')
            np.savetxt(f, graph_data)
            f.close()
            graph_data = np.empty((0,4))
        else:
            np.savetxt(PATH+"graphs/feed_forward_12chars_MSE_1.csv", graph_data)
            graph_data = np.empty((0,4))
            old_data = True
        
        if (acc > old_acc):
            old_acc = acc
            torch.save(net.state_dict(), PATH+"net/feed_forward_12chars_MSE_1")
            acc_check = 0
            best_epoch = epoch

    '''
    if (acc_check > 200):
        print("NET STOPPED LEARNING!!")
        print("\nEpoch:%.d \nLoss:%.10f \nBest Accuracy:%.4f"%(epoch, loss, old_acc))
        break
    '''

    scheduler.step()
    net.train()
        
elapsed_time = time.time() - start
print('Finished Training (elapsed time %.3f min)' %(elapsed_time/60))
net.load_state_dict(torch.load(PATH+"net/feed_forward_12chars_MSE_1"))
print("\nBest net loaded!!")
print("\nBest Test Accuracy:%.5f \nEpoch of best accuracy:%.d \nFinal Accuracy on Train:%.5f"%(old_acc, best_epoch, over_check))

Epoch:  0  Loss: 0.2484271061

------>  Accuracy on TestSet: 53.4965  <------
------>  Accuracy on TrainSet: 54.3533  <------

Epoch:  1  Loss: 0.2429579442
Epoch:  2  Loss: 0.2408162436
Epoch:  3  Loss: 0.2404575586
Epoch:  4  Loss: 0.2400766886
Epoch:  5  Loss: 0.2398237517

------>  Accuracy on TestSet: 59.3728  <------
------>  Accuracy on TrainSet: 61.0323  <------

Epoch:  6  Loss: 0.2395697270
Epoch:  7  Loss: 0.2395236165
Epoch:  8  Loss: 0.2396345064
Epoch:  9  Loss: 0.2395070399
Epoch:  10  Loss: 0.2397915876

------>  Accuracy on TestSet: 59.6404  <------
------>  Accuracy on TrainSet: 61.2975  <------

Epoch:  11  Loss: 0.2397199101
Epoch:  12  Loss: 0.2396012527
Epoch:  13  Loss: 0.2394333616
Epoch:  14  Loss: 0.2395844782
Epoch:  15  Loss: 0.2394443075

------>  Accuracy on TestSet: 59.5191  <------
------>  Accuracy on TrainSet: 60.8836  <------

Epoch:  16  Loss: 0.2395601587
Epoch:  17  Loss: 0.2391925038
Epoch:  18  Loss: 0.2395507946
Epoch:  19  Loss: 0.2396191405
Ep

In [97]:
net.load_state_dict(torch.load(PATH+"net/feed_forward_12chars_MSE_1"))
correct = 0
total = 0
vloss = 0
net.eval()
with torch.no_grad():
    for i, data in enumerate(testloader, 0):
        inputs, labels = data
        if want_cuda and torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
        #squeeze for MSE
        outputs = net(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss += loss.item()
        #this for MSELoss
        predicted = torch.round(outputs)
        #this for crossentropy
        #_, predicted = torch.max(outputs,1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print("Accuracy: ", round(correct/total *100, 4), "%")

Accuracy:  75.1035 %
