In [243]:
import fasttext
import fasttext.util
import time
import torch
import random
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.autograd import Variable
from torch.nn import Linear, Sigmoid, ReLU, Dropout, Embedding, MSELoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm1d, BatchNorm2d, Dropout
from torch.optim import Adam, SGD
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the Turkish fastText model from the working directory.
# NOTE(review): the file name is the same one the save_model cell further down writes, so a
# fresh "Restart & Run All" presumably needs that file to exist beforehand — confirm.
ft = fasttext.load_model("cc.tr.100.bin")



In [5]:
# Ask fastText to reduce the loaded model to 100-dimensional vectors.
# The call returns the model object, which is what the cell displays.
fasttext.util.reduce_model(ft, 100)

<fasttext.FastText._FastText at 0x7fc8ea166dc0>

In [29]:
# Write the current model to disk under the name the load cell reads.
ft.save_model('cc.tr.100.bin')

In [3]:
# Sanity check: display the embedding dimensionality of the loaded model.
ft.get_dimension()

100

In [4]:
# Wrap fastText's input matrix as a torch tensor and display row 1 as a spot check.
word_vectors = torch.from_numpy(ft.get_input_matrix())
word_vectors[1]

tensor([-1.1749, -0.1436, -0.6654, -0.1926, -0.3531, -0.0698, -0.0701, -0.2173,
         0.4043, -0.0094, -0.0227,  0.2733,  0.0446, -0.0148, -0.1951,  0.2377,
         0.1824,  0.2891, -0.2884, -0.0603,  0.2166,  0.2423, -0.0496, -0.1195,
        -0.1218, -0.3270,  0.1299, -0.1416,  0.2645, -0.0136, -0.0608,  0.0016,
         0.4760,  0.0261, -0.0905, -0.0853, -0.2786,  0.1290,  0.0092, -0.2808,
         0.0391, -0.0272, -0.1067,  0.2834, -0.2156,  0.0348, -0.0155,  0.1579,
        -0.0482, -0.0610, -0.0649,  0.2014,  0.0933,  0.0381,  0.0060,  0.0237,
         0.1157, -0.0642,  0.0217, -0.0306,  0.0130, -0.0622,  0.0378,  0.1764,
         0.0881, -0.1186,  0.0831,  0.0966, -0.0085, -0.0857,  0.2434, -0.1090,
         0.1817,  0.0233, -0.0803,  0.0203, -0.0254,  0.1282,  0.0634,  0.0522,
        -0.0733, -0.0209, -0.0849,  0.0318,  0.0038, -0.0284,  0.2016, -0.0319,
         0.1466, -0.1054,  0.1788, -0.0199, -0.0951, -0.1183,  0.0230,  0.0398,
         0.0410, -0.0598,  0.1671, -0.03

In [5]:
# Exploratory cell: grab the model vocabulary and try a few lookups.
words = ft.words
#words
#print(ft[words[1]])
# Vector lookups for a word and for a suffix-like fragment.
# NOTE(review): `isp` and `las` are only referenced by the commented-out print below.
isp = ft["ispanyol"]
las = ft["laştıramadıklarımız"]
#print(np.add(isp,las))
#print(word_dict["anamurlulaştırabilemediklerimizdekilerden"])
# Displayed result: the default fastText tokenizer applied to a two-word string.
fasttext.tokenize('anamurlulaştırabilemediklerimizdekilerden misiniz')

['anamurlulaştırabilemediklerimizdekilerden', 'misiniz']

In [6]:
# Map every vocabulary word to its fastText vector.
word_dict = {word: ft[word] for word in words}

In [149]:
# Rough wall-clock timing of 130000 vector lookups.
# Each lookup uses a word drawn at random from the first 40000 entries of `words`.
# NOTE(review): indexing assumes `words` has at least 40000 entries — confirm.
start = time.time()
for i in range(130000):
    n = words[int(np.random.rand()*40000)]
    ft[n]
end = time.time()
# Elapsed seconds for the whole loop.
print(end-start)

1.0694279670715332


In [132]:
# Exploratory subword probes; only the last expression's value is displayed by the cell.
ft.get_subwords('ispanyollaştıramadıklarımızdan')
ft["spany"]
ft.get_subwords("pany")

(['<pany', 'pany>'], array([2919705, 2308259]))

In [74]:
def padded_weights(weights):
    """Prepend an all-zero row to an embedding matrix for use as the padding vector.

    Args:
        weights: 2-D tensor of shape (num_embeddings, embedding_dim).

    Returns:
        Tuple ``(padded, padding_idx)``. ``padded`` has shape
        (num_embeddings + 1, embedding_dim): row 0 is all zeros and the original
        rows follow, shifted down by one. ``padding_idx`` is always 0.
    """
    # new_zeros takes dtype and device from `weights`, and the width is read from
    # the matrix itself, so any embedding size works and torch.cat never has to
    # reconcile mismatched dtypes.
    zero_row = weights.new_zeros((1, weights.shape[1]))

    return torch.cat((zero_row, weights), 0), 0

In [75]:
def embedding_layer(weights):
    """Build a frozen nn.Embedding from pre-trained word vectors.

    Args:
        weights: 2-D tensor of embedding vectors; row 0 is treated as padding.

    Returns:
        An ``nn.Embedding`` initialised from ``weights`` with ``freeze=True``
        and ``padding_idx=0``.
    """
    return nn.Embedding.from_pretrained(
        embeddings=weights,
        padding_idx=0,
        freeze=True,
    )

In [175]:
# Exploratory cell: view one word vector as a column and as a 10x10 grid.
# NOTE(review): this first assignment is overwritten by the next line.
vec = np.zeros((100,1))
# Column form: wrap the vector in a 1-row array and transpose.
vec = np.array([ft["merhaba"]]).T
vector = torch.tensor(ft['merhaba'])
# Same values laid out as a 10x10 grid; displayed as the cell output.
vector = vector.reshape((10,10))
vector

tensor([[-0.0152,  0.0435, -0.0073, -0.0884,  0.0664, -0.0027, -0.0515, -0.1511,
          0.0180, -0.1195],
        [ 0.0248,  0.1206, -0.0402,  0.0265,  0.0137, -0.1065,  0.0007, -0.1325,
          0.0063, -0.0888],
        [ 0.0185,  0.0195, -0.1128,  0.0521,  0.0658, -0.0568,  0.0518, -0.0650,
         -0.1146,  0.0599],
        [ 0.0449, -0.1273,  0.0169,  0.0863,  0.0446,  0.0424, -0.0136,  0.1266,
         -0.0532,  0.0186],
        [-0.0458, -0.0063,  0.0399,  0.0297, -0.0477, -0.0396,  0.0121, -0.0358,
          0.0650, -0.0146],
        [ 0.0074,  0.0594, -0.0624,  0.0389,  0.0395,  0.0046,  0.0941,  0.0610,
          0.1056, -0.0180],
        [ 0.0169,  0.0095,  0.0334,  0.0096,  0.0335,  0.0078,  0.0240,  0.0289,
         -0.0027,  0.0461],
        [ 0.0061, -0.0039, -0.0721,  0.0016, -0.0369, -0.0094, -0.0216, -0.0137,
         -0.0080, -0.0882],
        [ 0.0066,  0.0192,  0.0198,  0.0082,  0.0457, -0.0351,  0.0222,  0.0817,
          0.0192,  0.0561],
        [ 0.0330,  

In [38]:
def get_input_vectors(words, padding, ft, input_shape):
    """Embed a batch of strings as stacks of reshaped word vectors.

    Each string is split with the default fastText tokenizer. Every token's
    vector is looked up as ``ft[token]``, reshaped to ``input_shape`` and stored
    as one channel of that sample.

    Args:
        words: sequence of input strings, one per sample.
        padding: number of token slots per sample. Strings with more tokens are
            truncated to the first ``padding``; unused slots stay all-zero.
        ft: object supporting ``ft[token]`` and returning that token's vector
            (the fastText model in this notebook).
        input_shape: ``(h, w)`` layout for one vector; ``h * w`` must equal the
            vector length.

    Returns:
        float64 tensor of shape ``(len(words), padding, h, w)``.
    """
    h, w = input_shape
    batch_size = len(words)

    # np.zeros defaults to float64; untouched slots act as the zero padding.
    in_vector = np.zeros((batch_size, padding, h, w))

    for sample_idx, sentence in enumerate(words):
        tokens = fasttext.tokenize(sentence)
        # Slicing truncates to `padding` tokens and, unlike a break-after-write
        # loop, writes nothing at all when padding == 0.
        for slot_idx, token in enumerate(tokens[:padding]):
            in_vector[sample_idx, slot_idx] = np.asarray(ft[token]).reshape(h, w)

    return torch.tensor(in_vector)

In [39]:
# Shape check: 2 sentences, 4 token slots each, every vector laid out as 25x4.
get_input_vectors(["merhaba benim adim alicanhas", "olmaz oyle is"], 4, ft, (25,4)).shape

torch.Size([2, 4, 25, 4])

In [59]:
class CNet(Module):
    """Convolutional binary classifier over stacked word-vector grids.

    The network has two parts:

    * ``cnn_layers``: four Conv2d -> BatchNorm2d -> ReLU stages, the first three
      followed by a 2x2, stride-1 max-pool. For a ``[N, 4, 10, 10]`` input the
      feature maps go
      ``[N, 10, 10, 10] -> pool [N, 10, 9, 9] -> [N, 30, 5, 5] -> pool [N, 30, 4, 4]
      -> [N, 50, 4, 4] -> pool [N, 50, 3, 3] -> [N, 100, 1, 1]``.
    * ``out_layers``: a stack of Linear/ReLU layers ending in ``Linear(16, 1)``
      and a Sigmoid, giving one value per sample.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel_size, padding, followed_by_max_pool)
        conv_specs = (
            (4, 10, 3, 1, True),
            (10, 30, 5, 0, True),
            (30, 50, 1, 0, True),
            (50, 100, 3, 0, False),
        )
        conv_stack = []
        for c_in, c_out, kernel, pad, pooled in conv_specs:
            conv_stack.append(Conv2d(c_in, c_out, kernel_size=kernel, stride=1, padding=pad))
            conv_stack.append(BatchNorm2d(c_out))
            conv_stack.append(ReLU(inplace=True))
            if pooled:
                conv_stack.append(MaxPool2d(kernel_size=2, stride=1))
        self.cnn_layers = Sequential(*conv_stack)

        # Fully connected head: hidden Linear+ReLU pairs, then the sigmoid output unit.
        hidden_widths = (100, 256, 512, 512, 256, 64, 16, 16)
        dense_stack = []
        for fan_in, fan_out in zip(hidden_widths[:-1], hidden_widths[1:]):
            dense_stack.append(Linear(fan_in, fan_out))
            dense_stack.append(ReLU())
        dense_stack.append(Linear(16, 1))
        dense_stack.append(Sigmoid())
        self.out_layers = Sequential(*dense_stack)

    def forward(self, x):
        """Run the conv stack, drop the 1x1 spatial dims, and classify.

        The reshape to ``(batch, channels)`` only succeeds when the conv output
        has a single spatial position per channel.
        """
        features = self.cnn_layers(x)
        n_samples, n_channels = features.shape[0], features.shape[1]
        flat = features.reshape((n_samples, n_channels))
        return self.out_layers(flat)
        

In [40]:
# Shape check for the LNet layout: 2 sentences, 4 token slots, vectors kept as 100x1 columns.
get_input_vectors(["merhaba benim adim alicanhas", "olmaz oyle is"], 4, ft, (100,1)).shape

torch.Size([2, 4, 100, 1])

In [114]:
class LNet(Module):
    """Binary classifier that mixes token channels with 1x1 convs, then a deep MLP.

    * ``cnn_layers``: two 1x1 convolutions. For a ``[N, 4, 100, 1]`` input the
      first expands the 4 channels to 100 (``[N, 100, 100, 1]``) and the second
      collapses them to 1 (``[N, 1, 100, 1]``), with Dropout2d, BatchNorm2d and
      ReLU after it.
    * ``out_layers``: ten Linear -> BatchNorm1d -> ReLU stages followed by
      ``Linear(16, 1)`` and a Sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.drop_prob = 0.1

        channel_mixer = [
            # in: [N, 4, 100, 1] -> out: [N, 100, 100, 1]
            Conv2d(4, 100, kernel_size=1, stride=1, padding=0),
            BatchNorm2d(100),
            ReLU(inplace=True),
            # in: [N, 100, 100, 1] -> out: [N, 1, 100, 1]
            Conv2d(100, 1, kernel_size=1, stride=1),
            nn.Dropout2d(self.drop_prob),
            BatchNorm2d(1),
            ReLU(inplace=True),
        ]
        self.cnn_layers = Sequential(*channel_mixer)

        # Widths of the fully connected stack; each consecutive pair is one
        # Linear -> BatchNorm1d -> ReLU stage.
        widths = (100, 256, 512, 1024, 256, 256, 256, 256, 64, 64, 16)
        classifier = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            classifier.extend([Linear(fan_in, fan_out), BatchNorm1d(fan_out), ReLU()])
        classifier.extend([Linear(16, 1), Sigmoid()])
        self.out_layers = Sequential(*classifier)

    def forward(self, x):
        """Mix channels, flatten to ``(batch, height)``, and classify.

        The reshape only succeeds when the conv output has one channel and
        width 1, i.e. ``[N, 1, H, 1]``.
        """
        mixed = self.cnn_layers(x)
        flat = mixed.reshape((mixed.shape[0], mixed.shape[2]))
        return self.out_layers(flat)
        

In [244]:
def count_parameters(model):
    """Return the number of scalar parameters in ``model`` that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

In [227]:
def train(train_loader, model, loss, optimizer):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: iterable yielding ``(inputs, labels)`` batches.
        model: module to train; must expose ``out_layers[7].weight`` (used for
            the weight-sum diagnostic below).
        loss: callable ``loss(predictions, labels)`` returning a scalar tensor.
        optimizer: optimizer over ``model``'s parameters.

    Returns:
        Tuple ``(last_loss, s)``:
        * ``last_loss``: detached scalar tensor holding the loss of the final
          batch (a zero tensor if the loader yields no batches).
        * ``s``: sum of the weights of ``model.out_layers[7]``, a cheap signal
          that the parameters are changing between epochs.
    """
    model.train()

    # Batches are moved to wherever the model lives so a CUDA model can be fed
    # from a CPU-backed loader.
    device = next(model.parameters()).device

    last_loss = torch.zeros((), device=device)

    for inputs, labels in train_loader:
        inputs = inputs.to(device).float()
        labels = labels.to(device)

        optimizer.zero_grad()
        predictions = model(inputs)
        loss_tr = loss(predictions, labels)
        loss_tr.backward()
        optimizer.step()

        # Track the most recent batch regardless of how many batches there are;
        # detach so the stored value does not keep the autograd graph alive.
        last_loss = loss_tr.detach()

    s = torch.sum(model.out_layers[7].weight.data)

    return last_loss, s

        


In [230]:
def test(model, val_loader, loss):
    
    correct = 0
    total = 0
    test_loss = 0
    
    model = model.float()

    with torch.no_grad():
        for data in val_loader:
            
            sents, labels = data
            
            outputs = model(sents.float())
            
            test_loss = loss(outputs, labels)
            
            outputs = outputs>=0.5
            
            total += labels.size(0)
            correct += (outputs==labels).sum().item()
    
    accuracy = 100*correct/total

    #print("Accuracy on the validation set of ", total, " items is: ", (accuracy))
    
    return accuracy, test_loss
    

In [None]:

# Training cell: load the data, split it, build the model, train and plot the losses.
# NOTE(review): this cell needs read_file_to_str, get_input_vectors, LNet, train, test,
# count_parameters and `ft` to exist in the kernel. read_file_to_str is defined in a later
# cell of this notebook, so a fresh top-to-bottom run fails here unless that cell is moved up.
X, Y, n = read_file_to_str("train.txt")

X = get_input_vectors(X, 4, ft, (100,1))
Y = torch.tensor(Y)
Y = Y.reshape(Y.shape[0], 1)
print(Y.shape)
print(X.shape)

# Keep the data on the same device as the model so batches can be fed to it directly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data = TensorDataset(X.to(device), Y.to(device))

val_perc = 7/100
n_val = int(n*val_perc)
n_tr = n - n_val

# random_split returns the subsets in the order of the lengths list: validation first.
data_val, data_tr = random_split(data, [n_val, n_tr], generator=torch.Generator().manual_seed(42))

train_loader = DataLoader(data_tr, batch_size=10, shuffle = True)
# The whole validation set is served as a single batch.
val_loader = DataLoader(data_val, batch_size=n_val, shuffle=True)

print("Data_Val shape: ", n_val, "Data_tr: ", n_tr)

#define the model
model = LNet()
#model = CNet()

#optimizer
optimizer = Adam(model.parameters(), lr = 0.01)

#loss function
loss = nn.MSELoss()

model = model.to(device)
loss = loss.to(device)

print(model)
print("Total Parameters: ", count_parameters(model))

model = model.float()

train_losses = []
train_weights = []
test_accs = []
test_losses = []


epoch = 25

for e in range(epoch):

    train_loss, weight = train(train_loader, model, loss, optimizer)
    test_acc, test_loss = test(model, val_loader, loss)

    # Convert to plain floats: works for tensors on any device as well as Python numbers,
    # and keeps autograd graphs and device tensors out of the history lists that get plotted.
    train_loss = float(train_loss)
    weight = float(weight)
    test_loss = float(test_loss)

    print("Epoch: ", e, " Train Loss: %.3f" %train_loss, " Train Weights: %.1f" %weight, 
          " Test Loss: %.3f" %test_loss, " Test Acc: %.2f" %test_acc,)
    train_losses.append(train_loss)
    test_accs.append(test_acc)
    test_losses.append(test_loss)
    train_weights.append(weight)

plt.plot(train_losses, label="Training Losses")
plt.plot(test_losses, label="Test Losses")
plt.legend()
plt.show()

torch.Size([250, 1])
torch.Size([250, 4, 100, 1])
Data_Val shape:  17 Data_tr:  233
LNet(
  (cnn_layers): Sequential(
    (0): Conv2d(4, 100, kernel_size=(1, 1), stride=(1, 1))
    (1): BatchNorm2d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(100, 1, kernel_size=(1, 1), stride=(1, 1))
    (4): Dropout2d(p=0.1, inplace=False)
    (5): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
  )
  (out_layers): Sequential(
    (0): Linear(in_features=100, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=512, bias=True)
    (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=1024, bias=True)
    (7): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine

In [136]:
def read_file_to_str(file_path, encoding="utf-8"):
    """Read a ``label,text`` file into parallel lists.

    Each non-blank line is split at its FIRST comma only, so the text part may
    itself contain commas. The text is returned exactly as read, including its
    trailing newline.

    Args:
        file_path: path of the text file to read.
        encoding: text encoding used to open the file (default UTF-8, so the
            result does not depend on the platform's locale).

    Returns:
        Tuple ``(data_x, data_y, n)``: the list of text strings, the list of
        labels converted to float, and the number of examples read.

    Raises:
        ValueError: if a non-blank line has no comma or its label is not a number.
    """
    data_x = []
    data_y = []

    # The context manager closes the file even if a line fails to parse.
    with open(file_path, 'r', encoding=encoding) as file:
        for line_no, line in enumerate(file, 1):
            # Skip blank lines, e.g. an empty line at the end of the file.
            if not line.strip():
                continue

            label, separator, text = line.partition(",")
            if not separator:
                raise ValueError(
                    "line %d of %s has no ',' separating label and text" % (line_no, file_path)
                )

            data_y.append(float(label))
            data_x.append(text)

    return data_x, data_y, len(data_x)
        

In [165]:
# Count the examples in the training file; the text and label lists are discarded here.
_, _, n = read_file_to_str("train.txt")
print('Num Train Examples: ', n)

Num Train Examples:  250
