In [1]:
%matplotlib widget
import numpy as np
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import random

import matplotlib.pyplot as plt

import re

from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook

from joblib import Parallel, delayed
import multiprocessing
from datetime import datetime

from torch.utils.data import DataLoader, Dataset

In [2]:
torch.manual_seed(1010101011)
random.seed(1010101011)

In [3]:
polskie = open("../100k.txt", encoding='utf8')
slowa = [slowo.replace("\n", "") for slowo in polskie.readlines()]
polskie.close()
print(slowa[:20])
print(len(slowa))
# slowa = slowa[:50000]

['abmicro', '527579', 'agro2011', 'nakamichi', 'dugant', '29082009', 'hannah1', 'ma12po', '580813', 'lukasz4', 'kriket', 'werohaze', 'dron14121987', 'pinot1', 'maria59', 'moltrex', 'kilim', 'Zabeczka', '7937', '7maluszek']
100000


In [4]:
chartoidx = {}

cnt = 0

longestword = 0

chartoidx["<EMPTY>"] = 0


for slowo in slowa:  
    for litera in slowo:
        if litera not in list(chartoidx.keys()):
            chartoidx[litera] = cnt + 1
            cnt = cnt + 1
            
    if len(slowo) > longestword:
        longestword = len(slowo)

In [5]:
longestword

19

In [6]:
chartoidx

{'<EMPTY>': 0,
 'a': 1,
 'b': 2,
 'm': 3,
 'i': 4,
 'c': 5,
 'r': 6,
 'o': 7,
 '5': 8,
 '2': 9,
 '7': 10,
 '9': 11,
 'g': 12,
 '0': 13,
 '1': 14,
 'n': 15,
 'k': 16,
 'h': 17,
 'd': 18,
 'u': 19,
 't': 20,
 '8': 21,
 'p': 22,
 '3': 23,
 'l': 24,
 's': 25,
 'z': 26,
 '4': 27,
 'e': 28,
 'w': 29,
 'x': 30,
 'Z': 31,
 '6': 32,
 'C': 33,
 'y': 34,
 'j': 35,
 'f': 36,
 'v': 37,
 'F': 38,
 'P': 39,
 'R': 40,
 'O': 41,
 'T': 42,
 'A': 43,
 'X': 44,
 'L': 45,
 'E': 46,
 'S': 47,
 'I': 48,
 'K': 49,
 'N': 50,
 'D': 51,
 'M': 52,
 'q': 53,
 'H': 54,
 'U': 55,
 'J': 56,
 'Q': 57,
 'G': 58,
 'W': 59,
 'B': 60,
 'V': 61,
 'Y': 62,
 '@': 63,
 '.': 64,
 '#': 65,
 '!': 66,
 ';': 67,
 '*': 68,
 '`': 69,
 '_': 70,
 '-': 71,
 '^': 72,
 ' ': 73,
 ',': 74,
 '%': 75,
 '$': 76,
 '/': 77,
 '[': 78,
 ']': 79,
 '=': 80,
 '+': 81}

In [7]:
class MyDataset(Dataset):
    def __init__(self, slowa, chartoidx, longestword):
        self.slowa = slowa
        self.chartoidx = chartoidx
        self.longestword = longestword
        
    def __len__(self):
        return len(self.slowa)
        
    def __getitem__(self, index):
        literyx = []
        literyy = []
        
        slowo = self.slowa[index]
        
        for cnt in range(self.longestword):
            if cnt < (len(slowo) - 1):
                literyx.append(chartoidx[slowo[cnt]])
            else:
                literyx.append(chartoidx["<EMPTY>"])
                
        return np.array(literyx, dtype="float32"), np.array([1], dtype="long")

In [8]:
md = MyDataset(slowa, chartoidx, longestword)

In [9]:
BS = 10000

In [10]:
md.__getitem__(0)

(array([1., 2., 3., 4., 5., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], dtype=float32), array([1]))

In [11]:
dataloader = DataLoader(dataset=md, batch_size=BS, num_workers=0)

In [12]:
class Discriminator(nn.Module):
    def __init__(self, vocabsize, longestword):
        super(Discriminator, self).__init__()
        
        ## WARSTWY
        self.embd = nn.Embedding(vocabsize, 2*vocabsize)
        self.ins1 = nn.Linear(2*vocabsize*longestword, 2*vocabsize*longestword)
        self.hid1 = nn.Linear(2*vocabsize*longestword, vocabsize)
        self.out1 = nn.Linear(vocabsize, 1)
        
        self.norm1 = nn.BatchNorm1d(2*vocabsize*longestword)
        self.norm2 = nn.BatchNorm1d(vocabsize)
        self.drop = nn.Dropout(p=0.1)
        
    def forward(self, x):
        y = self.embd(x).view(len(x), -1)
        
        # LAYER 1
        y = self.ins1(y)
        y = F.relu(y)
        y = self.norm1(y)
        y = self.drop(y)
        
        # LAYER 2
        y = self.hid1(y)
        y = F.relu(y)
        y = self.norm2(y)
        y = self.drop(y)
        
        # LAYER OUT
        y = self.out1(y)
        y = torch.sigmoid(y)
        
        return y

In [13]:
class Generator(nn.Module):
    def __init__(self, vocabsize, inputs, longestword):
        super(Generator, self).__init__()
        
        ## WARSTWY
        self.ins1 = nn.Linear(in_features=inputs, out_features=inputs)
        self.hid1 = nn.Linear(in_features=inputs, out_features=vocabsize)
        self.out1 = nn.Linear(in_features=vocabsize, out_features=vocabsize*longestword)
        
        self.norm1 = nn.BatchNorm1d(inputs)
        self.norm2 = nn.BatchNorm1d(vocabsize)
        self.norm3 = nn.BatchNorm1d(vocabsize*longestword)
        
        self.drop = nn.Dropout(p=0.1)
    
    def forward(self, x):
        # LAYER 1
        y = self.ins1(x)
        y = F.relu(y)
        y = self.norm1(y)
        y = self.drop(y)
        
        # LAYER 2
        y = self.hid1(y)
        y = F.relu(y)
        y = self.norm2(y)
        y = self.drop(y)
        
        # LAYER 3
        y = self.out1(y)
        y = F.relu(y)
        y = self.norm3(y)
        
        y = y.view(-1, longestword, vocabsize)
        
        y = F.log_softmax(y, dim=2)
        
        y = torch.argmax(y, dim=2)
        
        return y

In [14]:
vocabsize = len(list(chartoidx.keys()))
inputs = 10

In [15]:
generator = Generator(vocabsize, inputs, longestword).cuda()
discriminator = Discriminator(vocabsize, longestword).cuda()

In [16]:
criterion = nn.BCELoss()

In [17]:
optimizerGenerator = optim.Adam(generator.parameters(), lr=3e-3)
optimizerClassifier = optim.Adam(discriminator.parameters(), lr=3e-3)

In [18]:
epochs=100000

lossG = []
lossD = []

for epoch in range(epochs):
    for batch in dataloader:
        x = batch[0].long().cuda()
        yreal = batch[1].float().cuda()
        
        d1, d2 = x.shape
        
        # GENEROWANIE FAKE
#         generator.eval()
        seed = torch.randn((d1, inputs)).cuda()
        xfake = generator(seed)
        yfake = torch.zeros((d1, 1)).cuda()
        
        # Wektor dla discriminatora
        TensorIns = torch.cat((x, xfake), 0).cuda()
        TensorOuts = torch.cat((yreal, yfake), ).cuda()
        
        # UCZENIE WLASCIWE
        ## DISCRIMINATOR 
        discriminator.train()
        optimizerClassifier.zero_grad()
        
        y1 = discriminator(TensorIns)
        lossDiscriminator = criterion(y1, TensorOuts)
        
        lossDiscriminator.backward()
        optimizerClassifier.step()
        
        ## GENERATOR
        generator.train()
        optimizerGenerator.zero_grad()
        
#         seed = torch.randn((d1, inputs)).cuda()
#         y3_ = generator(seed)
        y3 = discriminator(xfake)
        lossGenerator = criterion(y3, yreal)
        
        lossGenerator.backward()
        optimizerGenerator.step()
        
        lossG.append(lossGenerator.item())
        lossD.append(lossDiscriminator.item())
        
#     if epoch%10==0: 
    print("Epoch {}/{}, Loss Generator: {:.8f}, Loss Discriminator: {:.8f}".format(epoch+1, epochs, lossGenerator.item(), lossDiscriminator.item()))

Epoch 1/100000, Loss Generator: 1.28624380, Loss Discriminator: 0.04138226
Epoch 2/100000, Loss Generator: 1.54828930, Loss Discriminator: 0.01518707
Epoch 3/100000, Loss Generator: 1.72527993, Loss Discriminator: 0.00712650
Epoch 4/100000, Loss Generator: 1.82836831, Loss Discriminator: 0.00414078
Epoch 5/100000, Loss Generator: 1.88018823, Loss Discriminator: 0.00288166
Epoch 6/100000, Loss Generator: 1.95093536, Loss Discriminator: 0.00222193
Epoch 7/100000, Loss Generator: 1.99317622, Loss Discriminator: 0.00179366
Epoch 8/100000, Loss Generator: 2.03887177, Loss Discriminator: 0.00149681
Epoch 9/100000, Loss Generator: 2.07453465, Loss Discriminator: 0.00128852
Epoch 10/100000, Loss Generator: 2.12652922, Loss Discriminator: 0.00110826
Epoch 11/100000, Loss Generator: 2.16471386, Loss Discriminator: 0.00097643
Epoch 12/100000, Loss Generator: 2.18334699, Loss Discriminator: 0.00086469
Epoch 13/100000, Loss Generator: 2.22488546, Loss Discriminator: 0.00077193
Epoch 14/100000, Loss

KeyboardInterrupt: 

In [19]:
fix, ax = plt.subplots()
ax.plot([x for x in range(len(lossG))], lossG, ".", [x for x in range(len(lossD))], lossD, ".")

FigureCanvasNbAgg()

[<matplotlib.lines.Line2D at 0x1f3142905f8>,
 <matplotlib.lines.Line2D at 0x1f30a8d0320>]

In [20]:
with torch.no_grad():
    generator.eval()
    
    seed = torch.randn((d1, inputs)).cuda()
    y3_ = generator(seed)
    
    
    slowa = []
    
    for slowo in y3.cpu().detach().numpy():
        slw = ""
        
        for index in slowo:
            slw = slw + list(chartoidx.keys())[int(index)]
            
        slowa.append(slw)
    
    print(slowa)

['<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>', '<EMPTY>'