<a href="https://colab.research.google.com/github/DerNiccoo/TheEarlyBird/blob/main/Aufgabe5/Nico/LSTM_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
!pip install -U spacy[cuda100] de
!python -m spacy download de

Collecting spacy[cuda100]
[?25l  Downloading https://files.pythonhosted.org/packages/50/b2/12466d3018bb84b039139ef76436ea7a01e98125c2aee6a81e527eb4ebe1/spacy-2.3.4-cp36-cp36m-manylinux2014_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 13.3MB/s 
[?25hCollecting de
  Downloading https://files.pythonhosted.org/packages/d7/b8/ee2e5c22775450d2a8751273a4b3cfa6e47cdb6393dd803b6317a85d213c/de-0.1.tar.gz
Collecting thinc<7.5.0,>=7.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/1b/c9/ce2e03720a5647fd90da575325376ff258653a05f357aa970fd87e6c1a55/thinc-7.4.3-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 50.4MB/s 
[?25hCollecting cupy-cuda100<8.0.0,>=5.0.0b4; extra == "cuda100"
[?25l  Downloading https://files.pythonhosted.org/packages/7f/4c/3454f4e633e721f983ba19d54983734d9276ced336178d264533f06d12a6/cupy_cuda100-7.8.0-cp36-cp36m-manylinux1_x86_64.whl (348.0MB)
[K     |████████████████████████████████| 348.

In [64]:
import pandas as pd
url = 'https://raw.githubusercontent.com/tblock/10kGNAD/master/train.csv'
df_train = pd.read_csv(url, error_bad_lines=False, sep=';', usecols=range(2), names=['labels','text'])

url = 'https://raw.githubusercontent.com/tblock/10kGNAD/master/test.csv'
df_test = pd.read_csv(url, error_bad_lines=False, sep=';', usecols=range(2), names=['labels','text'])

In [65]:
df_train = df_train.loc[(df_train['labels'] == 'Sport') | (df_train['labels'] == 'Wirtschaft')]
df_test = df_test.loc[(df_test['labels'] == 'Sport') | (df_test['labels'] == 'Wirtschaft')]

In [61]:
df_train

Unnamed: 0,labels,text
0,Sport,21-Jähriger fällt wohl bis Saisonende aus. Wie...
3,Wirtschaft,"Putin: ""Einigung, dass wir Menge auf Niveau vo..."
6,Sport,Traditionsklub setzt sich gegen den FC Utrecht...
9,Sport,Abschiedstournee für Guardiola beginnt beim HS...
10,Sport,SSC nach 5:1-Erfolg bei Robert Guchers Frosino...
...,...,...
9221,Wirtschaft,Austria Glas Recycling appelliert an Bevölkeru...
9228,Wirtschaft,"Kein Kommentar, ob Raffinerie in Schwechat ode..."
9235,Wirtschaft,Günter Geyer zieht nach wie vor die Fäden – El...
9238,Wirtschaft,Der heimische Baukonzern zieht einen Großauftr...


In [62]:
import string

valid_chars = string.ascii_letters + 'ÄÖÜäöüß-' + string.punctuation + string.digits + string.whitespace

def check_chars(row):
  for char in row:
    if char not in valid_chars:
      return True

  return False

In [63]:
df_train = df_train[df_train['text'].apply(check_chars) == True]
df_test = df_test[df_test['text'].apply(check_chars) == True]

In [66]:
import spacy
gpu = spacy.prefer_gpu()
print('GPU:', gpu)

GPU: True


In [67]:
nlp = spacy.load('de')

In [68]:
vocab = {}

for index, row in df_train.iterrows():
  doc = nlp(row['text'])
  for token in doc:
    if token.text in vocab:
      vocab[token.text] += 1
    else:
      vocab[token.text] = 0

In [73]:
sorted_vocab = dict(sorted(vocab.items(), key=lambda item: item[1], reverse=True))

In [75]:
vocab = list(sorted_vocab)[:5000]

In [86]:
word_to_ix = {}

for index, word in enumerate(vocab):
  word_to_ix[word] = index

tag_to_ix = {"Sport": 0, "Wirtschaft": 1}

In [88]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

EMBEDDING_DIM = 128
HIDDEN_DIM = 256

In [142]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [185]:
def prepare_sequence(seq, to_ix):
  idxs = []

  for w in seq:
    if w in to_ix:
      idxs.append(to_ix[w])

  return torch.tensor(idxs, dtype=torch.long)

def target_tensor(target):
  tensor = torch.zeros(2, dtype=torch.long)
  tensor[target[0]] = 1
  return tensor

import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [179]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [199]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()

start = time.time()

for epoch in range(5):  # again, normally you would NOT do 300 epochs, it is toy data
    for index, row in df_train.iterrows():
      sentence = row['text'].split()
      tags = row['labels'].split()
      # Step 1. Remember that Pytorch accumulates gradients.
      # We need to clear them out before each instance
      model.zero_grad()

      # Step 2. Get our inputs ready for the network, that is, turn them into
      # Tensors of word indices.
      sentence_in = prepare_sequence(sentence, word_to_ix)
      targets = prepare_sequence(tags, tag_to_ix)

      if len(sentence_in) == 0:
        continue

      # Step 3. Run our forward pass.
      tag_scores = model(sentence_in)

      # Step 4. Compute the loss, gradients, and update the parameters by
      #  calling optimizer.step()

      target = target_tensor(targets)
      target = torch.reshape(targets, (1, 1))

      output = tag_scores[-1]
      output = torch.reshape(output, (1, output.shape[0]))

      loss = loss_function(output, targets)
      loss.backward()
      optimizer.step()

      if index % 500 == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), index, index / len(df_train) * 100, loss))

    print('%s Epoch %d / 5' % (timeSince(start), epoch + 1))

0m 0s (0 0%) 0.6752
0m 6s (2500 106%) 0.5919
0m 8s (3500 148%) 0.2074
0m 16s (6500 276%) 0.1102
0m 17s (7000 297%) 0.5245
0m 23s Epoch 1 / 5
0m 23s (0 0%) 0.1196
0m 29s (2500 106%) 0.2178
0m 32s (3500 148%) 0.0502
0m 40s (6500 276%) 0.0339
0m 41s (7000 297%) 0.0494
0m 47s Epoch 2 / 5
0m 47s (0 0%) 0.0156
0m 53s (2500 106%) 0.0343
0m 55s (3500 148%) 0.4604
1m 3s (6500 276%) 0.0098
1m 4s (7000 297%) 0.0094
1m 10s Epoch 3 / 5
1m 10s (0 0%) 0.0059
1m 16s (2500 106%) 0.0631
1m 19s (3500 148%) 0.0216
1m 26s (6500 276%) 0.0077
1m 27s (7000 297%) 0.0350
1m 33s Epoch 4 / 5
1m 33s (0 0%) 0.3078
1m 39s (2500 106%) 0.0014
1m 42s (3500 148%) 0.0717
1m 49s (6500 276%) 0.0056
1m 51s (7000 297%) 0.0026
1m 56s Epoch 5 / 5


In [192]:
cats = ['Sport', 'Wirtschaft']

def test_sentence(sentence):
  sentence_in = prepare_sequence(sentence, word_to_ix)
  output = model(sentence_in)

  _, topi = output[-1].topk(1)

  return cats[topi]

In [197]:
def test_model():

  correct = 0

  for index, row in df_test.iterrows():
    output = test_sentence(row['text'].split())
    if output == row['labels']:
      correct += 1

  print('Acc: {:.2f}'.format(correct / len(df_test) * 100))

In [198]:
test_model()

Acc: 89.27
