<a href="https://colab.research.google.com/github/DerNiccoo/TheEarlyBird/blob/main/Aufgabe5/Nico/LSTM_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install/upgrade spaCy with the CUDA 10.0 extra and download the German model
# that is loaded further down via spacy.load.
# NOTE(review): the trailing `de` argument makes pip install a separate PyPI
# package literally named "de" (version 0.1 according to the install log);
# it is presumably unintended and not the spaCy model -- confirm and drop it.
!pip install -U spacy[cuda100] de
!python -m spacy download de

Requirement already up-to-date: spacy[cuda100] in /usr/local/lib/python3.6/dist-packages (2.3.4)
Requirement already up-to-date: de in /usr/local/lib/python3.6/dist-packages (0.1)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [2]:
import pandas as pd
# Download the 10kGNAD train/test splits directly from GitHub.
# The files are ';'-separated without a header row; only the first two columns
# (label, article text) are kept and malformed lines are skipped.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines='skip'`; switch once the runtime's pandas version allows it.
url = 'https://raw.githubusercontent.com/tblock/10kGNAD/master/train.csv'
df_train = pd.read_csv(
    url,
    sep=';',
    names=['labels', 'text'],
    usecols=range(2),
    error_bad_lines=False,
)

url = 'https://raw.githubusercontent.com/tblock/10kGNAD/master/test.csv'
df_test = pd.read_csv(
    url,
    sep=';',
    names=['labels', 'text'],
    usecols=range(2),
    error_bad_lines=False,
)

In [3]:
# Reduce the task to a binary problem: keep only 'Sport' and 'Wirtschaft' articles.
df_train = df_train.loc[df_train['labels'].isin(['Sport', 'Wirtschaft'])]
df_test = df_test.loc[df_test['labels'].isin(['Sport', 'Wirtschaft'])]

In [4]:
# Inspect the training frame after restricting it to the two target labels.
df_train

Unnamed: 0,labels,text
0,Sport,21-Jähriger fällt wohl bis Saisonende aus. Wie...
3,Wirtschaft,"Putin: ""Einigung, dass wir Menge auf Niveau vo..."
5,Wirtschaft,Der Welser Stempelhersteller verbreitert sich ...
6,Sport,Traditionsklub setzt sich gegen den FC Utrecht...
9,Sport,Abschiedstournee für Guardiola beginnt beim HS...
...,...,...
9221,Wirtschaft,Austria Glas Recycling appelliert an Bevölkeru...
9228,Wirtschaft,"Kein Kommentar, ob Raffinerie in Schwechat ode..."
9235,Wirtschaft,Günter Geyer zieht nach wie vor die Fäden – El...
9238,Wirtschaft,Der heimische Baukonzern zieht einen Großauftr...


In [5]:
import string

# Whitelist of characters an article may contain: ASCII letters, German umlauts
# and sharp s, the en dash, ASCII punctuation, digits and whitespace.
valid_chars = (
    string.ascii_letters
    + 'ÄÖÜäöüß–'
    + string.punctuation
    + string.digits
    + string.whitespace
)

def check_chars(row):
  """Return True if `row` contains at least one character outside `valid_chars`.

  Despite the name, a True result means the text is *rejected*; False means
  every character is whitelisted (an empty string yields False).
  """
  return any(char not in valid_chars for char in row)

In [6]:
# Keep only articles for which check_chars reports no non-whitelisted character
# (check_chars returns True for texts that must be dropped).
df_train = df_train.loc[df_train['text'].map(check_chars).eq(False)]
df_test = df_test.loc[df_test['text'].map(check_chars).eq(False)]

In [7]:
# Inspect the training frame again after dropping articles with non-whitelisted characters.
df_train

Unnamed: 0,labels,text
0,Sport,21-Jähriger fällt wohl bis Saisonende aus. Wie...
3,Wirtschaft,"Putin: ""Einigung, dass wir Menge auf Niveau vo..."
5,Wirtschaft,Der Welser Stempelhersteller verbreitert sich ...
6,Sport,Traditionsklub setzt sich gegen den FC Utrecht...
9,Sport,Abschiedstournee für Guardiola beginnt beim HS...
...,...,...
9221,Wirtschaft,Austria Glas Recycling appelliert an Bevölkeru...
9228,Wirtschaft,"Kein Kommentar, ob Raffinerie in Schwechat ode..."
9235,Wirtschaft,Günter Geyer zieht nach wie vor die Fäden – El...
9238,Wirtschaft,Der heimische Baukonzern zieht einen Großauftr...


In [8]:
import spacy
# Ask spaCy to use the GPU if one is available and report the value it returned.
gpu = spacy.prefer_gpu()
print('GPU:', gpu)

GPU: True


In [9]:
# Load the small German pipeline by its package name. The 'de' shortcut link is
# a spaCy v2-only mechanism (removed in v3); the install log states that the
# same model can be loaded as 'de_core_news_sm', which works on both versions.
nlp = spacy.load('de_core_news_sm')

In [10]:
# Count how often each spaCy token text occurs across all training articles.
vocab = {}

for index, row in df_train.iterrows():
  doc = nlp(row['text'])
  for token in doc:
    # Start at 1 on the first occurrence. The previous version initialised new
    # entries with 0, so every stored frequency was one too low.
    vocab[token.text] = vocab.get(token.text, 0) + 1

In [11]:
# Re-order the token counts from most to least frequent; ties keep their
# original (first-seen) order because the sort is stable.
sorted_vocab = {
    word: freq
    for word, freq in sorted(vocab.items(), key=lambda pair: pair[1], reverse=True)
}

In [12]:
# Keep only the 5000 most frequent tokens. Note that `vocab` is rebound here
# from the token->count dict to a plain list of token strings, so this cell is
# not idempotent with respect to the counting cell above.
vocab = list(sorted_vocab)[:5000]

In [13]:
# Map every vocabulary token to its position in the frequency-ranked list;
# these positions are the embedding indices fed to the model.
word_to_ix = {word: index for index, word in enumerate(vocab)}

# Class label -> target index used by the loss function.
tag_to_ix = {"Sport": 0, "Wirtschaft": 1}

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Model hyper-parameters passed to LSTMTagger below.
EMBEDDING_DIM = 128  # size of each word embedding vector
HIDDEN_DIM = 256     # size of the LSTM hidden state

In [15]:
# Pick the first CUDA device when available, otherwise fall back to the CPU.
# NOTE(review): none of the visible cells move the model or any tensor to
# `device`, so training presumably still runs on the CPU -- confirm.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [16]:
def prepare_sequence(seq, to_ix):
  """Convert a sequence of tokens into a 1-D LongTensor of indices.

  Tokens that are not keys of `to_ix` are silently dropped, so the result may
  be shorter than `seq` (or empty).
  """
  known_indices = [to_ix[w] for w in seq if w in to_ix]
  return torch.tensor(known_indices, dtype=torch.long)

def target_tensor(target):
  """Build a length-2 one-hot LongTensor with a 1 at position `target[0]`."""
  one_hot = torch.zeros(2, dtype=torch.long)
  hot_position = target[0]
  one_hot[hot_position] = 1
  return one_hot

import time
import math

def timeSince(since):
    """Format the wall-clock time elapsed since `since` as '<minutes>m <seconds>s'.

    `since` is a timestamp as returned by time.time(); fractional seconds are
    truncated by the '%d' formatting.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [17]:
class LSTMTagger(nn.Module):
    """Embedding -> single-layer LSTM -> linear layer -> log-softmax.

    `forward` yields one row of log-probabilities over the tag set for every
    input position.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into dense vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes the embedded sequence and emits a hidden state of size
        # hidden_dim per position.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects each hidden state onto the tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return log-probabilities of shape (len(sentence), tagset_size)."""
        seq_len = len(sentence)
        # Reshape to (seq_len, batch=1, embedding_dim), the layout nn.LSTM expects.
        embedded = self.word_embeddings(sentence).view(seq_len, 1, -1)
        hidden_states, _ = self.lstm(embedded)
        logits = self.hidden2tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(logits, dim=1)

In [18]:
# Train the LSTM as a binary document classifier: each article is fed through
# the network one sample at a time and the prediction at the LAST position is
# compared against the article's label.
NUM_EPOCHS = 5   # full passes over the training set
LOG_EVERY = 500  # print the running loss every this many trained samples

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

start = time.time()

for epoch in range(NUM_EPOCHS):
    count = 0  # samples trained on in this epoch (skipped rows are not counted)
    for index, row in df_train.iterrows():
        # NOTE(review): the vocabulary was built from spaCy tokens, while this
        # splits on whitespace, so words with attached punctuation are not
        # found in word_to_ix and get dropped by prepare_sequence. Changing the
        # tokenisation here would also require changing it at evaluation time.
        sentence = row['text'].split()
        tags = row['labels'].split()

        # Turn words and the label into tensors of indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Articles without a single in-vocabulary word cannot be fed to the LSTM.
        if len(sentence_in) == 0:
            print("skipped")
            continue

        # PyTorch accumulates gradients, so clear them before each sample.
        model.zero_grad()

        tag_scores = model(sentence_in)

        # Only the scores at the final position are used for classification;
        # add a batch dimension so the shape is (1, n_classes) for NLLLoss.
        output = tag_scores[-1].unsqueeze(0)

        loss = loss_function(output, targets)
        loss.backward()
        optimizer.step()

        if count % LOG_EVERY == 0:
            print('%s (%d %d%%) %.4f' % (timeSince(start), count, count / len(df_train) * 100, loss.item()))
        count += 1

    print('%s Epoch %d / %d' % (timeSince(start), epoch + 1, NUM_EPOCHS))

0m 0s (0 0%) 0.6847
skipped
0m 4s (500 23%) 0.9015
0m 9s (1000 46%) 0.3120
0m 15s (1500 69%) 0.2233
0m 20s (2000 92%) 0.4414
0m 22s Epoch 1 / 5
0m 25s (2500 115%) 0.0686
skipped
0m 30s (3000 138%) 0.0180
0m 35s (3500 161%) 0.4277
0m 40s (4000 184%) 0.7564
0m 43s Epoch 2 / 5
0m 45s (4500 208%) 0.0395
skipped
0m 50s (5000 231%) 0.1754
0m 55s (5500 254%) 1.2956
1m 0s (6000 277%) 0.4596
1m 5s Epoch 3 / 5
1m 5s (6500 300%) 0.1060
skipped
1m 10s (7000 323%) 0.0001
1m 15s (7500 346%) 0.0000
1m 20s (8000 369%) 0.0000
1m 26s (8500 392%) 0.2929
1m 27s Epoch 4 / 5
1m 31s (9000 416%) 0.0073
skipped
1m 36s (9500 439%) 0.2149
1m 41s (10000 462%) 0.0028
1m 46s (10500 485%) 0.0001
1m 49s Epoch 5 / 5


In [22]:
# Class names ordered by target index; must stay in sync with tag_to_ix.
cats = ['Sport', 'Wirtschaft']

def test_sentence(sentence):
  """Predict the category name for a tokenised article.

  Args:
    sentence: sequence of word strings.

  Returns:
    'Sport' or 'Wirtschaft', or None when no word of `sentence` is in the
    vocabulary (the model cannot process an empty sequence; previously this
    case raised inside the model's reshape).
  """
  sentence_in = prepare_sequence(sentence, word_to_ix)
  if len(sentence_in) == 0:
    return None

  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    output = model(sentence_in)

  # The scores at the last position represent the whole article.
  _, topi = output[-1].topk(1)

  # Convert the 1-element tensor to a plain int before indexing the list.
  return cats[topi.item()]

In [23]:
def test_model():
  """Print the accuracy (in percent) of the model on `df_test`.

  Each test article is whitespace-tokenised, classified with test_sentence and
  compared against its label.
  """
  hits = sum(
      1
      for text, label in zip(df_test['text'], df_test['labels'])
      if test_sentence(text.split()) == label
  )

  print('Acc: {:.2f}'.format(hits / len(df_test) * 100))

In [24]:
# Evaluate the trained model on the held-out test split.
test_model()

Acc: 85.37
