<a href="https://colab.research.google.com/github/CoolCodeMan/intro2LT/blob/lauri/pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [195]:
import warnings
warnings.filterwarnings('ignore')

# **Links**

* https://github.com/CoolCodeMan/intro2LT/blob/lauri/tfidf-bow-ja-linearSVC.ipynb
* https://github.com/graykode/nlp-tutorial
* https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/1-1.NNLM/NNLM.ipynb
* https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#sphx-glr-beginner-nlp-word-embeddings-tutorial-py
* https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
* https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding
* https://stackoverflow.com/questions/50747947/embedding-in-pytorch

In [155]:
!wget -q -O anger-train.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/train.tsv
!wget -q -O anger-test.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/test.tsv
!wget -q -O anger-dev.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/dev.tsv

!wget -q -O joy-train.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/train.tsv
!wget -q -O joy-test.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/test.tsv
!wget -q -O joy-dev.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/dev.tsv

In [156]:
import torch
import torch.nn as nn
import torch.nn.functional as functional
import torch.optim as optim

In [231]:
class Net(nn.Module):
  def __init__(self, embed_size, vector_size, context_size):
    super(Net, self).__init__()

    self.layer1 = nn.EmbeddingBag(embed_size, vector_size, sparse=True)
    # self.layer2 = nn.Linear(vector_size * context_size, 128)
    # self.layer2 = nn.Linear(vector_size, 128)
    self.layer2 = nn.Linear(vector_size, context_size)
    # self.layer4 = nn.Linear(3, 3)

  def forward(self, X, offset):
    out = self.layer1(X, offset)
    out = self.layer2(out)
    # out = torch.sigmoid(out)
    return out

In [158]:
import pandas as pd

In [159]:
joy = pd.read_csv('/content/joy-train.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')
anger = pd.read_csv('/content/anger-train.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')

In [160]:
dataframe = pd.concat([joy, anger])
dataframe['emotion'].replace({'not-joy':'neutral', 'not-anger':'neutral'}, inplace=True)
# dataframe['emotion'].replace({'not-joy':0, 'not-anger':0, 'joy':1, 'anger':2}, inplace=True)

In [161]:
import sklearn.utils

def shuffle(data):
  data = sklearn.utils.shuffle(data)
  data.reset_index(inplace=True, drop=True)
  return data

In [162]:
dataframe = shuffle(dataframe)
dataframe.tail()

Unnamed: 0,emotion,text
2020,neutral,"En koskaan tiedä etukäteen, milloin olen nero."
2021,joy,Täällä on hevonenkin valmiina.
2022,neutral,"Se kertoisi sen, mitä tiedämme, kertomalla sen..."
2023,neutral,"Niinpä. Valamiehistö muistaa kuvat, joissa Car..."
2024,neutral,Usko Jumalaan on ihmisen normaali ja terve elä...


In [163]:
words = []
for x in dataframe['text']:
  l = x.split(' ')
  for y in l:
    words.append(y)

len(words)

19345

In [164]:
from collections import Counter

In [165]:
vocab = Counter(words)
vocab = sorted(vocab, key=vocab.get, reverse=True)
vocab_size = len(vocab)
vocab_size

9597

In [166]:
word2vec = {word: ind for ind, word in enumerate(vocab)}
list(word2vec)[:20]

['on',
 ',',
 'ja',
 '.',
 'että',
 'ei',
 'se',
 'oli',
 'ole',
 'mutta',
 'kun',
 'niin',
 'kuin',
 'En',
 '-',
 'sen',
 'jos',
 'olla',
 'ovat',
 'joka']

In [167]:
encoded_sentences_example = [word2vec[word] for word in words]
print(list(encoded_sentences_example)[:20])

[29, 416, 1980, 193, 1981, 854, 855, 13, 32, 1982, 272, 36, 553, 1983, 1984, 1985, 122, 37, 856, 1986]


In [269]:
# [joy, anger, neutral]

labels = dataframe['emotion']
y = []
for i in labels:
  if i == 'joy':
    y.append([1,0,0])
  elif i == 'anger':
    y.append([0,1,0])
  elif i == 'neutral':
    y.append([0,0,1])
y = torch.tensor(y).float()

In [312]:
vec_size = 200
context_s = 3
model = Net(vocab_size, vec_size, context_s)

loss_function = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=0.005)

In [314]:
sentences = dataframe['text']
for i in range(10):
  running_loss = 0.0
  for epoch in range(2024):
    encoded_sentence = torch.tensor([word2vec[word] for word in sentences[epoch].split(' ')])
    model.zero_grad()

    out = model(encoded_sentence, torch.tensor([0]))
    loss = loss_function(out, y[epoch])

    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if epoch % 100 == 0:
      # print(epoch, loss.item())
      print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
      # running_loss = 0.0

[1,     1] loss: 0.000
[101,     1] loss: 0.006
[201,     1] loss: 0.013
[301,     1] loss: 0.019
[401,     1] loss: 0.026
[501,     1] loss: 0.033
[601,     1] loss: 0.042
[701,     1] loss: 0.050
[801,     1] loss: 0.058
[901,     1] loss: 0.065
[1001,     1] loss: 0.072
[1101,     1] loss: 0.080
[1201,     1] loss: 0.087
[1301,     1] loss: 0.094
[1401,     1] loss: 0.101
[1501,     1] loss: 0.107
[1601,     1] loss: 0.114
[1701,     1] loss: 0.121
[1801,     1] loss: 0.128
[1901,     1] loss: 0.136
[2001,     1] loss: 0.142
[1,     2] loss: 0.000
[101,     2] loss: 0.006
[201,     2] loss: 0.013
[301,     2] loss: 0.019
[401,     2] loss: 0.026
[501,     2] loss: 0.033
[601,     2] loss: 0.042
[701,     2] loss: 0.050
[801,     2] loss: 0.058
[901,     2] loss: 0.065
[1001,     2] loss: 0.072
[1101,     2] loss: 0.080
[1201,     2] loss: 0.087
[1301,     2] loss: 0.094
[1401,     2] loss: 0.101
[1501,     2] loss: 0.107
[1601,     2] loss: 0.114
[1701,     2] loss: 0.120
[1801,    

In [315]:
import numpy as np

In [316]:
size = 2024
total = 0
for i in range(size):
  encoded_sentence = torch.tensor([word2vec[word] for word in sentences[i].split(' ')])
  pred = model(encoded_sentence, torch.tensor([0]))
  n1 = pred.detach().numpy().argmax()
  n2 = y[i].detach().numpy().argmax()
  if n2 == n1:
    total += 1

print("TRAIN {:.2f}".format(total/size*100), '%')

TRAIN 73.47 %
