<a href="https://colab.research.google.com/github/CoolCodeMan/intro2LT/blob/lauri/pytorch-embed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import warnings
warnings.filterwarnings('ignore')

# **Links**

* https://github.com/CoolCodeMan/intro2LT/blob/lauri/tfidf-bow-ja-linearSVC.ipynb
* https://github.com/graykode/nlp-tutorial
* https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/1-1.NNLM/NNLM.ipynb
* https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#sphx-glr-beginner-nlp-word-embeddings-tutorial-py
* https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
* https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding
* https://stackoverflow.com/questions/50747947/embedding-in-pytorch

In [4]:
!wget -q -O anger-train.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/train.tsv
!wget -q -O anger-test.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/test.tsv
!wget -q -O anger-dev.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/dev.tsv

!wget -q -O joy-train.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/train.tsv
!wget -q -O joy-test.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/test.tsv
!wget -q -O joy-dev.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/dev.tsv

In [5]:
import pandas as pd

In [6]:
joy = pd.read_csv('/content/joy-train.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')
anger = pd.read_csv('/content/anger-train.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')

In [7]:
dataframe = pd.concat([joy, anger])
dataframe['emotion'].replace({'not-joy':'neutral', 'not-anger':'neutral'}, inplace=True)
# dataframe['emotion'].replace({'not-joy':0, 'not-anger':0, 'joy':1, 'anger':2}, inplace=True)

In [8]:
import sklearn.utils

def shuffle(data):
  data = sklearn.utils.shuffle(data)
  data.reset_index(inplace=True, drop=True)
  return data

In [9]:
dataframe = shuffle(dataframe)
dataframe.tail()

Unnamed: 0,emotion,text
2020,anger,Mitä vittua te idiootti viuhdotte .
2021,neutral,"Jumiudumme tilanteisiin , joissa tiedostamme o..."
2022,neutral,"Hyvää huomenta , päivää ja iltaa ."
2023,neutral,Täällä on niin hiljaista lähtösi jälkeen .
2024,neutral,"Minusta olisi kauheaa, jos toinen olisi hoikka..."


In [10]:
words = []
for x in dataframe['text']:
  l = x.split(' ')
  for y in l:
    words.append(y)

len(words)

19345

In [11]:
from collections import Counter

In [12]:
vocab = Counter(words)
vocab = sorted(vocab, key=vocab.get, reverse=True)
vocab_size = len(vocab)
vocab_size

9597

In [13]:
# word2vec = {word: ind for ind, word in enumerate(vocab)}
word2vec = {word: ind + 1 for ind, word in enumerate(vocab)}
list(word2vec)[:20]

['on',
 ',',
 'ja',
 '.',
 'että',
 'ei',
 'se',
 'oli',
 'ole',
 'mutta',
 'kun',
 'niin',
 'kuin',
 'En',
 '-',
 'sen',
 'jos',
 'olla',
 'ovat',
 'Se']

In [14]:
encoded_sentences_example = [word2vec[word] for word in words]
print(list(encoded_sentences_example)[:20])

[855, 554, 856, 857, 3, 555, 82, 556, 858, 859, 860, 1, 861, 136, 862, 222, 52, 1981, 1982, 863]


In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as functional
import torch.optim as optim

In [17]:
# [joy, anger, neutral]

labels = dataframe['emotion']
y = []
for i in labels:
  if i == 'joy':
    y.append([1,0,0])
  elif i == 'anger':
    y.append([0,1,0])
  elif i == 'neutral':
    y.append([0,0,1])
y = torch.tensor(y).float()

In [18]:
import numpy as np

In [19]:
def score(model, dataset, labels, dataset_size=2024):
  total = 0
  for i in range(dataset_size):
    encoded_sentence = torch.tensor([word2vec[word] for word in dataset[i].split(' ')])
    pred = model(encoded_sentence)
    n1 = pred.detach().numpy().argmax()
    n2 = y[i].detach().numpy().argmax()
    if n2 == n1:
      total += 1
    
  print("TRAIN {:.2f}".format(total/dataset_size*100), '%')

In [32]:
class Net(nn.Module):
  def __init__(self, embed_size, vector_size, context_size):
    super(Net, self).__init__()

    # self.embed = nn.EmbeddingBag(embed_size, vector_size, sparse=True)
    self.embed = nn.Embedding(embed_size, vector_size, padding_idx=0, sparse=True)
    self.embed.weight.requires_grad = True

    self.conv1 = nn.Conv1d(128, 50, kernel_size=1)
    self.conv2 = nn.Conv1d(50, 1, kernel_size=1)
    
    self.fc1 = nn.Linear(vector_size, 50)
    self.fc2 = nn.Linear(50, context_size)

  def forward(self, X):

    p = int((128-X.shape[0])/2)
    if X.shape[0] % 2 == 0:
      out = functional.pad(X, pad=(p,p))
    else:
      out = functional.pad(X, pad=(p,p+1))

    out = self.embed(out)

    out = out.clone().detach()
    out = out.float()
    out.requires_grad = True

    out = out.view(out.shape[1],out.shape[0],1)
    out = self.conv1(out)

    out = self.conv2(out)

    out = out.view(out.shape[0],out.shape[1])
    out = torch.transpose(out,0,1)
    
    out = self.fc1(out)

    out = torch.sigmoid(out)
    out = self.fc2(out)

    out = out.view(out.shape[1])

    return out

In [33]:
vec_size = 50
context_s = 3
model = Net(vocab_size, vec_size, context_s)

loss_function = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9)

In [34]:
sentences = dataframe['text']
for i in range(2):
  running_loss = 0.0
  for epoch in range(2024):
    encoded_sentence = torch.tensor([word2vec[word] for word in sentences[epoch].split(' ')])
    model.zero_grad()

    # out = model(encoded_sentence, torch.tensor([0]))
    out = model(encoded_sentence)
    loss = loss_function(out, y[epoch])

    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if epoch % 100 == 0:
      print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2024))
      # score(model, sentences[:50], y, dataset_size=50)
      # running_loss = 0.0

[1,     1] loss: 0.000
[101,     1] loss: 0.013
[201,     1] loss: 0.022
[301,     1] loss: 0.032
[401,     1] loss: 0.041
[501,     1] loss: 0.049
[601,     1] loss: 0.058
[701,     1] loss: 0.067
[801,     1] loss: 0.076
[901,     1] loss: 0.084
[1001,     1] loss: 0.093
[1101,     1] loss: 0.102
[1201,     1] loss: 0.110
[1301,     1] loss: 0.120
[1401,     1] loss: 0.129
[1501,     1] loss: 0.138
[1601,     1] loss: 0.146
[1701,     1] loss: 0.154
[1801,     1] loss: 0.163
[1901,     1] loss: 0.173
[2001,     1] loss: 0.180
[1,     2] loss: 0.000
[101,     2] loss: 0.010
[201,     2] loss: 0.019
[301,     2] loss: 0.029
[401,     2] loss: 0.038
[501,     2] loss: 0.046
[601,     2] loss: 0.054
[701,     2] loss: 0.063
[801,     2] loss: 0.072
[901,     2] loss: 0.080
[1001,     2] loss: 0.090
[1101,     2] loss: 0.098
[1201,     2] loss: 0.106
[1301,     2] loss: 0.116
[1401,     2] loss: 0.125
[1501,     2] loss: 0.134
[1601,     2] loss: 0.142
[1701,     2] loss: 0.150
[1801,    

In [35]:
score(model, sentences, y)

TRAIN 68.97 %
