<a href="https://colab.research.google.com/github/CoolCodeMan/intro2LT/blob/lauri/pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Links**

* https://github.com/CoolCodeMan/intro2LT/blob/lauri/tfidf-bow-ja-linearSVC.ipynb
* https://github.com/graykode/nlp-tutorial
* https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/1-1.NNLM/NNLM.ipynb
* https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#sphx-glr-beginner-nlp-word-embeddings-tutorial-py
* https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
* https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding
* https://stackoverflow.com/questions/50747947/embedding-in-pytorch

In [1]:
!wget -q -O anger-train.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/train.tsv
!wget -q -O anger-test.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/test.tsv
!wget -q -O anger-dev.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/dev.tsv

!wget -q -O joy-train.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/train.tsv
!wget -q -O joy-test.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/test.tsv
!wget -q -O joy-dev.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/dev.tsv

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as functional
import torch.optim as optim

In [5]:
class Net(nn.Module):
  def __init__(self, embed_size, vector_size, context_size):
    super(Net, self).__init__()

    self.layer1 = nn.Embedding(embed_size, vector_size)
    self.layer2 = nn.Linear(vector_size * context_size, 128)
    self.layer3 = nn.Linear(128, 3)

  def forward(self, X):
    out = self.layer1(X)
    out = self.layer2(out)
    # sigmoid or relu
    out = functional.sigmoid(out)
    out = self.layer3(out)
    # sigmoid or relu
    out = functional.sigmoid(out)
    return out

In [6]:
import pandas as pd

In [8]:
joy = pd.read_csv('/content/joy-train.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')
anger = pd.read_csv('/content/anger-train.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')

In [20]:
dataframe = pd.concat([joy, anger])
dataframe['emotion'].replace({'not-joy':'neutral', 'not-anger':'neutral'}, inplace=True)
# dataframe['emotion'].replace({'not-joy':0, 'not-anger':0, 'joy':1, 'anger':2}, inplace=True)

In [22]:
import sklearn.utils

def shuffle(data):
  data = sklearn.utils.shuffle(data)
  data.reset_index(inplace=True, drop=True)
  return data

In [23]:
dataframe = shuffle(dataframe)
dataframe.tail()

Unnamed: 0,emotion,text
2020,neutral,"Älä seuraa kun minä karkaan tästä maailmasta ,..."
2021,neutral,En halua kuluttaa liikaa aikaa.
2022,neutral,Olinhan tehnyt tytön olon aika epämukavaksi.
2023,anger,Miksi sinä saisit ne?
2024,joy,"Kultaseni, olet niin hyvä minulle."


In [40]:
words = []
for x in dataframe['text']:
  l = x.split(' ')
  for y in l:
    words.append(y)

len(words)

19345

In [41]:
from collections import Counter

In [43]:
vocab = Counter(words)
vocab = sorted(vocab, key=vocab.get, reverse=True)
vocab_size = len(vocab)
vocab_size

9597

In [55]:
word2vec = {word: ind for ind, word in enumerate(vocab)}
list(word2vec)[:20]

['on',
 ',',
 'ja',
 '.',
 'että',
 'ei',
 'se',
 'oli',
 'ole',
 'mutta',
 'kun',
 'niin',
 'kuin',
 'En',
 '-',
 'sen',
 'jos',
 'ovat',
 'olla',
 'Se']

In [56]:
encoded_sentences_example = [word2vec[word] for word in words]
print(encoded_sentences_example)

[36, 1980, 1981, 1982, 854, 70, 135, 553, 1983, 2, 1984, 155, 1985, 1986, 1987, 17, 1988, 81, 855, 65, 1989, 272, 1990, 1991, 1992, 1993, 1994, 1995, 221, 1996, 6, 1997, 416, 221, 1998, 1999, 2000, 0, 71, 2001, 2002, 2003, 18, 2004, 2005, 417, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 554, 273, 2015, 2016, 7, 2017, 2018, 555, 0, 122, 2019, 2020, 31, 0, 418, 1, 9, 2021, 25, 47, 2022, 3, 2023, 419, 0, 2024, 2025, 1, 2026, 5, 420, 2027, 1, 2028, 0, 856, 1, 96, 2029, 856, 3, 421, 556, 4, 274, 0, 557, 2030, 2, 2031, 7, 2032, 558, 559, 2033, 0, 11, 857, 175, 222, 193, 560, 2034, 10, 858, 72, 223, 859, 561, 14, 2, 2035, 2036, 2037, 45, 2038, 2039, 22, 7, 44, 562, 2040, 2041, 2042, 3, 14, 860, 422, 18, 861, 79, 224, 563, 2043, 97, 2044, 862, 4, 863, 275, 2045, 564, 89, 123, 565, 864, 2, 423, 2046, 865, 2047, 225, 3, 866, 2048, 194, 62, 2049, 1, 4, 51, 2050, 2051, 1, 2052, 566, 2053, 2054, 2055, 2, 2056, 2057, 567, 48, 1, 0, 867, 2058, 1, 2059, 2, 2060, 868, 2061, 2062, 49, 111, 424

In [61]:
# [joy, anger, neutral]

labels = dataframe['emotion']
y = []
for i in labels:
  if i == 'joy':
    y.append([1,0,0])
  elif i == 'anger':
    y.append([0,1,0])
  elif i == 'neutral':
    y.append([0,0,1])

In [62]:
vec_size = 3
context_s = 2
model = Net(vocab_size, vec_size, context_s)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [71]:
"""---------------------------testing-------------------------------------"""
sentences = dataframe['text']
# sentence = sentences[0].split(' ')
encoded_sentence = [word2vec[word] for word in sentences[0].split(' ')]
print(encoded_sentence)
"""-----------------------------------------------------------------------"""

[36, 1980, 1981, 1982, 854, 70, 135, 553, 1983, 2, 1984, 155, 1985]


In [73]:
sentences = dataframe['text']
for epoch in range(2000):
  encoded_sentence = [word2vec[word] for word in sentences[epoch].split(' ')]