<a href="https://colab.research.google.com/github/CoolCodeMan/intro2LT/blob/lauri/pytorch-bow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import warnings
warnings.filterwarnings('ignore')

# **Links**

* https://github.com/CoolCodeMan/intro2LT/blob/lauri/tfidf-bow-ja-linearSVC.ipynb
* https://github.com/graykode/nlp-tutorial
* https://colab.research.google.com/github/graykode/nlp-tutorial/blob/master/1-1.NNLM/NNLM.ipynb
* https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#sphx-glr-beginner-nlp-word-embeddings-tutorial-py
* https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
* https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding
* https://stackoverflow.com/questions/50747947/embedding-in-pytorch

In [1]:
!wget -q -O anger-train.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/train.tsv
!wget -q -O anger-test.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/test.tsv
!wget -q -O anger-dev.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/anger-annotation/dev.tsv

!wget -q -O joy-train.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/train.tsv
!wget -q -O joy-test.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/test.tsv
!wget -q -O joy-dev.tsv http://dl.turkunlp.org/TKO_8966_2021-projects/joy-annotation/dev.tsv

In [3]:
import pandas as pd

In [4]:
joy = pd.read_csv('/content/joy-train.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')
anger = pd.read_csv('/content/anger-train.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')

In [5]:
dataframe = pd.concat([joy, anger])
dataframe['emotion'].replace({'not-joy':'neutral', 'not-anger':'neutral'}, inplace=True)
# dataframe['emotion'].replace({'not-joy':0, 'not-anger':0, 'joy':1, 'anger':2}, inplace=True)

In [6]:
import sklearn.utils

def shuffle(data):
  data = sklearn.utils.shuffle(data)
  data.reset_index(inplace=True, drop=True)
  return data

In [7]:
dataframe = shuffle(dataframe)
dataframe.tail()

Unnamed: 0,emotion,text
2020,joy,"Onpa mukava , että sait taas viran ."
2021,anger,Ovi auki - kusipää!
2022,joy,"Ihanaa , arki koitti ! :)"
2023,anger,"Sanakin vielä, ja olet mennyttä!"
2024,neutral,"Se on liian kallis, Leslie!"


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
import numpy as np

In [10]:
vectorizer=CountVectorizer(analyzer='word', ngram_range=(1,2))

vectorizer.fit(dataframe['text'])
feature_matrix = vectorizer.transform(dataframe['text'])

print("shape=",feature_matrix.shape)

shape= (2025, 22785)


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as functional
import torch.optim as optim

In [12]:
from scipy.sparse import *

In [13]:
mat = feature_matrix.astype('float').toarray()
dataset = torch.from_numpy(mat)
# dataset.requires_grad = True
dataset

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

In [14]:
def make_labels(labels):
  y = []
  for i in labels:
    if i == 'joy':
      y.append([1,0,0])
    elif i == 'anger':
      y.append([0,1,0])
    elif i == 'neutral':
      y.append([0,0,1])
  y = torch.tensor(y).float()
  return y

In [15]:
y = make_labels(dataframe['emotion'])

In [16]:
def score(model, dataset, labels, dataset_size=2024):
  total = 0
  for i in range(dataset_size):
    pred = model(dataset[i].float())
    n1 = pred.detach().numpy().argmax()
    n2 = y[i].detach().numpy().argmax()
    if n2 == n1:
      total += 1
    
  print("TRAIN {:.2f}".format(total/dataset_size*100), '%')

In [17]:
class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()

    self.fc1 = nn.Linear(22785, 10)
    self.fc2 = nn.Linear(10, 3)

  def forward(self, X):

    out = self.fc1(X)
    out = torch.sigmoid(out)

    out = self.fc2(out)
    # out = torch.sigmoid(out)

    return out

In [18]:
model = Net()
loss_function = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.2)

In [19]:
for i in range(4):
  for epoch in range(2025):
    model.zero_grad()

    out = model(dataset[epoch].float())
    loss = loss_function(out, y[epoch])

    loss.backward()
    optimizer.step()

    if epoch % 100 == 99:
      print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, loss.item()))

[100,     1] loss: 0.448
[200,     1] loss: 0.042
[300,     1] loss: 0.548
[400,     1] loss: 0.041
[500,     1] loss: 0.019
[600,     1] loss: 0.249
[700,     1] loss: 0.239
[800,     1] loss: 0.058
[900,     1] loss: 0.103
[1000,     1] loss: 0.417
[1100,     1] loss: 0.020
[1200,     1] loss: 0.299
[1300,     1] loss: 0.164
[1400,     1] loss: 0.039
[1500,     1] loss: 0.054
[1600,     1] loss: 0.476
[1700,     1] loss: 0.066
[1800,     1] loss: 0.004
[1900,     1] loss: 0.040
[2000,     1] loss: 0.042
[100,     2] loss: 0.419
[200,     2] loss: 0.019
[300,     2] loss: 0.529
[400,     2] loss: 0.066
[500,     2] loss: 0.002
[600,     2] loss: 0.137
[700,     2] loss: 0.042
[800,     2] loss: 0.037
[900,     2] loss: 0.111
[1000,     2] loss: 0.355
[1100,     2] loss: 0.013
[1200,     2] loss: 0.279
[1300,     2] loss: 0.036
[1400,     2] loss: 0.048
[1500,     2] loss: 0.115
[1600,     2] loss: 0.421
[1700,     2] loss: 0.021
[1800,     2] loss: 0.019
[1900,     2] loss: 0.022
[200

In [20]:
score(model, dataset, y, dataset_size=2025)

TRAIN 98.17 %


In [21]:
j_test = pd.read_csv('/content/joy-test.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')
a_test= pd.read_csv('/content/anger-test.tsv', error_bad_lines=False, header=0, names=['emotion','text'], sep='\t')
df = pd.concat([j_test, a_test])
df['emotion'].replace({'not-joy':'neutral', 'not-anger':'neutral'}, inplace=True)
df = shuffle(df)
feature_mat_test = vectorizer.transform(df['text'])
mat_test = feature_mat_test.astype('float').toarray()
X_test = torch.from_numpy(mat_test)
y_test = make_labels(df['emotion'])

In [22]:
print("test")
score(model, X_test, y_test, dataset_size=len(df.index))

test
TRAIN 50.34 %
