In [2]:
import pandas as pd
from google.colab import drive
import string
import nltk
from pymystem3 import Mystem
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df = pd.read_json("/content/drive/MyDrive/hack/train.json")

In [5]:
df.drop(["id"], axis=1, inplace=True)

In [6]:
df["sentiment"] = df["sentiment"].replace("positive", 1).replace("neutral", 0).replace("negative", -1)

In [7]:
df['text'] = df['text'].str.lower()

In [8]:
df['text'] = df['text'].apply(lambda text: ''.join(c for c in text if c not in set(string.punctuation + string.digits + '—–\n«»')))

In [9]:
df['text'] = df['text'].apply(word_tokenize)

In [10]:
with open("/content/drive/MyDrive/hack/stopwords-ru.txt", "r", encoding="UTF-8") as f:
  stopwords = set(map(lambda x: x.strip(), f.readlines()))
  df['text'] = df['text'].apply(lambda text: [w for w in text if w not in stopwords])

In [11]:
m = Mystem()
df['text'] = df['text'].apply(lambda text: [m.lemmatize(w)[0] for w in text])

In [12]:
with open("/content/drive/MyDrive/hack/stopwords-ru.txt", "r", encoding="UTF-8") as f:
  stopwords = set(map(lambda x: x.strip(), f.readlines()))
  df['text'] = df['text'].apply(lambda text: [w for w in text if w not in stopwords])

In [13]:
from gensim.models import Word2Vec
w2v = Word2Vec(df['text'], min_count = 0)
df['text'] = df['text'].apply(lambda text: [w2v.wv[word] for word in text])

In [14]:
df['text'].apply(lambda x: len(x)).describe()

count     8263.000000
mean       311.655452
std        734.930527
min          2.000000
25%        104.000000
50%        169.000000
75%        294.000000
max      30463.000000
Name: text, dtype: float64

In [15]:
import numpy as np

In [None]:
df['text'] = df['text'].apply(lambda text: np.array(text + [np.zeros(100)] * (30463 - len(text))))

In [40]:
import torch
import torch.nn as nn

In [68]:
class Model(nn.Module):

	def __init__(self):
		super().__init__()

		self.embed_dim = 100
		self.hidden_dim = 100
		self.lstm_layers = 2

		self.lstm = nn.LSTM(input_size=self.embed_dim, hidden_size=self.hidden_dim, num_layers=self.lstm_layers, batch_first=True)
		self.dropout = nn.Dropout(0.5)
		self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=256)
		self.fc2 = nn.Linear(256, 1)

	def forward(self, X):
		h = torch.zeros((self.lstm_layers, X.size(0), self.hidden_dim))
		c = torch.zeros((self.lstm_layers, X.size(0), self.hidden_dim))
		torch.nn.init.xavier_normal_(h)
		torch.nn.init.xavier_normal_(c)

		out, (h, c) = self.lstm(X, (h, c))
		out = self.dropout(out)
		out = torch.relu_(self.fc1(out[:, -1, :]))
		out = self.dropout(out)
		out = torch.sigmoid(self.fc2(out))

		return out


model = Model()

In [82]:
from torch.utils.data import DataLoader

class Dataset(torch.utils.data.Dataset):
	def __init__(self, X, y):
		self.X = X
		self.y = y

	def __len__(self):
		return len(self.X)

	def __getitem__(self, idx):
		return self.X[idx], self.y[idx]


x_train, x_test, y_train, y_test = train_test_split(df['text'].values,
  df["sentiment"].values, test_size=0.1)
train = DataLoader(Dataset(np.array(x_train), np.array(y_train)), batch_size=64)
test = DataLoader(Dataset(np.array(x_test), np.array(y_test)))

In [98]:
df['text'][0]

[array([-5.2354723e-01, -3.7230232e-01, -5.9604257e-01,  1.1629984e+00,
         5.2398145e-01, -2.2231443e+00, -2.2760913e+00,  1.5514232e+00,
        -3.7612951e-01,  9.2514912e-03, -5.3213930e-01, -1.1801227e+00,
         2.1826018e-01,  9.9852943e-01, -1.3229167e+00, -1.1972374e+00,
        -8.1321460e-01, -1.2760786e+00,  6.6912562e-01,  1.2042792e+00,
         4.2441997e-01,  3.0203071e-01, -6.7938226e-01,  4.1930637e-01,
         5.9162432e-01, -8.9536136e-01, -5.9093165e-01, -6.7028946e-01,
         1.3009211e+00,  1.2642211e+00, -7.3546183e-01,  1.1022106e+00,
         8.2153171e-01,  8.3095498e-02, -2.5836639e+00,  1.0505662e+00,
         4.6503940e-01, -1.2221822e+00, -5.4806663e-04, -1.0318537e+00,
        -5.2838498e-01, -6.4367712e-02, -1.3188585e+00,  3.4580842e-01,
         3.2824355e-01, -2.0717962e+00, -2.8640609e+00, -2.7092466e-02,
         1.3096457e+00, -1.0780940e+00,  1.6868935e+00, -2.6922381e+00,
         1.1048234e+00, -2.3203340e-01, -1.4195273e+00,  7.95062

In [65]:
import torch.optim as optim
import torch.nn.functional as F

In [64]:
optimizer = optim.RMSprop(model.parameters(), lr=0.01)

In [66]:
def evaluation(self):

		predictions = []
		model.eval()
		with torch.no_grad():
			for x_batch, y_batch in test:
				x = x_batch.type(torch.LongTensor)
				y = y_batch.type(torch.FloatTensor)

				y_pred = model(x)
				predictions += list(y_pred.detach().numpy())

		return predictions

In [75]:
i = 0
for x_batch, y_batch in train:
  if i != 0:
    break
  print(x_batch)
  i += 1

Output hidden; open in https://colab.research.google.com to view.

In [76]:
i = 0
for x_batch, y_batch in train:
  if i != 0:
    break
  print(y_batch)
  i += 1

tensor([ 0,  1,  0,  0,  0,  0,  0,  1, -1,  1,  0, -1,  1,  1,  1,  1,  1,  1,
         0,  1,  0,  0, -1,  1,  0, -1,  0,  1,  0,  1,  0,  0, -1,  0, -1,  0,
         0,  0,  0,  1,  1,  0,  0,  0, -1,  1, -1,  1,  1,  1, -1,  0,  0,  0,
        -1, -1,  0,  0,  0,  1,  1,  0,  0, -1])


In [77]:
def calculate_accuracy(grand_truth, predictions):
		true_positives = 0
		true_negatives = 0

		for true, pred in zip(grand_truth, predictions):
			if (pred > 0.5) and (true == 1):
				true_positives += 1
			elif (pred < 0.5) and (true == 0):
				true_negatives += 1
			else:
				pass

		return (true_positives + true_negatives) / len(grand_truth)

In [97]:
for epoch in range(200):
  predictions = []
  model.train()

  for x_batch, y_batch in train:
    print(x_batch[0])
    print(x_batch[0].shape, len(x_batch))
    y_pred = model(x_batch)

    loss = F.binary_cross_entropy(y_pred, y_batch)

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

    predictions += list(y_pred.squeeze().detach().numpy())

  test_predictions = evaluation()

  train_accuary = calculate_accuracy(y_train, predictions)
  test_accuracy = calculate_accuracy(y_test, test_predictions)

  print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuary, test_accuracy))

Exception ignored in: <function _xla_gc_callback at 0x7e880a0a43a0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 98, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


tensor([[-0.2678, -0.1192, -0.7159,  ..., -0.2040,  0.5688,  0.0342],
        [-0.9871, -0.2584,  1.9421,  ...,  1.0762, -2.3688, -0.2001],
        [ 0.2413,  2.5103,  1.6985,  ..., -0.0754,  1.0041, -0.9784],
        ...,
        [-0.1341,  0.5208,  0.4865,  ..., -0.2526,  0.1072, -0.1565],
        [ 0.7627,  0.6724, -0.1796,  ..., -1.5888, -0.0600, -2.5622],
        [-2.4659, -1.6407, -0.8265,  ...,  2.0703,  0.1934,  0.8500]])
torch.Size([64, 100]) 64


AttributeError: 'list' object has no attribute 'size'