In [None]:
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression

import torch
from torch.utils.data import Dataset, DataLoader

import gensim
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import numpy as np
import re
import pickle
import os
import spacy

Загрузка и распаковка данных

In [None]:
!wget -c https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar xvzf aclImdb_v1.tar.gz

Определение датасета для удобства обращения к данным

In [None]:
class ReviewsDataset(Dataset):
  """Map-style dataset over a directory with ``pos`` and ``neg`` review folders.

  Samples alternate between the two folders: even indices are read from
  ``pos`` and odd indices from ``neg``. Each file name is expected to contain
  two integers (review id, then rating); the rating is taken from the second.
  """

  def __init__(self, root_dir):
    """Collect the review file names found under ``root_dir``.

    Args:
      root_dir: directory containing the ``pos`` and ``neg`` subfolders.
    """
    self.pos_reviews_dir = os.path.join(root_dir, "pos")
    # os.listdir returns names in arbitrary order; sorting makes a given index
    # map to the same review on every run and on every machine.
    self.pos_reviews_filenames = sorted(os.listdir(self.pos_reviews_dir))
    self.neg_reviews_dir = os.path.join(root_dir, "neg")
    self.neg_reviews_filenames = sorted(os.listdir(self.neg_reviews_dir))

  def __len__(self):
    """Return the total number of positive and negative review files."""
    return len(self.pos_reviews_filenames) + len(self.neg_reviews_filenames)

  def __getitem__(self, idx):
    """Load a single review.

    Args:
      idx: sample index; even indices select ``pos`` file ``idx // 2``,
        odd indices select ``neg`` file ``(idx - 1) // 2``.

    Returns:
      dict with keys ``"Text"`` (the raw file contents as one string) and
      ``"Rating"`` (the rating parsed from the file name, minus one).

    Raises:
      IndexError: if ``idx`` points past the end of the selected folder's
        file list, or the file name does not contain two integers.
    """
    if idx % 2 == 0:
      reviews_dir, filenames = self.pos_reviews_dir, self.pos_reviews_filenames
    else:
      reviews_dir, filenames = self.neg_reviews_dir, self.neg_reviews_filenames
    # With floor division idx // 2 == (idx - 1) // 2 for odd idx, so a single
    # expression serves both folders.
    review_idx_filename = filenames[idx // 2]
    id_and_rating = re.findall(r"\d+", review_idx_filename)
    # Read the file as plain text. Converting the result of readlines() with
    # str() would yield the repr of a list (brackets, quotes, escape
    # sequences) instead of the review itself.
    # NOTE(review): the files are assumed to be UTF-8 encoded - confirm.
    with open(os.path.join(reviews_dir, review_idx_filename), "r", encoding="utf-8") as review:
      data = review.read()
    # Ratings are stored shifted down by one relative to the file name.
    rating = int(id_and_rating[1]) - 1
    return {"Text": data, "Rating": rating}

Создание тренировочного и тестового датасетов

In [None]:
# The archive is unpacked into the current working directory, so the splits
# are addressed relative to it rather than through a machine-specific
# absolute path.
data_train = ReviewsDataset(os.path.join("aclImdb", "train"))
data_test = ReviewsDataset(os.path.join("aclImdb", "test"))

Извлечение текстов отзывов и оценок отзывов

In [None]:
# Each split is materialised in its own loop, so the test split is not
# indexed using the length of the training split. Every sample is fetched
# exactly once: each data_*[i] access opens and reads the review file again.
reviews_texts_train = []
reviews_rating_train = []
for i in range(len(data_train)):
    sample = data_train[i]
    reviews_texts_train.append(sample['Text'])
    reviews_rating_train.append(sample['Rating'])

reviews_texts_test = []
reviews_rating_test = []
for i in range(len(data_test)):
    sample = data_test[i]
    reviews_texts_test.append(sample['Text'])
    reviews_rating_test.append(sample['Rating'])

Подгрузка Spacy для предобработки текстов

In [None]:
# Load the English pipeline by its full package name: the 'en' shortcut link
# is not available in spaCy v3, whereas the package name works wherever the
# model package is installed.
# NOTE(review): assumes the en_core_web_sm package is installed - confirm.
nlp = spacy.load('en_core_web_sm')
# 'not' carries the negation, so it is kept in the texts.
not_stop_words = set(['not'])
# Build a new set instead of using -=, which would modify the shared
# nlp.Defaults.stop_words set in place.
stop_words = nlp.Defaults.stop_words - not_stop_words

Лемматизация текстов; исключение из текстов слов, входящих в список стоп-слов

In [None]:
# Lemmatise the training reviews and drop every token whose lemma is in the
# stop-word set.
cleaned_tokenized_reviews_texts_train = []
i = 0

# nlp.pipe streams the texts through the pipeline in batches instead of
# invoking the pipeline once per review.
for i, tokenized_text in enumerate(nlp.pipe(reviews_texts_train), start=1):
    cleaned_tokenized_text = [token.lemma_ for token in tokenized_text if token.lemma_ not in stop_words]
    cleaned_tokenized_reviews_texts_train.append(cleaned_tokenized_text)
    # Report progress only every 1000 reviews so the cell output stays short.
    if i % 1000 == 0:
        print(i)

In [None]:
# Lemmatise the test reviews and drop every token whose lemma is in the
# stop-word set.
cleaned_tokenized_reviews_texts_test = []
i = 0

# nlp.pipe streams the texts through the pipeline in batches instead of
# invoking the pipeline once per review.
for i, tokenized_text in enumerate(nlp.pipe(reviews_texts_test), start=1):
  cleaned_tokenized_text = [token.lemma_ for token in tokenized_text if token.lemma_ not in stop_words]
  cleaned_tokenized_reviews_texts_test.append(cleaned_tokenized_text)
  # Report progress only every 1000 reviews so the cell output stays short.
  if i % 1000 == 0:
    print(i)

Формирование меток: оценка отзыва и тональность текста (положительная / отрицательная)

In [None]:
# ReviewsDataset stores each rating shifted down by one (file-name rating - 1).
# Per the dataset README, positive reviews are those rated >= 7 on the original
# scale, which corresponds to >= 6 on the stored scale. Comparing the stored
# value with 7 would label reviews originally rated 7 as negative.
POSITIVE_RATING_THRESHOLD = 7 - 1
y_sentiment_train = [1 if rating >= POSITIVE_RATING_THRESHOLD else 0 for rating in reviews_rating_train]
y_sentiment_test = [1 if rating >= POSITIVE_RATING_THRESHOLD else 0 for rating in reviews_rating_test]
# Regression targets are the stored (shifted) ratings themselves.
y_ratings_train = reviews_rating_train
y_ratings_test = reviews_rating_test

Создание и тренировка моделей FastText и Doc2Vec (векторы отзывов далее получаются с помощью Doc2Vec)

In [None]:
# Train FastText embeddings on the cleaned training reviews.
# The corpus is passed positionally because the keyword for it differs between
# gensim releases (`sentences` in 3.x, `corpus_iterable` in 4.x).
# NOTE(review): the name `model` is rebound to a Doc2Vec model in the next
# cell, so this FastText model is not used by the later cells - confirm intent.
model = FastText(window=5, min_count=1, max_vocab_size=3000)
model.build_vocab(cleaned_tokenized_reviews_texts_train)
model.train(
    cleaned_tokenized_reviews_texts_train,
    total_examples=len(cleaned_tokenized_reviews_texts_train),
    epochs=10,
)

In [None]:
# Wrap every cleaned training review in a TaggedDocument whose single tag is
# the review's position in the list, then train Doc2Vec on those documents.
X_train = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(cleaned_tokenized_reviews_texts_train)
]
model = Doc2Vec(documents=X_train, epochs=10)

In [None]:
# Infer one document vector per cleaned training review.
X_train = list(map(model.infer_vector, cleaned_tokenized_reviews_texts_train))

In [None]:
# Infer one document vector per cleaned test review.
X_test = list(map(model.infer_vector, cleaned_tokenized_reviews_texts_test))

Создание и обучение предобработчика данных (стандартизация признаков)

In [None]:
# Fit the scaler on the training vectors only, then apply that same fitted
# scaler to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled_train, X_scaled_test = scaler.transform(X_train), scaler.transform(X_test)

Тренировка логистической и линейной регрессий

In [None]:
# Sentiment classifier trained on the scaled training vectors.
clf = LogisticRegression(max_iter=250).fit(X_scaled_train, y_sentiment_train)

In [None]:
# Rating regressor trained on the scaled training vectors.
reg = LinearRegression().fit(X_scaled_train, y_ratings_train)

Вывод метрик полученных моделей

In [None]:
# Score both fitted models on the held-out test split, then report the values.
test_accuracy = clf.score(X_scaled_test, y_sentiment_test)
test_r2 = reg.score(X_scaled_test, y_ratings_test)
print("Accuracy of predictions on test data: {}".format(test_accuracy))
print("R2 metrics on test data: {}".format(test_r2))

Accuracy of predictions on test data: 0.78932
R2 metrics on test data: 0.3961634427820562


Сохранение натренированных моделей

In [None]:
# Persist the fitted estimators. Each file is opened in a `with` block so the
# handle is flushed and closed as soon as the pickle has been written.
for fitted_object, target_path in (
    (clf, 'clf_model.pkl'),
    (reg, 'reg_model.pkl'),
    (scaler, 'scaler_model.pkl'),
):
    with open(target_path, 'wb') as out_file:
        pickle.dump(fitted_object, out_file)
# The gensim model is written with its own save method.
model.save('model_model')