In [2]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.utils import to_categorical
import itertools

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


##Классификация обзоров ресторанов


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
rawdata = pd.read_csv('/content/drive/MyDrive/raw_train.csv',names=['_','review'])
rawdata

Unnamed: 0,_,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...
...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...
559996,2,Professional \nFriendly\nOn time AND affordabl...
559997,1,Phone calls always go to voicemail and message...
559998,1,Looks like all of the good reviews have gone t...


Возьмем 10% от датасета, переведем характер отзыва в классический булевый тип 0/1. Обработаем текст отзывов: приведем к нижнему регистру, удалим пунктуацию, цифры и стоп слова, применим Стемминг и Лемматизацию.

In [23]:
data=rawdata[:round(len(rawdata)*0.1)].copy()
data

Unnamed: 0,_,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...
...,...,...
55995,1,I took a party of 6 friends and co-workers the...
55996,2,Stop by for lunch based on Yelp reviews. Tasty...
55997,2,Best persian food ever.\nWOW!\n\nPossibly the ...
55998,1,"Habibbbbbb is the rudest, meanest, uglyist, gu..."


In [24]:
def preprocess_text(text: str):
    x=(re.sub(r"[.,;$&!?=\\_`'-/:#~]+|[\d]+", " ", text.lower()))# удаление пунктуации цифр и приведение к нижнему регистру
    x=x.translate({ord(i): None for i in '"{}%@^|+[]'})
    stop=nltk.corpus.stopwords.words('english')
    cleartext=' '.join([i for i in x.split() if i not in stop])
    sn=nltk.SnowballStemmer('english')
    wn=nltk.WordNetLemmatizer()
    restext=[wn.lemmatize(sn.stem(i)) for i in cleartext.split()]#лемматизация
    return restext

In [25]:
data._[data._==1]=0
data._[data._==2]=1
data['review'] =[preprocess_text(i) for i in data['review']]
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,_,review
0,0,"[unfortun, frustrat, dr, goldberg, patient, re..."
1,1,"[go, dr, goldberg, year, think, one, st, patie..."
2,0,"[know, dr, goldberg, like, move, arizona, let,..."
3,0,"[write, review, give, head, see, doctor, offic..."
4,1,"[food, great, best, thing, wing, wing, simpli,..."
...,...,...
55995,0,"[took, parti, friend, co, worker, last, week, ..."
55996,1,"[stop, lunch, base, yelp, review, tasti, food,..."
55997,1,"[best, persian, food, ever, nwow, n, npossibl,..."
55998,0,"[habibbbbbb, rudest, meanest, uglyist, guy, ev..."


Создаем два класса, которые делают датасет с векторным представлением отзыва. За основу взят принцип токенизации встречающихся слов. Длина вектора - количество встречающихся в отзывах выборки уникальных слов.

In [26]:
class Vocab:
  def __init__(self, X):
    self.token=list(np.unique(list(itertools.chain.from_iterable([i for i in data['review']]))))
    self.idx = range(len(self.token))
    self.idx_to_token = dict(zip(self.idx, self.token))
    self.token_to_idx = dict(zip(self.token, self.idx))
    self.vocab_len = len(self.idx_to_token)

class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, words):
    vector = torch.zeros(self.vocab.vocab_len, dtype=torch.float32)
    for l in words:
      vector[self.vocab.token_to_idx[l]] = 1
    return vector
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    X = self.vectorize(self.X[idx])
    y = torch.tensor(self.y[idx], dtype=torch.float32)
    return X, y

In [27]:
dataset = ReviewDataset([i for i in data['review']], to_categorical(data._,2),Vocab([i for i in data['review']]))

Разбиваем данные на обучающую и тестовую выборки. Создаем нейронную сеть с 5 линейными слоями.

In [28]:
dataset_train_size = round(len(dataset)*0.8)
dataset_test_size = len(dataset) - dataset_train_size
dataset_train, dataset_test = torch.utils.data.random_split(dataset, [dataset_train_size, dataset_test_size])
batch_size=2048
num_epochs = 5
num_classes = 2
learning_rate = 0.01
trainloader = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
valloader = DataLoader(dataset=dataset_test, batch_size=batch_size)

In [29]:
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(0)


class Classifier(nn.Module):
  def __init__(self, input_features, output_features):
    super(Classifier, self).__init__()
    self.layer1 = nn.Linear(input_features, 512)
    self.batchnorm1 = nn.BatchNorm1d(512)
    self.layer2 = nn.Linear(512, 256)
    self.batchnorm2 = nn.BatchNorm1d(256)
    self.layer3 = nn.Linear(256, 128)
    self.batchnorm3 = nn.BatchNorm1d(128)
    self.layer4 = nn.Linear(128, 64)
    self.batchnorm4 = nn.BatchNorm1d(64)
    self.layer5= nn.Linear(64, output_features)

  def forward(self, x, training=True):
    x = self.layer1(x)
    x = self.batchnorm1(x)
    x = F.relu(x)
    x = F.dropout(x, training=training)
    x = self.layer2(x)
    x = self.batchnorm2(x)
    x = F.relu(x)
    x = self.layer3(x)
    x = self.batchnorm3(x)
    x = F.relu(x)
    x = self.layer4(x)
    x = self.batchnorm4(x)
    x = F.relu(x)
    x = self.layer5(x)
    x = F.softmax(x, dim=1)
    return x


model = Classifier(dataset_train[0][0].size(0), dataset_train[0][1].size(0))

In [30]:
model

Classifier(
  (layer1): Linear(in_features=42450, out_features=512, bias=True)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer2): Linear(in_features=512, out_features=256, bias=True)
  (batchnorm2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer3): Linear(in_features=256, out_features=128, bias=True)
  (batchnorm3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer4): Linear(in_features=128, out_features=64, bias=True)
  (batchnorm4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer5): Linear(in_features=64, out_features=2, bias=True)
)

In [31]:
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Функция по нахождению accuracy для тестовых данных.

In [32]:
def test_accuracy(model,dataloader):
  criterion = nn.CrossEntropyLoss()
  model.eval()
  accuracy = 0
  epoch_loss=0
  with torch.no_grad():
    for X_batch, y_batch in dataloader:
      y_pred = model(X_batch)
      loss = criterion(y_pred, y_batch)
      y_batch = torch.argmax(y_batch, axis=1)
      y_pred = torch.argmax(y_pred, axis=1)
      accuracy += torch.sum(y_batch == y_pred)
      epoch_loss += loss.item()
  accuracy = accuracy.item() / len(dataloader.dataset)
  return accuracy, epoch_loss

Обучаем модель, смотрим на значение ошибки и accuracy на обучающих данных и тествовых по эпохам.

In [33]:
total_step = len(trainloader)
loss_list = []
for epoch in range(num_epochs):
  for i, (X, y) in enumerate(trainloader,0):
    outputs = model(X,training=True)
    loss = criterion(outputs, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total = y.size(0)
  train_accuracy_,train_loss = test_accuracy(model,trainloader)
  print('Epoch [{}/{}], Loss: {:.4f}, trainAccuracy: {:.2f}%'
  .format(epoch + 1, num_epochs,(train_loss/len(trainloader)), train_accuracy_* 100))
  test_accuracy_,test_loss = test_accuracy(model,valloader)
  print('Loss: {:.4f}, trainAccuracy: {:.2f}%'.format((test_loss/len(valloader)), test_accuracy_* 100))
  loss_list.append((test_loss/len(valloader)))

Epoch [1/5], Loss: 0.3895, trainAccuracy: 92.05%
Loss: 0.4257, trainAccuracy: 88.27%
Epoch [2/5], Loss: 0.3669, trainAccuracy: 94.44%
Loss: 0.4121, trainAccuracy: 89.69%
Epoch [3/5], Loss: 0.3525, trainAccuracy: 96.02%
Loss: 0.4080, trainAccuracy: 90.07%
Epoch [4/5], Loss: 0.3453, trainAccuracy: 96.76%
Loss: 0.4099, trainAccuracy: 89.95%
Epoch [5/5], Loss: 0.3423, trainAccuracy: 97.04%
Loss: 0.4117, trainAccuracy: 89.83%


Проверим работу модели на случайных отзывах.
В изначальном датасете 1- отрицательный отзыв. 2 - положительный отзыв.

In [34]:
def vectorize(words, vocab):
  vector = torch.zeros(vocab.vocab_len, dtype=torch.float32)
  for l in words:
    vector[vocab.token_to_idx[l]] = 1
  return vector

In [52]:
reviewone =  rawdata['review'][435]
print(rawdata['_'][435])
pred=model(vectorize( preprocess_text(reviewone),Vocab(data['review'])).unsqueeze(0))
print('Bad: {:.4f}%, Good: {:.2f}%'.format(pred.tolist()[0][0]*100, pred.tolist()[0][1]*100))

1
Bad: 100.0000%, Good: 0.00%


In [56]:
reviewone =  rawdata['review'][0]
print(rawdata['_'][0])
pred=model(vectorize( preprocess_text(reviewone),Vocab(data['review'])).unsqueeze(0))
print('Bad: {:.4f}%, Good: {:.2f}%'.format(pred.tolist()[0][0]*100, pred.tolist()[0][1]*100))

1
Bad: 100.0000%, Good: 0.00%


In [54]:
reviewone =  rawdata['review'][436366]
print(rawdata['_'][436366])
pred=model(vectorize( preprocess_text(reviewone),Vocab(data['review'])).unsqueeze(0))
print('Bad: {:.4f}%, Good: {:.2f}%'.format(pred.tolist()[0][0]*100, pred.tolist()[0][1]*100))

2
Bad: 0.0000%, Good: 100.00%



In [55]:
reviewone = 'all is great perfect brilliant'
pred=model(vectorize(preprocess_text(reviewone),Vocab(data['review'])).unsqueeze(0))
print('Bad: {:.4f}%, Good: {:.2f}%'.format(pred.tolist()[0][0]*100, pred.tolist()[0][1]*100))

Bad: 0.0000%, Good: 100.00%


Построенная модель показывает хорошую точность. Модель корректно опрделяет тип отзыва. Главным недостатком такого подхода является условие, что все слова изучаемого отзыва должны были присутствовать в списке слов при обучении для корректноо создания вектора.