In [3]:
# Mount Google Drive into the Colab runtime so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Make the project folder on Drive the working directory; the relative file names
# used later ('all_reviews.csv', 'rubert-tiny2.csv') are resolved against it.
import os
os.chdir('/content/drive/MyDrive/reviews_analysis')

In [48]:
import pandas as pd
import numpy as np
from datetime import datetime

In [1]:
from transformers import pipeline

In [6]:
# Load the raw reviews; 'date' is parsed with an explicit timestamp format
# (including fractional seconds) instead of relying on format inference.
# NOTE(review): the `date_format` argument requires pandas >= 2.0 — confirm the runtime version.
reviews = pd.read_csv('all_reviews.csv', encoding='utf-8', parse_dates=['date'], date_format="%Y-%m-%d %H:%M:%S.%f")

In [8]:
# Preview the first rows to sanity-check the parsed columns.
reviews.head()

Unnamed: 0,id,date,stars,review_text
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э..."
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,..."
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на..."
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н..."
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу..."


In [9]:
# Column dtypes and non-null counts — reveals which columns contain missing values.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12221 entries, 0 to 12220
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           12221 non-null  int64         
 1   date         12221 non-null  datetime64[ns]
 2   stars        12218 non-null  float64       
 3   review_text  12219 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 382.0+ KB


In [10]:
def find_NaN(df):
  """Return the rows of ``df`` that have a missing value in at least one column.

  The input frame is not modified; the returned rows keep their original index labels.
  """
  row_has_missing = df.isnull().any(axis='columns')
  return df.loc[row_has_missing]

In [11]:
# Show the rows that contain at least one missing value.
find_NaN(reviews)

Unnamed: 0,id,date,stars,review_text
1708,25010521811,2024-03-26 16:45:23.734,5.0,
4413,74794040308,2024-06-05 19:38:31.052,,Здравствуйте. Хочу пожаловаться на выданный за...
5152,88497708729,2022-09-16 19:22:52.779,,30 секунд и все что заказал получил... Ходить ...
10562,207911795530,2024-02-06 13:21:56.288,5.0,
11248,221560574924,2021-06-26 17:28:32.749,,"Небольшой, уютный"


In [12]:
# Drop every row that has a missing value in any column. `dropna()` returns a new
# frame, so `reviews` itself stays untouched; `.info()` confirms no nulls remain.
filtered_reviews = reviews.dropna()
filtered_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12216 entries, 0 to 12220
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           12216 non-null  int64         
 1   date         12216 non-null  datetime64[ns]
 2   stars        12216 non-null  float64       
 3   review_text  12216 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 477.2+ KB


In [76]:
# Build a Hugging Face pipeline for the given sentiment model id.
# top_k=None asks for the score of every label per text rather than only the best one;
# get_sentiment_analysis reads the 'positive' / 'neutral' / 'negative' scores from that list.
model = pipeline(model="seara/rubert-tiny2-russian-sentiment", top_k=None)

In [112]:
def get_value_by_key(l, key):
  """Look up the score attached to a label in a list of prediction dicts.

  Args:
    l: iterable of dicts, each expected to carry a 'label' and a 'score' entry.
    key: the label to search for.

  Returns:
    The 'score' value of the first dict whose 'label' equals ``key``
    (``None`` if that dict has no 'score' entry), or 0 when no dict matches.
  """
  matching_scores = (entry.get('score') for entry in l if entry.get('label') == key)
  # Only the first match matters; the default 0 is used when nothing matched.
  return next(matching_scores, 0)

In [87]:
def get_sentiment_analysis(data, model):
  """Score every review text and return a copy of ``data`` with the scores attached.

  Args:
    data: DataFrame with a 'review_text' column.
    model: object whose ``predict(list_of_texts)`` returns, for each text, a list of
      dicts with 'label' and 'score' entries (the format get_value_by_key consumes).

  Returns:
    A new DataFrame with the columns of ``data`` plus 'positive', 'neutral' and
    'negative' score columns; a label absent from a prediction is scored 0.
    ``data`` itself is not modified.
  """
  predictions = model.predict(data['review_text'].to_list())
  score_columns = {
      label: [get_value_by_key(scores, label) for scores in predictions]
      for label in ('positive', 'neutral', 'negative')
  }
  # assign() works on a copy, so the caller's frame is left untouched.
  return data.assign(**score_columns)

In [88]:
# Score every cleaned review; the result keeps the original columns and adds
# one score column per sentiment label.
sent_rev = get_sentiment_analysis(filtered_reviews, model)
sent_rev

Unnamed: 0,id,date,stars,review_text,positive,neutral,negative
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э...",0.231608,0.549817,0.218575
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,...",0.995171,0.003455,0.001373
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на...",0.606572,0.368446,0.024981
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н...",0.993947,0.004638,0.001415
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу...",0.994134,0.004789,0.001077
...,...,...,...,...,...,...,...
12216,245536924647,2022-05-03 05:31:04.866,5.0,Быстрое обслуживание,0.910467,0.080456,0.009077
12217,245536924647,2022-12-18 13:21:45.710,5.0,Всё по делу,0.055411,0.769480,0.175109
12218,245536924647,2021-10-05 16:58:36.684,5.0,Высокий уровень обслуживания,0.263950,0.649896,0.086154
12219,245536924647,2022-05-11 15:30:47.822,5.0,Спасибо Оzon,0.952002,0.044298,0.003700


In [111]:
# Average user-given star rating over the scored reviews.
sent_rev['stars'].mean()

4.138425016371971

In [105]:
def get_marks(data, weights=None):
  """Convert per-label sentiment scores into an estimated star mark.

  The mark is the weighted average of the star values assigned to each label,
  weighted by that label's score and normalised by the sum of the scores (so the
  scores do not need to add up to exactly 1).

  Args:
    data: DataFrame holding one score column per label named in ``weights``.
    weights: optional mapping ``{score column name: star value}``. Defaults to
      ``{'positive': 5, 'neutral': 4, 'negative': 1}``, the values this notebook
      was originally hard-coded with.

  Returns:
    A copy of ``data`` with an added 'mark' column. Rows whose scores sum to
    zero get NaN. ``data`` itself is not modified.

  Raises:
    KeyError: if a column named in ``weights`` is missing from ``data``.
  """
  if weights is None:
    weights = {'positive': 5, 'neutral': 4, 'negative': 1}

  df = data.copy()
  # Accumulate in the mapping's order so the default reproduces the original
  # expression (5*positive + 4*neutral + 1*negative) / (positive + neutral + negative).
  weighted_sum = sum(star_value * df[column] for column, star_value in weights.items())
  score_total = sum(df[column] for column in weights)
  df['mark'] = weighted_sum / score_total
  return df

In [107]:
# Turn the three sentiment scores of each review into a single estimated star mark.
estimated = get_marks(sent_rev)
estimated

Unnamed: 0,id,date,stars,review_text,positive,neutral,negative,mark
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э...",0.231608,0.549817,0.218575,3.575884
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,...",0.995171,0.003455,0.001373,4.991051
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на...",0.606572,0.368446,0.024981,4.531628
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н...",0.993947,0.004638,0.001415,4.989702
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу...",0.994134,0.004789,0.001077,4.990904
...,...,...,...,...,...,...,...,...
12216,245536924647,2022-05-03 05:31:04.866,5.0,Быстрое обслуживание,0.910467,0.080456,0.009077,4.883237
12217,245536924647,2022-12-18 13:21:45.710,5.0,Всё по делу,0.055411,0.769480,0.175109,3.530085
12218,245536924647,2021-10-05 16:58:36.684,5.0,Высокий уровень обслуживания,0.263950,0.649896,0.086154,4.005486
12219,245536924647,2022-05-11 15:30:47.822,5.0,Спасибо Оzon,0.952002,0.044298,0.003700,4.940901


In [118]:
# Inspect the reviews whose text is longer than 2000 characters.
# NOTE(review): presumably a check of how the model behaves on very long inputs — confirm the intent of the 2000 threshold.
estimated[estimated['review_text'].str.len()>2000]

Unnamed: 0,id,date,stars,review_text,positive,neutral,negative,mark
57,1347737832,2022-08-18 17:49:30.390,1.0,Снова оставлю свой отпечаток тут в виде текста...,0.019051,0.25466,0.726289,1.840185
2540,44551846782,2021-12-25 09:21:03.003,5.0,"Почитала я комментарии,и понимаю на сколько ко...",0.009639,0.184459,0.805902,1.591933
3827,68721290739,2022-12-27 18:33:05.070,1.0,Всем добрый день. Плюсы... ничего не могу сказ...,0.024272,0.231117,0.744611,1.79044
6241,120290578182,2024-03-24 12:04:09.386,1.0,Уже несколько лет забираю покупки в «Озоне» по...,0.02345,0.321796,0.654754,2.059187
7787,156334246917,2023-01-27 13:19:03.955,1.0,"САМЫЙ ТОРМОЗНУТЫЙ ПУНКТ. Добрый день, пишу жал...",0.003047,0.06741,0.929543,1.214417
11366,227211064249,2022-01-11 14:39:39.372,1.0,"Так-же отпишусь, отзыв не к пункту выдачи, а о...",0.026385,0.310588,0.663027,2.037302


In [108]:
from sklearn.metrics import mean_absolute_error

In [109]:
# Mean absolute error between the users' star ratings and the model-derived marks.
mean_absolute_error(estimated['stars'], estimated['mark'])

0.5088374776357787

In [119]:
# Persist the scored reviews in the working directory. The DataFrame index is written
# too (to_csv default), so it appears as the first, unnamed column of the CSV.
estimated.to_csv('rubert-tiny2.csv', encoding='utf-8')