In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Switch the working directory to the project folder on the mounted drive;
# the relative file names this notebook reads and writes resolve against it.
os.chdir('/content/drive/MyDrive/reviews_analysis')

In [3]:
import pandas as pd
from datetime import datetime

In [4]:
def count_duplicates(df):
  """Count how many times each distinct row occurs in ``df``.

  Rows are grouped on all columns. ``dropna=False`` keeps rows that contain
  missing values: by default ``groupby`` discards any group whose key holds
  NaN, so such rows would silently disappear from the counts.

  Args:
    df: pandas DataFrame to inspect.

  Returns:
    DataFrame with one row per distinct input row plus a ``size`` column
    holding the number of occurrences of that row.
  """
  return df.groupby(df.columns.tolist(), as_index=False, dropna=False).size()

In [5]:
def filter_duplicates(df):
  """Return one row per distinct combination of column values in ``df``.

  Delegates the grouping to ``count_duplicates`` and then discards the
  ``size`` counter column it adds.
  """
  counted = count_duplicates(df)
  return counted.drop(columns=['size'])

In [6]:
# Load the raw reviews; the 'date' column is parsed with an explicit timestamp format.
reviews = pd.read_csv(
  'all_reviews.csv',
  encoding='utf-8',
  parse_dates=['date'],
  date_format="%Y-%m-%d %H:%M:%S.%f",
)

In [7]:
# Preview the first rows to sanity-check the parsed columns.
reviews.head()

Unnamed: 0,id,date,stars,review_text
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э..."
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,..."
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на..."
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н..."
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу..."


In [8]:
# Column dtypes and non-null counts; a count below the number of entries means missing values.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12221 entries, 0 to 12220
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           12221 non-null  int64         
 1   date         12221 non-null  datetime64[ns]
 2   stars        12218 non-null  float64       
 3   review_text  12219 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 382.0+ KB


In [9]:
def find_NaN(df):
  """Return the rows of ``df`` that contain at least one missing value.

  Args:
    df: pandas DataFrame to inspect.

  Returns:
    The subset of ``df`` (original row labels and columns kept) in which any
    column is NaN/None.
  """
  has_missing = df.isna().any(axis=1)
  return df.loc[has_missing]

In [10]:
# List every review that has at least one missing field.
find_NaN(reviews)

Unnamed: 0,id,date,stars,review_text
1708,25010521811,2024-03-26 16:45:23.734,5.0,
4413,74794040308,2024-06-05 19:38:31.052,,Здравствуйте. Хочу пожаловаться на выданный за...
5152,88497708729,2022-09-16 19:22:52.779,,30 секунд и все что заказал получил... Ходить ...
10562,207911795530,2024-02-06 13:21:56.288,5.0,
11248,221560574924,2021-06-26 17:28:32.749,,"Небольшой, уютный"


In [94]:
# Drop every row with any missing value (a missing rating or a missing text);
# the surviving rows keep their original row labels. Then confirm no nulls remain.
filtered_reviews = reviews.dropna()
filtered_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12216 entries, 0 to 12220
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           12216 non-null  int64         
 1   date         12216 non-null  datetime64[ns]
 2   stars        12216 non-null  float64       
 3   review_text  12216 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 477.2+ KB


In [37]:
from transformers import pipeline

In [41]:
# Load the Russian sentiment classifier by its Hugging Face model id.
# NOTE(review): top_k=None is presumably what makes the pipeline return a score for
# every label instead of only the best one — confirm against the transformers docs.
model = pipeline(model="seara/rubert-base-cased-russian-sentiment", top_k=None)

In [30]:
def get_value_by_key(l, key):
  """Look up the score attached to a label in a list of prediction dicts.

  Args:
    l: iterable of dicts; each is read through its 'label' and 'score' entries.
    key: label to search for.

  Returns:
    The 'score' entry of the first dict whose 'label' equals ``key``
    (``None`` if that dict has no 'score'); 0 when no dict matches.
  """
  matching_scores = (entry.get('score') for entry in l if entry.get('label') == key)
  # next() stops at the first match and falls back to 0 when there is none.
  return next(matching_scores, 0)

In [31]:
def get_sentiment_analysis(data, model):
  """Attach per-label sentiment scores to a copy of ``data``.

  Args:
    data: DataFrame with a 'review_text' column; it is not modified.
    model: object exposing ``predict(list_of_texts)``; each returned item is
      passed to ``get_value_by_key`` to extract a label's score.

  Returns:
    A copy of ``data`` with three added columns, 'positive', 'neutral' and
    'negative', holding the score of each label per row (0 when the label is
    absent from a prediction).
  """
  texts = data['review_text'].to_list()
  predictions = model.predict(texts)

  scored = data.copy()
  for label in ('positive', 'neutral', 'negative'):
    scored[label] = [get_value_by_key(prediction, label) for prediction in predictions]

  return scored

In [None]:
# Build the length column with assign() so a new frame is bound to the name instead of
# writing into the result of dropna(), which can raise SettingWithCopyWarning.
# Re-running this cell is safe: it always produces the same frame.
filtered_reviews = filtered_reviews.assign(text_length=filtered_reviews['review_text'].str.len())

In [96]:
# Keep only the reviews whose text is shorter than 2000 characters.
filter_len = filtered_reviews.loc[filtered_reviews['text_length'].lt(2000)]

In [97]:
# Show the reviews left out of `filter_len` (which keeps text_length < 2000): use >= so a
# review of exactly 2000 characters is listed too instead of falling between the two filters.
filtered_reviews[filtered_reviews['text_length'] >= 2000]

Unnamed: 0,id,date,stars,review_text,text_length
57,1347737832,2022-08-18 17:49:30.390,1.0,Снова оставлю свой отпечаток тут в виде текста...,5807
2540,44551846782,2021-12-25 09:21:03.003,5.0,"Почитала я комментарии,и понимаю на сколько ко...",2215
3827,68721290739,2022-12-27 18:33:05.070,1.0,Всем добрый день. Плюсы... ничего не могу сказ...,2251
6241,120290578182,2024-03-24 12:04:09.386,1.0,Уже несколько лет забираю покупки в «Озоне» по...,2823
7787,156334246917,2023-01-27 13:19:03.955,1.0,"САМЫЙ ТОРМОЗНУТЫЙ ПУНКТ. Добрый день, пишу жал...",2325
11366,227211064249,2022-01-11 14:39:39.372,1.0,"Так-же отпишусь, отзыв не к пункту выдачи, а о...",2212


In [79]:
# Score every review that passed the length filter.
# The saved output of this cell is followed by a "31 minutes" note, so expect a long run.
sent_rev = get_sentiment_analysis(filter_len, model)
sent_rev

Unnamed: 0,id,date,stars,review_text,text_lenght,positive,neutral,negative
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э...",331,0.400817,0.547377,0.051806
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,...",67,0.996105,0.002512,0.001383
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на...",257,0.327667,0.642293,0.030039
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н...",76,0.996277,0.002724,0.000999
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу...",68,0.996340,0.002836,0.000824
...,...,...,...,...,...,...,...,...
12216,245536924647,2022-05-03 05:31:04.866,5.0,Быстрое обслуживание,20,0.719184,0.274255,0.006561
12217,245536924647,2022-12-18 13:21:45.710,5.0,Всё по делу,11,0.016143,0.968565,0.015291
12218,245536924647,2021-10-05 16:58:36.684,5.0,Высокий уровень обслуживания,28,0.305603,0.668827,0.025571
12219,245536924647,2022-05-11 15:30:47.822,5.0,Спасибо Оzon,12,0.977438,0.012087,0.010476


31 minutes

In [80]:
def get_marks(data, weights=None):
  """Turn sentiment scores into a single numeric mark per row.

  The mark is the weighted average of the label scores:
  ``(w_pos*positive + w_neu*neutral + w_neg*negative) / (positive + neutral + negative)``.

  Args:
    data: DataFrame with 'positive', 'neutral' and 'negative' columns; it is
      not modified.
    weights: optional dict overriding the mark assigned to a label. Missing
      labels keep the defaults ``{'positive': 5, 'neutral': 4, 'negative': 1}``,
      so calling ``get_marks(data)`` behaves exactly as before.

  Returns:
    A copy of ``data`` with an added 'mark' column. Rows whose three scores
    sum to 0 get a non-finite mark, because the division is not guarded.
  """
  label_weights = {'positive': 5, 'neutral': 4, 'negative': 1}
  if weights is not None:
    label_weights.update(weights)

  df = data.copy()
  # Normalise by the score total so the mark does not rely on the scores summing to 1.
  total = df['positive'] + df['neutral'] + df['negative']
  weighted = (
    label_weights['positive'] * df['positive']
    + label_weights['neutral'] * df['neutral']
    + label_weights['negative'] * df['negative']
  )
  df['mark'] = weighted / total
  return df

In [81]:
# Add the sentiment-derived 'mark' column next to the user-given 'stars'.
estimated = get_marks(sent_rev)
estimated

Unnamed: 0,id,date,stars,review_text,text_lenght,positive,neutral,negative,mark
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э...",331,0.400817,0.547377,0.051806,4.245399
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,...",67,0.996105,0.002512,0.001383,4.991957
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на...",257,0.327667,0.642293,0.030039,4.237549
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н...",76,0.996277,0.002724,0.000999,4.993279
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу...",68,0.996340,0.002836,0.000824,4.993867
...,...,...,...,...,...,...,...,...,...
12216,245536924647,2022-05-03 05:31:04.866,5.0,Быстрое обслуживание,20,0.719184,0.274255,0.006561,4.699502
12217,245536924647,2022-12-18 13:21:45.710,5.0,Всё по делу,11,0.016143,0.968565,0.015291,3.970269
12218,245536924647,2021-10-05 16:58:36.684,5.0,Высокий уровень обслуживания,28,0.305603,0.668827,0.025571,4.228890
12219,245536924647,2022-05-11 15:30:47.822,5.0,Спасибо Оzon,12,0.977438,0.012087,0.010476,4.946011


In [84]:
from sklearn.metrics import mean_absolute_error

In [85]:
# Mean absolute difference between the user rating ('stars') and the sentiment-derived 'mark'.
mean_absolute_error(estimated['stars'], estimated['mark'])

0.4357123176483525

In [92]:
# Save the scored reviews without the helper length column.
(
  estimated
  .drop(columns=['text_length'])
  .to_csv('rubert-base-cased.csv', encoding='utf-8')
)