In [2]:
# Mount Google Drive into the Colab runtime so its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Switch the working directory to the project folder on the mounted Drive,
# so the relative file names used later ('all_reviews.csv', 'dostoevsky.csv') resolve there.
import os
os.chdir('/content/drive/MyDrive/reviews_analysis')

In [4]:
import pandas as pd
from datetime import datetime

In [7]:
# Load the raw reviews; the 'date' column is parsed to datetimes using an explicit
# format (timestamp with fractional seconds) instead of per-value format inference.
reviews = pd.read_csv('all_reviews.csv', encoding='utf-8', parse_dates=['date'], date_format="%Y-%m-%d %H:%M:%S.%f")

In [8]:
# Preview the first rows to sanity-check the columns and the parsed dates.
reviews.head()

Unnamed: 0,id,date,stars,review_text
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э..."
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,..."
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на..."
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н..."
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу..."


In [9]:
# Print dtypes and non-null counts per column to reveal which columns have missing values.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12221 entries, 0 to 12220
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           12221 non-null  int64         
 1   date         12221 non-null  datetime64[ns]
 2   stars        12218 non-null  float64       
 3   review_text  12219 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 382.0+ KB


In [10]:
def find_NaN(df):
  """Return the rows of `df` that have a missing value in at least one column.

  The input frame is not modified; the result keeps the original index labels
  and all columns.
  """
  # Row-wise flag: True where any cell in the row is NaN/None/NaT.
  rows_with_gaps = df.isna().any(axis=1)
  return df[rows_with_gaps]

In [11]:
# Inspect the incomplete rows (missing value in any column) before deciding how to handle them.
find_NaN(reviews)

Unnamed: 0,id,date,stars,review_text
1708,25010521811,2024-03-26 16:45:23.734,5.0,
4413,74794040308,2024-06-05 19:38:31.052,,Здравствуйте. Хочу пожаловаться на выданный за...
5152,88497708729,2022-09-16 19:22:52.779,,30 секунд и все что заказал получил... Ходить ...
10562,207911795530,2024-02-06 13:21:56.288,5.0,
11248,221560574924,2021-06-26 17:28:32.749,,"Небольшой, уютный"


In [12]:
# Drop every row that has a missing value in any column. dropna() returns a new
# frame, so `reviews` itself is left untouched; the original index labels are kept.
filtered_reviews = reviews.dropna()
# Re-check the non-null counts on the cleaned frame.
filtered_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12216 entries, 0 to 12220
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           12216 non-null  int64         
 1   date         12216 non-null  datetime64[ns]
 2   stars        12216 non-null  float64       
 3   review_text  12216 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 477.2+ KB


In [13]:
!pip install dostoevsky

Collecting dostoevsky
  Downloading dostoevsky-0.6.0-py2.py3-none-any.whl (8.5 kB)
Collecting fasttext==0.9.2 (from dostoevsky)
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting razdel==0.5.0 (from dostoevsky)
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Collecting pybind11>=2.2 (from fasttext==0.9.2->dostoevsky)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227136 sha256=fd67d2482db77ed7b7098bb69b205de572ebc1f1e869e01c2e1399bdbc85228b
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected pa

In [14]:
# Download dostoevsky's 'fasttext-social-network-model' data via the package's CLI.
# NOTE(review): presumably this is the file FastTextSocialNetworkModel loads — confirm.
!python -m dostoevsky download fasttext-social-network-model

In [15]:
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

In [16]:
# Build the dostoevsky sentiment model, using its regex-based tokenizer for input text.
model = FastTextSocialNetworkModel(tokenizer=RegexTokenizer())



In [17]:
def get_sentiment_analysis(data, model):
  """Attach sentiment scores to a copy of `data`.

  Args:
    data: DataFrame with a 'review_text' column.
    model: object whose `predict(texts)` returns one dict-like per text,
      mapping label names to scores. The result is iterated once per label,
      so it must be re-iterable (e.g. a list).

  Returns:
    A copy of `data` with 'positive', 'neutral' and 'negative' columns added.
    A label absent from a prediction is stored as 0.
  """
  labels = ['positive', 'neutral', 'negative']
  scored = data.copy()

  predictions = model.predict(scored['review_text'])
  for label in labels:
    # .get() yields None when the model did not report this label for a text.
    scored[label] = [prediction.get(label) for prediction in predictions]

  # Missing labels become an explicit zero score.
  scored[labels] = scored[labels].fillna(0)
  return scored

In [18]:
# Score every cleaned review; adds the positive / neutral / negative columns.
sent_rev = get_sentiment_analysis(filtered_reviews, model)
sent_rev

Unnamed: 0,id,date,stars,review_text,positive,neutral,negative
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э...",0.053413,0.407343,0.314061
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,...",0.938134,0.005740,0.071601
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на...",0.173298,0.245095,0.140346
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н...",0.422515,0.177821,0.177821
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу...",0.777310,0.022987,0.065615
...,...,...,...,...,...,...,...
12216,245536924647,2022-05-03 05:31:04.866,5.0,Быстрое обслуживание,0.228166,0.644235,0.012831
12217,245536924647,2022-12-18 13:21:45.710,5.0,Всё по делу,0.005921,0.855861,0.007131
12218,245536924647,2021-10-05 16:58:36.684,5.0,Высокий уровень обслуживания,0.058356,0.771854,0.026769
12219,245536924647,2022-05-11 15:30:47.822,5.0,Спасибо Оzon,0.000010,0.000677,0.000010


In [19]:
def get_marks(data, weights=None):
  """Turn per-label sentiment scores into a single numeric mark per row.

  The mark is the weighted average of the label weights, using each row's
  sentiment scores as the averaging coefficients:

      mark = sum(weight[label] * score[label]) / sum(score[label])

  Args:
    data: DataFrame containing one score column per label named in `weights`.
    weights: optional mapping {score column name: mark value}. Defaults to
      {'positive': 5, 'neutral': 4, 'negative': 1}.

  Returns:
    A copy of `data` with an added 'mark' column. Rows whose scores sum to
    zero get NaN (0/0), since no weighted average is defined for them.

  Raises:
    ValueError: if `weights` is an empty mapping.
    KeyError: if a column named in `weights` is missing from `data`.
  """
  if weights is None:
    weights = {'positive': 5, 'neutral': 4, 'negative': 1}
  if not weights:
    raise ValueError("weights must contain at least one label")

  df = data.copy()
  # Normaliser: the scores are not assumed to sum to 1, so divide by their total.
  total = sum(df[label] for label in weights)
  weighted = sum(value * df[label] for label, value in weights.items())
  df['mark'] = weighted / total
  return df

In [20]:
# Collapse the three sentiment scores into one estimated mark per review ('mark' column).
estimated = get_marks(sent_rev)
estimated

Unnamed: 0,id,date,stars,review_text,positive,neutral,negative,mark
0,1276123110,2024-01-13 08:17:29.661,5.0,"У меня к озону, как к сервису, и конкретно к э...",0.053413,0.407343,0.314061,2.852932
1,1276123110,2024-04-04 13:23:17.859,5.0,"Очень хороший пункт, удобно расположен, чисто,...",0.938134,0.005740,0.071601,4.712307
2,1276123110,2023-01-01 12:19:35.464,4.0,"Пункт выдачи удобно расположен, очень легко на...",0.173298,0.245095,0.140346,3.556608
3,1276123110,2023-05-09 10:59:03.823,5.0,"Все отлично, расположение, чисто, просторно, н...",0.422515,0.177821,0.177821,3.857422
4,1276123110,2023-11-09 16:21:06.131,5.0,"хороший пункт, приятная работница, заказы полу...",0.777310,0.022987,0.065615,4.670351
...,...,...,...,...,...,...,...,...
12216,245536924647,2022-05-03 05:31:04.866,5.0,Быстрое обслуживание,0.228166,0.644235,0.012831,4.214263
12217,245536924647,2022-12-18 13:21:45.710,5.0,Всё по делу,0.005921,0.855861,0.007131,3.982192
12218,245536924647,2021-10-05 16:58:36.684,5.0,Высокий уровень обслуживания,0.058356,0.771854,0.026769,3.974384
12219,245536924647,2022-05-11 15:30:47.822,5.0,Спасибо Оzon,0.000010,0.000677,0.000010,3.971293


In [21]:
from sklearn.metrics import mean_absolute_error

In [22]:
# Mean absolute error between the user-given stars and the sentiment-derived mark.
mean_absolute_error(estimated['stars'], estimated['mark'])

1.1746076504547496

In [23]:
# Save the scored reviews to the current working directory. The DataFrame index is
# written as well (to_csv default), so it becomes the first, unnamed column of the file.
estimated.to_csv('dostoevsky.csv', encoding='utf-8')