# Обучение CatBoostClassifier для анализа тональности

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from catboost import CatBoostClassifier
import re

In [6]:
data = pd.read_pickle('sentiment_texts.pickle')
data.head(2)

Unnamed: 0,MessageID,ChannelID,issuerid,SentimentScore,DateAdded,DatePosted,MessageText,IsForward
0,241407,1203560567,153,2,2023-05-12 19:03:20,2023-05-12 19:02:42,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...,False
1,33684,1136626166,230,4,2023-02-03 20:56:29,2023-02-03 16:46:34,Ozon продолжает развивать специализированные ф...,False


Чистим тест

In [4]:
def clear_text(df):
    remove_question = lambda text: re.sub(r'\?{2,}', '', text)
    df['MessageTextClean'] = df['MessageText'].apply(remove_question)
    remove_u200b = lambda text: re.sub(r'\u200b', '', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_u200b)
    remove_backslash = lambda text: re.sub(r'\\[^ ]*', '', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_backslash)
    remove_at = lambda text: re.sub(r'\@\S{2,}', '', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_at)
    remove_site = lambda text: re.sub(r'http\S{2,}', '', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_site)
    remove_space = lambda text: re.sub(r'\s{2,}', ' ', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_space)
    remove_quotation = lambda text: re.sub(r'\"{2,}', '"', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_quotation)
    return df

In [7]:
%%time
data = clear_text(data)

CPU times: user 556 ms, sys: 22.4 ms, total: 578 ms
Wall time: 578 ms


In [8]:
data = data[['issuerid', 'SentimentScore', 'MessageTextClean']]
data.head()

Unnamed: 0,issuerid,SentimentScore,MessageTextClean
0,153,2,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 202...
1,230,4,Ozon продолжает развивать специализированные ф...
2,118,4,Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ +5...
3,220,5,Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ +5...
4,89,2,Windfall Tax — налог на сверхприбыль. Какие ко...


Убираем нулевой класс

In [9]:
data.loc[data['SentimentScore']==0, 'SentimentScore'] = 1

In [10]:
data

Unnamed: 0,issuerid,SentimentScore,MessageTextClean
0,153,2,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 202...
1,230,4,Ozon продолжает развивать специализированные ф...
2,118,4,Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ +5...
3,220,5,Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ +5...
4,89,2,Windfall Tax — налог на сверхприбыль. Какие ко...
...,...,...,...
9284,157,4,#FLOT #Дивиденды 💰 7% — возможная дивдоходност...
9285,157,4,🇷🇺#FLOT #отчетность ЧИСТАЯ ПРИБЫЛЬ СОВКОМФЛОТА...
9286,225,3,Ключевой принцип создания портфеля 🔹Диверсифик...
9287,127,3,"""💥🇷🇺#PLZL #листинг #торги ""Полюс"" ведет диалог..."


In [5]:
names = pd.read_csv('company_names2.csv')
names.head(2)

Unnamed: 0,issuerid,l_syns
0,1,"['Держава', 'DERZP', 'DERZ', 'DERZHAVA', '""Акц..."
1,2,"['CBOM RX', '""МОСКОВСКИЙ КРЕДИТНЫЙ БАНК"" (публ..."


Объединяем два датасета по индексу компании

In [11]:
df_names = pd.merge(data, names, on="issuerid", how="left")

In [12]:
del df_names['issuerid']

In [13]:
df_names.dropna(inplace=True)

Смайлы и специальные символы встречаются слитно с другим словами, но catboost делит на токены по пробелу, отделим символы. Заменим каждый специальный символ на этот же символ, но с пробелом слева и справа. Удалим лишние пробелы

In [None]:
%%time
import re

def f(x):
    x = re.sub(r'([^a-zA-Zа-яА-яёЁ0-9 ])', r' \1 ', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x

df_names['MessageTextClean'] = df_names['MessageTextClean'].apply(lambda x: f(x))
df_names['l_syns'] = df_names['l_syns'].apply(lambda x: f(x))

CPU times: user 2.14 s, sys: 29.6 ms, total: 2.17 s
Wall time: 2.24 s


In [None]:
df_names.loc[df_names['SentimentScore']==0, 'SentimentScore'] = 1
data4 = df_names.loc[df_names['SentimentScore']==4].sample(1500)
data3 = df_names.loc[df_names['SentimentScore']==3].sample(1500)
otherdata = df_names.loc[(df_names['SentimentScore']<3)|(df_names['SentimentScore']==5)]
newdata = pd.concat([otherdata, data3, data4], axis=0)

In [None]:
df_names = newdata

Делим в соотношении 70 на 30

In [None]:
!pip install imbalanced-learn



In [None]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=77)

In [None]:
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
X_resampled.shape

(9669, 2)

In [None]:
y_train.shape

(6500,)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_names.drop(['SentimentScore'], axis=1),
                                                    df_names['SentimentScore'],
                                                    random_state=77,
                                                    stratify=df_names['SentimentScore'],
                                                    test_size=0.3)

Две фичи:
 - текст с разделёнными спец. символами + смайлами
 - текст с синонимами компании, перечисленных через запятую

Рекомендации или если бы хватило времени:
- леммаизация (стеминг) слов
- удаление стоп-слов
- удаление пунктуационных символов

In [16]:
model = CatBoostClassifier(iterations=20000,
                           text_features=['MessageTextClean', 'l_syns'],
                           random_state=77,
                           task_type="GPU",
                           metric_period = 500)
model.fit(X_train, y_train)

Learning rate set to 0.007841
0:	learn: 1.5999259	total: 35ms	remaining: 11m 40s
500:	learn: 0.9397094	total: 5.71s	remaining: 3m 42s
1000:	learn: 0.8852653	total: 13.5s	remaining: 4m 17s
1500:	learn: 0.8481666	total: 18.8s	remaining: 3m 51s
2000:	learn: 0.8167930	total: 26.3s	remaining: 3m 56s
2500:	learn: 0.7899795	total: 31.5s	remaining: 3m 40s
3000:	learn: 0.7658090	total: 39s	remaining: 3m 41s
3500:	learn: 0.7436817	total: 44.3s	remaining: 3m 28s
4000:	learn: 0.7228363	total: 51.8s	remaining: 3m 27s
4500:	learn: 0.7031062	total: 57s	remaining: 3m 16s
5000:	learn: 0.6852295	total: 1m 4s	remaining: 3m 13s
5500:	learn: 0.6679583	total: 1m 9s	remaining: 3m 3s
6000:	learn: 0.6517837	total: 1m 17s	remaining: 3m 1s
6500:	learn: 0.6366248	total: 1m 22s	remaining: 2m 51s
7000:	learn: 0.6226823	total: 1m 30s	remaining: 2m 47s
7500:	learn: 0.6090948	total: 1m 35s	remaining: 2m 39s
8000:	learn: 0.5962266	total: 1m 43s	remaining: 2m 35s
8500:	learn: 0.5835158	total: 1m 49s	remaining: 2m 27s
90

<catboost.core.CatBoostClassifier at 0x7d9f24845900>

In [17]:
predict = model.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(predict, y_test, labels=[1, 2, 3, 4, 5]))

              precision    recall  f1-score   support

           1       0.50      0.61      0.55        56
           2       0.33      0.55      0.41       166
           3       0.63      0.71      0.67       941
           4       0.78      0.63      0.70      1422
           5       0.50      0.56      0.53       201

    accuracy                           0.65      2786
   macro avg       0.55      0.61      0.57      2786
weighted avg       0.68      0.65      0.65      2786



In [18]:
accuracy_score(predict, y_test)

0.6468054558506819

In [19]:
f1_score(predict, y_test, average='weighted')

0.6544384047793291

Для инференса обучим на всём датасете

In [None]:
model = CatBoostClassifier(iterations=20000,
                           text_features=['MessageTextClean', 'l_syns'],
                           random_state=77,
                           metric_period = 500)
model.fit(df_names.drop(['SentimentScore'], axis=1), df_names['SentimentScore'])

In [None]:
model.save_model('CatBoost_20000')