# Инференс Target sentiment analysis Catboost

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from catboost import CatBoostClassifier
import re

Считываем данные

In [21]:
data = pd.read_pickle('sentiment_texts.pickle')
data.head(2)

Unnamed: 0,MessageID,ChannelID,issuerid,SentimentScore,DateAdded,DatePosted,MessageText,IsForward
0,241407,1203560567,153,2,2023-05-12 19:03:20,2023-05-12 19:02:42,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...,False
1,33684,1136626166,230,4,2023-02-03 20:56:29,2023-02-03 16:46:34,Ozon продолжает развивать специализированные ф...,False


Чистим тест. Убираем явный мусор

In [22]:
def clear_text(df):
    remove_question = lambda text: re.sub(r'\?{2,}', '', text)
    df['MessageTextClean'] = df['MessageText'].apply(remove_question)
    remove_u200b = lambda text: re.sub(r'\u200b', '', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_u200b)
    remove_backslash = lambda text: re.sub(r'\\[^ ]*', '', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_backslash)
    remove_at = lambda text: re.sub(r'\@\S{2,}', '', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_at)
    remove_site = lambda text: re.sub(r'http\S{2,}', '', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_site)
    remove_space = lambda text: re.sub(r'\s{2,}', ' ', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_space)
    remove_quotation = lambda text: re.sub(r'\"{2,}', '"', text)
    df['MessageTextClean'] = df['MessageTextClean'].apply(remove_quotation)
    return df

In [23]:
%%time
data = clear_text(data)

CPU times: total: 266 ms
Wall time: 358 ms


Будем работать со следующими столбцами

In [24]:
data = data[['issuerid', 'SentimentScore', 'MessageTextClean']]
data.head()

Unnamed: 0,issuerid,SentimentScore,MessageTextClean
0,153,2,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 202...
1,230,4,Ozon продолжает развивать специализированные ф...
2,118,4,Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ +5...
3,220,5,Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ +5...
4,89,2,Windfall Tax — налог на сверхприбыль. Какие ко...


Нулейвой класс заменим на первый

In [4]:
data.loc[data['SentimentScore']==0, 'SentimentScore'] = 1

Открываем предобработанный датасет с синонимами названий компаний

In [5]:
names = pd.read_pickle('company_names2.pickle')
names.head(2)

Unnamed: 0,issuerid,l_syns
0,1,"[Держава, DERZP, DERZ, DERZHAVA, ""Акционерный ..."
1,2,"[CBOM RX, ""МОСКОВСКИЙ КРЕДИТНЫЙ БАНК"" (публичн..."


Преобразуем список в строку с названиями через запятую

In [6]:
names['l_syns'] = names['l_syns'].apply(lambda x: ', '.join(x))
names.head()

Unnamed: 0,issuerid,l_syns
0,1,"Держава, DERZP, DERZ, DERZHAVA, ""Акционерный к..."
1,2,"CBOM RX, ""МОСКОВСКИЙ КРЕДИТНЫЙ БАНК"" (публично..."
2,3,"РДБанк, roads Bank, Российский акционерный ком..."
3,4,"ALRS RX, alrosa, Акционерная компания ""АЛРОСА""..."
4,5,"AVANGARD, Авангард, AVAN, Акционерный Коммерче..."


Объединяем два датасета по индексу компании

In [7]:
df_names = pd.merge(data, names, on="issuerid", how="left")

Индекс использовать не будем

In [8]:
del df_names['issuerid']

In [9]:
df_names.dropna(inplace=True)

Смайлы и специальные символы встречаются слитно с другим словами, но catboost делит на токены по пробелу, отделим символы. Заменим каждый специальный символ на этот же символ, но с пробелом слева и справа. Удалим лишние пробелы

In [10]:
%%time
import re

def f(x):
    x = re.sub(r'([^a-zA-Zа-яА-яёЁ0-9 ])', r' \1 ', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x

df_names['MessageTextClean'] = df_names['MessageTextClean'].apply(lambda x: f(x))
df_names['l_syns'] = df_names['l_syns'].apply(lambda x: f(x))

CPU times: total: 688 ms
Wall time: 743 ms


Делим в соотношении 70 на 30

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df_names.drop(['SentimentScore'], axis=1),
                                                    df_names['SentimentScore'],
                                                    random_state=77,
                                                    stratify=df_names['SentimentScore'],
                                                    test_size=0.3)

Две фичи:
 - текст с разделёнными спец. символами + смайлами
 - текст с синонимами компании, перечисленных через запятую

Рекомендации или если бы хватило времени:
- леммаизация (стеминг) слов
- удаление стоп-слов
- удаление пунктуационных символов

In [29]:
model = CatBoostClassifier(iterations=2000,
                           text_features=['MessageTextClean', 'l_syns'],
                           random_state=77)
model.fit(X_train, y_train)

Learning rate set to 0.048463
0:	learn: 1.5568027	total: 171ms	remaining: 5m 42s
1:	learn: 1.5116678	total: 429ms	remaining: 7m 8s
2:	learn: 1.4693143	total: 681ms	remaining: 7m 33s
3:	learn: 1.4372895	total: 932ms	remaining: 7m 44s
4:	learn: 1.4063299	total: 1.18s	remaining: 7m 51s
5:	learn: 1.3780994	total: 1.44s	remaining: 7m 57s
6:	learn: 1.3518412	total: 1.67s	remaining: 7m 56s
7:	learn: 1.3300516	total: 1.91s	remaining: 7m 54s
8:	learn: 1.3101713	total: 2.15s	remaining: 7m 55s
9:	learn: 1.2874955	total: 2.39s	remaining: 7m 55s
10:	learn: 1.2674329	total: 2.62s	remaining: 7m 54s
11:	learn: 1.2468473	total: 2.87s	remaining: 7m 55s
12:	learn: 1.2297794	total: 3.12s	remaining: 7m 57s
13:	learn: 1.2170902	total: 3.34s	remaining: 7m 54s
14:	learn: 1.2015143	total: 3.58s	remaining: 7m 54s
15:	learn: 1.1882318	total: 3.81s	remaining: 7m 53s
16:	learn: 1.1748458	total: 4.05s	remaining: 7m 52s
17:	learn: 1.1618699	total: 4.28s	remaining: 7m 51s
18:	learn: 1.1524285	total: 4.52s	remaining: 

<catboost.core.CatBoostClassifier at 0x1f372916f50>

In [30]:
predict = model.predict(X_test)
accuracy_score(predict, y_test)

0.6503948312993539

In [31]:
f1_score(predict, y_test, average='weighted')

0.6599951563625893

In [32]:
f1_score(predict, y_test, average='macro')

0.5515195537554225

Для инференса обучим на всём датасете

In [13]:
model = CatBoostClassifier(iterations=2000,
                           text_features=['MessageTextClean', 'l_syns'],
                           random_state=77)
model.fit(df_names.drop(['SentimentScore'], axis=1), df_names['SentimentScore'])

Learning rate set to 0.049353
0:	learn: 1.5578598	total: 193ms	remaining: 6m 25s
1:	learn: 1.5113189	total: 464ms	remaining: 7m 43s
2:	learn: 1.4713213	total: 736ms	remaining: 8m 9s
3:	learn: 1.4323855	total: 1s	remaining: 8m 19s
4:	learn: 1.4003995	total: 1.26s	remaining: 8m 21s
5:	learn: 1.3694337	total: 1.52s	remaining: 8m 23s
6:	learn: 1.3419164	total: 1.76s	remaining: 8m 22s
7:	learn: 1.3150299	total: 2.02s	remaining: 8m 22s
8:	learn: 1.2924827	total: 2.28s	remaining: 8m 24s
9:	learn: 1.2718785	total: 2.54s	remaining: 8m 26s
10:	learn: 1.2517169	total: 2.83s	remaining: 8m 32s
11:	learn: 1.2341977	total: 3.08s	remaining: 8m 31s
12:	learn: 1.2181197	total: 3.32s	remaining: 8m 27s
13:	learn: 1.2025321	total: 3.59s	remaining: 8m 28s
14:	learn: 1.1895764	total: 3.84s	remaining: 8m 28s
15:	learn: 1.1770162	total: 4.13s	remaining: 8m 31s
16:	learn: 1.1657274	total: 4.39s	remaining: 8m 32s
17:	learn: 1.1548166	total: 4.65s	remaining: 8m 31s
18:	learn: 1.1453954	total: 4.89s	remaining: 8m 

<catboost.core.CatBoostClassifier at 0x1f35ea316f0>

In [15]:
model.save_model('TSA_model2')