In [2]:
import pandas as pd
from catboost import Pool, CatBoostRegressor
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))


In [4]:
train_data = pd.read_csv("../datasets/train_with_embeddings.csv")
test_data = pd.read_csv("../datasets/test_with_embeddings.csv")

In [5]:
train_data[['text_embedding', 'ratings']].head()

Unnamed: 0,text_embedding,ratings
0,"[0.8779181241989136, 0.3795791566371918, -0.15...",9
1,"[0.7185903787612915, 0.16849711537361145, -0.4...",8
2,"[1.2952933311462402, -0.03307562321424484, -0....",10
3,"[0.7549100518226624, 1.3669123649597168, -0.31...",7
4,"[0.5853791236877441, 0.4199213683605194, -1.60...",8


In [6]:
test_data[['text_embedding', 'ratings']].head()

Unnamed: 0,text_embedding,ratings
0,"[1.3787012100219727, -0.3298346698284149, -0.0...",10
1,"[1.3046091794967651, -0.2520449161529541, -0.5...",7
2,"[0.13798153400421143, 0.4489988684654236, -0.6...",9
3,"[0.4154549241065979, 0.4027397036552429, -1.48...",8
4,"[0.47656285762786865, -0.3982250988483429, -0....",8


In [7]:
import ast
def convert_embedding(embedding_str):
    return np.array(ast.literal_eval(embedding_str))

train_data['text_embedding'] = train_data['text_embedding'].apply(convert_embedding)

In [8]:
test_data['text_embedding'] = test_data['text_embedding'].apply(convert_embedding)

In [9]:
embeddings = np.array(train_data['text_embedding'].tolist())  # Преобразуем столбец в массив (n_samples, n_features)
embeddings.shape

(25000, 1024)

In [10]:
target = train_data['ratings'].astype(int)

In [11]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

select_class = SelectKBest(mutual_info_regression,k=1000)

matrix = select_class.fit_transform(embeddings,target)

matrix.shape

(25000, 1000)

In [12]:
test_embeddings = np.array(test_data['text_embedding'].tolist())  # Преобразуем столбец в массив (n_samples, n_features)
test_embeddings.shape

(25000, 1024)

In [13]:
matrix_test = select_class.transform(test_embeddings)
matrix_test.shape

(25000, 1000)

In [15]:
train_pool = Pool(matrix_test, target)
regressor = CatBoostRegressor(loss_function='RMSE', task_type='GPU', devices='0',learning_rate=.1, l2_leaf_reg=1, depth=6)

In [16]:
regressor.fit(train_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 3.3274525	total: 112ms	remaining: 1m 51s
1:	learn: 3.2088307	total: 167ms	remaining: 1m 23s
2:	learn: 3.1046107	total: 228ms	remaining: 1m 15s
3:	learn: 3.0109574	total: 286ms	remaining: 1m 11s
4:	learn: 2.9308241	total: 344ms	remaining: 1m 8s
5:	learn: 2.8596239	total: 398ms	remaining: 1m 5s
6:	learn: 2.7941602	total: 455ms	remaining: 1m 4s
7:	learn: 2.7403794	total: 509ms	remaining: 1m 3s
8:	learn: 2.6896319	total: 563ms	remaining: 1m 2s
9:	learn: 2.6449755	total: 622ms	remaining: 1m 1s
10:	learn: 2.6043608	total: 676ms	remaining: 1m
11:	learn: 2.5691320	total: 731ms	remaining: 1m
12:	learn: 2.5360438	total: 784ms	remaining: 59.6s
13:	learn: 2.5059851	total: 864ms	remaining: 1m
14:	learn: 2.4798967	total: 918ms	remaining: 1m
15:	learn: 2.4541517	total: 971ms	remaining: 59.7s
16:	learn: 2.4305303	total: 1.03s	remaining: 59.4s
17:	learn: 2.4103114	total: 1.08s	remaining: 59s
18:	learn: 2.3903657	total: 1.13s	remaining: 58.6s
19:	learn: 2.3720596	total: 1.19s	remaining: 58.1s


<catboost.core.CatBoostRegressor at 0x27d160ae4b0>

In [22]:
grid = {'learning_rate': [0.5, 0.7],
        'depth': [6, 8],
        'l2_leaf_reg': [ 3, 5 ]}

grid_search_result = regressor.grid_search(grid, X=matrix_test, y=target, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 4.0227235	test: 4.0208568	best: 4.0208568 (0)	total: 64.7ms	remaining: 1m 4s
1:	learn: 2.9835760	test: 2.9977038	best: 2.9977038 (1)	total: 129ms	remaining: 1m 4s
2:	learn: 2.5880523	test: 2.6244456	best: 2.6244456 (2)	total: 196ms	remaining: 1m 5s
3:	learn: 2.4167757	test: 2.4630262	best: 2.4630262 (3)	total: 262ms	remaining: 1m 5s
4:	learn: 2.3487437	test: 2.4066901	best: 2.4066901 (4)	total: 329ms	remaining: 1m 5s
5:	learn: 2.3046642	test: 2.3682991	best: 2.3682991 (5)	total: 397ms	remaining: 1m 5s
6:	learn: 2.2707852	test: 2.3440861	best: 2.3440861 (6)	total: 455ms	remaining: 1m 4s
7:	learn: 2.2449179	test: 2.3243974	best: 2.3243974 (7)	total: 514ms	remaining: 1m 3s
8:	learn: 2.2240578	test: 2.3114847	best: 2.3114847 (8)	total: 580ms	remaining: 1m 3s
9:	learn: 2.2045768	test: 2.2943458	best: 2.2943458 (9)	total: 644ms	remaining: 1m 3s
10:	learn: 2.1873379	test: 2.2838259	best: 2.2838259 (10)	total: 709ms	remaining: 1m 3s
11:	learn: 2.1738693	test: 2.2748093	best: 2.274809

In [17]:
y_pred = regressor.predict(matrix_test)

In [25]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
mae = mean_absolute_error(y_true=test_data['ratings'], y_pred=y_pred)
print("MAR: ", mae.round(3))
r2 = r2_score(y_true=test_data['ratings'], y_pred=y_pred)
print("R2: ", r2.round(3))
mse = mean_squared_error(y_true=test_data['ratings'], y_pred=y_pred)
print("MSE: ", mse.round(3))

MAR:  1.158
R2:  0.833
MSE:  2.032


In [19]:
regressor.score(matrix_test, test_data['ratings'])

0.8332788892038083

In [27]:
regressor.save_model("../files/model.cbm")