In [1]:
import pandas as pd
from catboost import Pool, CatBoostRegressor
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import torch
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [2]:
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))


In [3]:
# Load the train and test splits; their text_embedding column holds the
# embeddings serialised as strings (parsed in a later cell).
train_data = pd.read_csv(filepath_or_buffer="../datasets/train_with_embeddings.csv")
test_data = pd.read_csv(filepath_or_buffer="../datasets/test_with_embeddings.csv")

In [4]:
# Peek at the first rows of the two columns this notebook works with.
train_data.loc[:, ['text_embedding', 'ratings']].head()

Unnamed: 0,text_embedding,ratings
0,"[0.8779181241989136, 0.3795791566371918, -0.15...",9
1,"[0.7185903787612915, 0.16849711537361145, -0.4...",8
2,"[1.2952933311462402, -0.03307562321424484, -0....",10
3,"[0.7549100518226624, 1.3669123649597168, -0.31...",7
4,"[0.5853791236877441, 0.4199213683605194, -1.60...",8


In [5]:
# Same preview for the test split.
test_data.loc[:, ['text_embedding', 'ratings']].head()

Unnamed: 0,text_embedding,ratings
0,"[1.3787012100219727, -0.3298346698284149, -0.0...",10
1,"[1.3046091794967651, -0.2520449161529541, -0.5...",7
2,"[0.13798153400421143, 0.4489988684654236, -0.6...",9
3,"[0.4154549241065979, 0.4027397036552429, -1.48...",8
4,"[0.47656285762786865, -0.3982250988483429, -0....",8


In [6]:
import ast
def convert_embedding(embedding_str):
    """Turn one serialised embedding into a NumPy array.

    Parameters
    ----------
    embedding_str : str or array-like
        Usually the string form of a Python list of numbers, e.g.
        ``"[0.1, -0.2]"``, as stored in the CSV. Values that are already
        parsed (list, tuple, ``np.ndarray``) are accepted as well, so
        re-running the cell that applies this function to a column does not
        fail on the second pass.

    Returns
    -------
    np.ndarray
        The embedding as an array.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid
        Python literal.
    """
    # Only strings need parsing; ast.literal_eval rejects ndarrays/lists outright.
    if isinstance(embedding_str, str):
        embedding_str = ast.literal_eval(embedding_str)
    return np.array(embedding_str)

# Replace each stringified embedding in the training frame with the array
# returned by convert_embedding. The column is overwritten in place, so the
# original strings are no longer available in train_data afterwards.
train_data['text_embedding'] = train_data['text_embedding'].apply(convert_embedding)

In [7]:
# Same conversion for the test frame: the text_embedding column is overwritten
# in place with the arrays returned by convert_embedding.
test_data['text_embedding'] = test_data['text_embedding'].apply(convert_embedding)

In [8]:
# Stack the per-row embedding arrays into a single 2-D array of shape
# (n_samples, n_features).
embeddings = np.array(list(train_data['text_embedding']))
embeddings.shape

(25000, 1024)

In [9]:
# Regression target: the training ratings cast to integers.
target = train_data.loc[:, 'ratings'].astype(int)

In [2]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

In [1]:

# mutual_info_regression adds random noise to continuous features, so without a
# fixed seed the chosen columns (and the selector pickled later) can differ
# between runs. functools.partial is used rather than a lambda because the
# fitted selector must remain picklable.
from functools import partial

select_class = SelectKBest(partial(mutual_info_regression, random_state=0), k=1000)

# Fit the selector on the training embeddings and keep the 1000 best-scoring columns.
matrix = select_class.fit_transform(embeddings, target)

matrix.shape

NameError: name 'embeddings' is not defined

In [11]:
# Stack the test-set embedding arrays into a 2-D array of shape
# (n_samples, n_features).
test_embeddings = np.array(list(test_data['text_embedding']))
test_embeddings.shape

(25000, 1024)

In [12]:
# Apply the already-fitted selector to the test embeddings so the test matrix
# keeps the same columns as the training matrix; the selector is not refit here.
matrix_test = select_class.transform(test_embeddings)
matrix_test.shape

(25000, 1000)

In [None]:
# Wrap the selected training features and the targets in a CatBoost Pool.
# NOTE(review): none of the visible cells read train_pool afterwards (the grid
# search is given matrix/target directly) — confirm whether it is still needed.
train_pool = Pool(matrix, target)
# Regressor with RMSE loss, configured to train on GPU device '0'.
regressor = CatBoostRegressor(loss_function='RMSE', task_type='GPU', devices='0')

In [15]:
# Search space: fixed learning rate, three tree depths, three L2 leaf penalties.
grid = {
    'learning_rate': [0.1],
    'depth': [10, 12, 14],
    'l2_leaf_reg': [1, 3, 5],
}

# Run CatBoost's built-in grid search on the selected training features and
# plot its progress in the notebook.
grid_search_result = regressor.grid_search(grid, X=matrix, y=target, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [29]:
# Predict ratings for the selected test features with the regressor in its
# current state (presumably as left by the grid-search cell — confirm run order).
y_pred = regressor.predict(matrix_test)

In [33]:
# Display every parameter the regressor reports, including ones this notebook
# never set explicitly.
regressor.get_all_params()

{'nan_mode': 'Min',
 'gpu_ram_part': 0.95,
 'eval_metric': 'RMSE',
 'iterations': 1000,
 'fold_permutation_block': 64,
 'leaf_estimation_method': 'Newton',
 'observations_to_bootstrap': 'TestOnly',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Ordered',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'devices': '0',
 'eval_fraction': 0,
 'pinned_memory_bytes': '104857600',
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 1,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'gpu_cat_features_storage': 'GpuRam',
 'fold_size_loss_normalization': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'meta_l2_frequency': 0,
 'random_seed': 0,
 'depth': 6,
 'has_time': False,
 'fold_len_multiplier': 2,
 'border_count': 128,
 'min_fold_size': 100,
 'data_partition': 'FeatureParallel',
 'bagging_temperatur

In [31]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
# Score the test-set predictions against the true ratings.
# Built-in round() is used instead of the .round() method so the cell works
# whether a metric comes back as a NumPy scalar or as a plain Python float.
mae = mean_absolute_error(y_true=test_data['ratings'], y_pred=y_pred)
print("MAE: ", round(mae, 3))  # label previously misspelled as "MAR"
r2 = r2_score(y_true=test_data['ratings'], y_pred=y_pred)
print("R2: ", round(r2, 3))
mse = mean_squared_error(y_true=test_data['ratings'], y_pred=y_pred)
print("MSE: ", round(mse, 3))

MAR:  1.328
R2:  0.746
MSE:  3.092


In [32]:
# Built-in score of the regressor on the test split.
# NOTE(review): presumably R² — confirm against the CatBoost documentation.
regressor.score(matrix_test, test_data['ratings'])

0.7462251029588032

In [34]:
# Write the trained regressor to disk. No format argument is passed, so
# save_model's default applies (presumably CatBoost's binary .cbm — confirm).
regressor.save_model("../files/regression_model.cbm")

In [10]:
import pickle
    
# Persist the fitted feature selector so the same column selection can be
# applied wherever the saved regressor is used.
filename = "../files/select_class.pkl"
with open(filename, mode='wb') as file:
    file.write(pickle.dumps(select_class))