In [57]:
import pandas as pd
tr = pd.read_csv("crabs_train.csv")
tst = pd.read_csv("crabs_test.csv")
for df in [tr, tst]:
    df.colums = df.columns.str.lower()
    df.drop_duplicates(subset=['length', 'diameter', 'height', 'weight'], keep='first', inplace = True)
    df.rename(columns = 
              {
                  'shucked weight': 'shucked_weight',
                  'viscera weight': 'viscera_weight',
                  'shell weight': 'shell_weight'
              }, inplace = True)
    df['height'] = np.where(df['height']==0, 0.025,df['height'])
    df["volume"] = df["length"] * df["diameter"] * df['height']
    df['approx_density'] = df['weight'] / df['volume']
    scaler = MinMaxScaler()
    df['approx_density'] = scaler.fit_transform(df['approx_density'].to_numpy().reshape(-1, 1))
    df['bmi']= df['weight'] / (df['height'] ** 2)
    df['bmi'] = scaler.fit_transform(df['bmi'].to_numpy().reshape(-1, 1))
    #df['volume'] = scaler.fit_transform(df['volume'].to_numpy().reshape(-1, 1))
    df["water_loss"]=df["weight"] - df["shucked_weight"] - df['viscera_weight'] - df['shell_weight']
    df["water_loss"] = scaler.fit_transform(df['water_loss'].to_numpy().reshape(-1, 1))
y_tr = tr.age.values
tr = tr.drop(columns = ["age"])
tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60330 entries, 0 to 65999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             60330 non-null  object 
 1   length          60330 non-null  float64
 2   diameter        60330 non-null  float64
 3   height          60330 non-null  float64
 4   weight          60330 non-null  float64
 5   shucked_weight  60330 non-null  float64
 6   viscera_weight  60330 non-null  float64
 7   shell_weight    60330 non-null  float64
 8   volume          60330 non-null  float64
 9   approx_density  60330 non-null  float64
 10  bmi             60330 non-null  float64
 11  water_loss      60330 non-null  float64
dtypes: float64(11), object(1)
memory usage: 6.0+ MB



Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



In [54]:
import optuna
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from scipy.spatial.distance import cdist
from utils.utils import *

def objective(trial, X_tr, y_tr, X_val, y_val, euc_distances):
    '''
    function of objective loss(returns objective loss for log regrression) with suggested parameters
    '''
    k = trial.suggest_int('k', 10, 200)
    # создаем и обучаем модель на предложенных и фиксированных гиперпараметрах
    sorted_distances, indices = get_k_neighbors(euc_distances, k = k)
    sorted_distances_scaled = 1 / (sorted_distances + 1e-6)
    y_pred = knn_predict_regression(indices, y_tr, distances = sorted_distances_scaled).round()
    
    # предсказываем на валидации [/ кроссвалидации] и получаем скор - считаем метрику качества
    score = mean_absolute_error(y_pred, y_val)
    
    return score

In [55]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from functools import partial


def start_optimization(
    objective_func, # принимает trial, X_tr, y_tr, X_val, y_val, **other_objective_kwargs
    n_trials,
    n_jobs,
    X_tr, 
    y_tr, 
    X_val, 
    y_val,
    euc_distances,
    study_direction=None,
    sampler=None,
    features=None,
    **other_objective_kwargs
):

    obj_func = partial(objective_func, X_tr=X_tr, 
                       X_val=X_val, 
                       y_tr=y_tr, 
                       y_val=y_val, 
                      euc_distances = euc_distances)
    study = optuna.create_study(sampler=sampler, direction='minimize')
    study.optimize(obj_func, n_trials=n_trials, n_jobs= n_jobs)
    return study

In [58]:
from sklearn.model_selection import train_test_split
numeric_features = tr.select_dtypes(np.number).columns
indeces_I = (tr['sex'] == 'I').to_numpy()
indeces_not_I = (tr['sex'] != 'I').to_numpy()
tr_I = tr[tr['sex'] == 'I']
y_I = y_tr[indeces_I]
X_tr_I, X_val_I, y_tr_I, y_val_I = train_test_split(tr_I[numeric_features].values, y_I, test_size = 0.2)
euc_distances_I = cdist(
        X_val_I,
        X_tr_I,
    )
euc_distances_I.shape

(3793, 15172)

In [59]:
tpe_sampler = optuna.samplers.TPESampler(
        n_startup_trials= 7, # объем разведки. Всегда настраивайте!
        n_ei_candidates=13, # влияет на "точность шага"
)
study = start_optimization(objective, n_trials = 40, n_jobs = 1,tpe_sampler = tpe_sampler, X_tr = X_tr_I, 
                           y_tr = y_tr_I,
                          X_val = X_val_I,
                          y_val = y_val_I, 
                          euc_distances = euc_distances_I)

[I 2023-12-08 18:20:53,951] A new study created in memory with name: no-name-d6f4eb04-d142-4601-a27c-3598700b4c09
[I 2023-12-08 18:20:55,276] Trial 0 finished with value: 0.8821513313999473 and parameters: {'k': 44}. Best is trial 0 with value: 0.8821513313999473.
[I 2023-12-08 18:20:56,202] Trial 1 finished with value: 0.8787239651990509 and parameters: {'k': 45}. Best is trial 1 with value: 0.8787239651990509.
[I 2023-12-08 18:20:56,985] Trial 2 finished with value: 0.8800421829686265 and parameters: {'k': 43}. Best is trial 1 with value: 0.8787239651990509.
[I 2023-12-08 18:20:57,858] Trial 3 finished with value: 0.8607962035328236 and parameters: {'k': 164}. Best is trial 3 with value: 0.8607962035328236.
[I 2023-12-08 18:20:58,655] Trial 4 finished with value: 0.8623780648563143 and parameters: {'k': 120}. Best is trial 3 with value: 0.8607962035328236.
[I 2023-12-08 18:20:59,579] Trial 5 finished with value: 0.881624044292117 and parameters: {'k': 36}. Best is trial 3 with value:

In [60]:
optuna.visualization.plot_slice(study)

Возьмем k = 144 для I

In [62]:
numeric_features = tr.select_dtypes(np.number).columns
indeces_I = (tr['sex'] == 'I').to_numpy()
indeces_not_I = (tr['sex'] != 'I').to_numpy()
tr_not_I = tr[tr['sex'] != 'I']
y_not_I = y_tr[indeces_not_I]
X_tr_not_I, X_val_not_I, y_tr_not_I, y_val_not_I = train_test_split(tr_not_I[numeric_features].values, y_not_I, test_size = 0.2)
euc_distances_not_I = cdist(
        X_val_not_I,
        X_tr_not_I,
    )
euc_distances_I.shape

(3793, 15172)

In [63]:
tpe_sampler = optuna.samplers.TPESampler(
        n_startup_trials= 7, # объем разведки. Всегда настраивайте!
        n_ei_candidates=13, # влияет на "точность шага"
)
study = start_optimization(objective, n_trials = 20, n_jobs = 1,tpe_sampler = tpe_sampler, X_tr = X_tr_F, 
                           y_tr = y_tr_F,
                          X_val = X_val_F,
                          y_val = y_val_F, 
                          euc_distances = euc_distances_F)

[I 2023-12-08 18:21:51,223] A new study created in memory with name: no-name-a584a244-f154-4630-bd6c-b10807433e48
[I 2023-12-08 18:21:58,574] Trial 0 finished with value: 1.753294289897511 and parameters: {'k': 181}. Best is trial 0 with value: 1.753294289897511.
[I 2023-12-08 18:21:59,657] Trial 1 finished with value: 1.7725719863347975 and parameters: {'k': 24}. Best is trial 0 with value: 1.753294289897511.
[I 2023-12-08 18:22:00,734] Trial 2 finished with value: 1.7718399219131284 and parameters: {'k': 26}. Best is trial 0 with value: 1.753294289897511.
[I 2023-12-08 18:22:01,757] Trial 3 finished with value: 1.7523182040019523 and parameters: {'k': 152}. Best is trial 3 with value: 1.7523182040019523.
[I 2023-12-08 18:22:02,695] Trial 4 finished with value: 1.755734504636408 and parameters: {'k': 200}. Best is trial 3 with value: 1.7523182040019523.
[I 2023-12-08 18:22:03,643] Trial 5 finished with value: 1.7564665690580772 and parameters: {'k': 131}. Best is trial 3 with value: 1

In [64]:
optuna.visualization.plot_slice(study)

возьмем k = 1