In [165]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hyperopt import hp, fmin, tpe, Trials
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os
import json
from tqdm import tqdm

# Загрузка данных из CSV
data = pd.read_csv('../inputs/Train.csv')

ohe_cols = ['Gender', 'Ever_Married', 'Graduated']

# Удаление столбца 'ID'
mapa = {'Male': 1,
        'Female': 0,
        'Yes':1,
        'No': 0}
data.drop('ID', axis=1, inplace=True)
data['Ever_Married'] = data['Ever_Married'].fillna('No')
data['Ever_Married'] = data['Ever_Married'].map(mapa)
data['Graduated'] = data['Graduated'].fillna('No')
data['Graduated'] = data['Graduated'].map(mapa)
data['Gender'] = data['Gender'].map(mapa)
data['Work_Experience'] = data['Work_Experience'].fillna(0.0)
data = data.dropna()
#dummies = pd.get_dummies(data[ohe_cols])
#data = data.drop(ohe_cols, axis=1)
#data = pd.concat([dummies, data], axis=1)

# Разделение на признаки и целевую переменную
X = data.drop('Segmentation', axis=1)
y = data['Segmentation']

# Замена пропущенных значений в данных
X = X.dropna()

# Создание словаря для преобразования букв в цифры
target_dict = {label: index for index, label in enumerate(y.unique())}
inverse_target_dict = {index: label for label, index in target_dict.items()}

# Преобразование целевой переменной из букв в цифры
y_encoded = y.map(target_dict)

# Разделение на обучающую и тестовую выборки
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Определение пространства поиска для оптимизации гиперпараметров
space = {
    'iterations': hp.quniform("iterations", 100, 1000, 20),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'depth': hp.quniform("depth", 3, 15, 1),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0.0, 2.0),
}

# Функция для оптимизации гиперпараметров
def objective(params):
    clf = CatBoostClassifier(
        eval_metric='Accuracy',
        loss_function='MultiClass',
        **params
    )

    clf.fit(X_train, y_train, cat_features=['Profession', 'Spending_Score', 'Var_1'],
            verbose=False)
    y_pred = clf.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return 1-accuracy  # Минимизация функции, поэтому минус перед accuracy

# Поиск оптимальных гиперпараметров с помощью hyperopt
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
            show_progressbar=True,
            verbose=True)

# Обучение модели с оптимальными гиперпараметрами
best_model = CatBoostClassifier(
    eval_metric='Accuracy',
    loss_function='MultiClass',
    **best
)
best_model.fit(X_train, y_train, cat_features=['Profession', 'Spending_Score', 'Var_1'],
               verbose=True)

# Создание папок для модели и логов
models_dir = '../models'
logs_dir = '../logs'
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

# Сохранение обратного словаря в формате JSON
with open(os.path.join(models_dir, 'target_dict.json'), 'w') as f:
    json.dump(inverse_target_dict, f)

# Логирование CatBoost в файл
best_model.fit(Pool(X_train, y_train, cat_features=['Profession', 'Spending_Score', 'Var_1']),
               verbose=0, plot=True)

# Сохранение лучшей модели
best_model.save_model(os.path.join(models_dir, 'best_model.cbm'))


# Прогнозирование на новых данных
new_data = pd.DataFrame({
    'Gender': ['Female'],
    'Ever_Married': ['Yes'],
    'Age': [32],
    'Graduated': ['Yes'],
    'Profession': ['Engineer'],
    'Work_Experience': [5.0],
    'Spending_Score': ['High'],
    'Family_Size': [4.0],
    'Var_1': ['Cat_6']
})

# Преобразование категориальных признаков в новых данных
new_data_encoded = pd.get_dummies(new_data, columns=['Gender', 'Ever_Married', 'Graduated'])
new_data_encoded = new_data_encoded.reindex(columns=X.columns, fill_value=0)

prediction = best_model.predict(new_data_encoded)
predicted_labels = [inverse_target_dict[pred] for pred in prediction[0]]
print(predicted_labels)

  0%|                                                                           | 0/50 [00:04<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [112]:
tmp = best
tmp['iterations'] = 3000
best_model = CatBoostClassifier(**tmp,
                                eval_metric='Accuracy',
                                loss_function='MultiClass',
                                random_seed=42)

best_model.fit(X_train, y_train, cat_features=['Profession', 'Spending_Score', 'Var_1'],
               use_best_model=True,
               eval_set = (X_val, y_val),
               plot=True,
               verbose=0)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1b382207610>

In [115]:
from sklearn.metrics import f1_score

y_pred = best_model.predict(X_val)
accuracy_score(y_val, y_pred)
f1_score(y_val, y_pred, average='macro')
best_model.pl

0.544923998730029

In [160]:
pars = best_model.get_params()
best_model.predict_proba(X_val)
pars['iterations'] = 1000
pars

{'iterations': 1000,
 'learning_rate': 0.029253819407564413,
 'depth': 3.0,
 'l2_leaf_reg': 2.6384357636120477,
 'loss_function': 'MultiClass',
 'random_seed': 42,
 'eval_metric': 'Accuracy'}

In [206]:
class murka():
    def __init__(self, num_class, params):
        self.models = []
        for i in range(num_class):
            m = CatBoostClassifier(**params)
            self.models.append(m)
        self.num_class = num_class
    
    def fit(self, X_train, y_train, cat_features):
        self.new_y = pd.get_dummies(y_train).values.T
        print(self.new_y)
        for i in tqdm(range(self.num_class)):
            self.models[i].fit(X_train, self.new_y[i], cat_features=cat_features, verbose = 0)
        
    def predict_proba(self, X_test):
        y_pred = []
        for i in range(self.num_class):
            check = self.models[i].predict_proba(X_test)
            check = np.array([p[1] for p in check])
            y_pred.append(check)
        y_pred = np.array(y_pred).T
        return y_pred
    
    def predict(self, X_test):
        y_pred = self.predict_proba(X_test)
        y_pred = np.array([np.argmax(i) for i in y_pred])
        return y_pred

In [207]:
model = murka(4, pars)
model.fit(X_train, y_train, cat_features=['Profession', 'Spending_Score', 'Var_1'])
y_proba = model.predict_proba(X_val)

[[0 0 1 ... 0 0 1]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [1 1 0 ... 0 0 0]]


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:17<00:00, 19.26s/it]


In [208]:
pd.get_dummies(y_train).values.sum(axis=1)
y_proba

array([[0.98429653, 0.02061488, 0.02604166, 0.01992708],
       [0.07850319, 0.15837406, 0.39317124, 0.24844031],
       [0.02667441, 0.125049  , 0.36463621, 0.58525259],
       ...,
       [0.0457836 , 0.20954014, 0.28272703, 0.2946614 ],
       [0.00712359, 0.05303194, 0.23441482, 0.77843487],
       [0.03850156, 0.24680445, 0.37997856, 0.33565664]])

In [209]:
y_pred_2 = np.array([np.argmax(i) for i in y_proba])
accuracy_score(y_val, y_pred_2)

0.5525446133509584

In [111]:
best

{'depth': 3.0,
 'iterations': 980.0,
 'l2_leaf_reg': 2.6384357636120477,
 'learning_rate': 0.029253819407564413}

In [84]:
y_pred

array([[0],
       [2],
       [3],
       ...,
       [2],
       [3],
       [2]], dtype=int64)

In [85]:
predicted_labels = [inverse_target_dict[pred[0]] for pred in y_pred]
print(predicted_labels)

['D', 'B', 'C', 'B', 'A', 'B', 'A', 'D', 'A', 'C', 'C', 'C', 'D', 'D', 'A', 'A', 'B', 'D', 'D', 'A', 'B', 'B', 'B', 'A', 'A', 'C', 'D', 'D', 'B', 'D', 'D', 'D', 'A', 'C', 'A', 'B', 'D', 'A', 'A', 'C', 'B', 'D', 'A', 'D', 'B', 'D', 'C', 'D', 'C', 'A', 'A', 'D', 'A', 'D', 'D', 'C', 'D', 'D', 'B', 'D', 'D', 'A', 'A', 'C', 'C', 'D', 'D', 'A', 'A', 'D', 'A', 'D', 'A', 'A', 'C', 'D', 'B', 'C', 'B', 'A', 'A', 'A', 'A', 'C', 'B', 'C', 'D', 'A', 'A', 'C', 'C', 'D', 'A', 'B', 'D', 'A', 'D', 'B', 'A', 'A', 'C', 'A', 'C', 'B', 'A', 'B', 'C', 'A', 'C', 'D', 'B', 'B', 'B', 'C', 'D', 'D', 'A', 'D', 'B', 'B', 'A', 'B', 'D', 'A', 'D', 'D', 'A', 'B', 'C', 'B', 'C', 'C', 'D', 'A', 'D', 'D', 'C', 'C', 'B', 'D', 'D', 'B', 'C', 'A', 'D', 'D', 'A', 'B', 'D', 'A', 'C', 'D', 'A', 'C', 'C', 'C', 'D', 'B', 'D', 'B', 'A', 'D', 'A', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'D', 'A', 'C', 'C', 'D', 'B', 'C', 'B', 'D', 'D', 'D', 'B', 'A', 'D', 'C', 'B', 'C', 'D', 'C', 'B', 'D', 'D', 'B', 'B', 'A', 'C', 'D', 'D', 'A',

In [121]:
# Загрузка данных из CSV
test = pd.read_csv('../inputs/Test.csv')

ohe_cols = ['Gender', 'Ever_Married', 'Graduated']

# Удаление столбца 'ID'
test.drop('ID', axis=1, inplace=True)
test['Ever_Married'] = test['Ever_Married'].fillna('No')
test['Ever_Married'] = test['Ever_Married'].map(mapa)
test['Graduated'] = test['Graduated'].fillna('No')
test['Graduated'] = test['Graduated'].map(mapa)
test['Gender'] = test['Gender'].map(mapa)
test['Work_Experience'] = test['Work_Experience'].fillna(0.0)

test = test.dropna()

In [123]:
test

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,0,1,36,1,Engineer,0.0,Low,1.0,Cat_6,B
1,1,1,37,1,Healthcare,8.0,Average,4.0,Cat_6,A
3,1,1,59,0,Executive,11.0,High,2.0,Cat_6,B
4,0,0,19,0,Marketing,0.0,Low,4.0,Cat_6,A
5,1,1,47,1,Doctor,0.0,High,5.0,Cat_4,C
...,...,...,...,...,...,...,...,...,...,...
2622,1,0,29,0,Healthcare,9.0,Low,4.0,Cat_6,B
2623,0,0,35,1,Doctor,1.0,Low,1.0,Cat_6,A
2624,0,0,53,1,Entertainment,0.0,Low,2.0,Cat_6,C
2625,1,1,47,1,Executive,1.0,High,5.0,Cat_4,C


In [125]:
X_test = test.drop('Segmentation', axis=1)
y_test = test['Segmentation']
y_test = [target_dict[i] for i in y_test]
test_set = Pool(X_test, y_test, cat_features=['Profession', 'Spending_Score', 'Var_1'])
y_pred = best_model.predict(test_set)

accuracy_score(y_test, y_pred)

0.33659730722154224

In [49]:
for i in range(len(y_test)):
    print(y_test[i], y_pred[i][0])

2 0
1 0
2 0
1 0
3 0
0 0
0 0
2 0
2 0
0 0
0 0
1 0
0 0
0 0
3 0
0 0
2 0
2 0
3 0
1 0
1 0
2 0
2 0
2 0
1 0
2 0
1 0
3 0
3 0
2 0
0 0
3 0
1 0
2 0
3 0
2 0
1 0
0 0
1 0
3 0
1 0
3 0
2 0
0 0
0 0
2 0
2 0
3 0
1 0
2 0
0 0
3 0
1 0
0 0
2 0
0 0
3 0
3 0
1 0
1 0
1 0
0 0
3 0
1 0
2 0
2 0
0 0
3 0
3 0
0 0
2 0
0 0
2 0
1 0
3 0
0 0
1 0
0 0
0 0
0 0
0 0
1 0
1 0
0 0
0 0
2 0
3 0
3 0
1 0
2 0
3 0
2 0
2 0
3 0
2 0
1 0
0 0
0 0
1 0
3 0
0 0
1 0
2 0
2 0
2 0
2 0
2 0
1 0
0 0
0 0
3 0
2 0
1 0
1 0
0 0
0 0
0 0
1 0
3 0
2 0
0 0
1 0
2 0
2 0
2 0
1 0
2 0
1 0
2 0
1 0
3 0
2 0
3 0
0 0
1 0
1 0
2 0
0 0
3 0
1 0
1 0
2 0
3 0
1 0
0 0
1 0
1 0
2 0
1 0
1 0
2 0
2 0
1 0
3 0
3 0
3 0
3 0
1 0
2 0
3 0
0 0
1 0
2 0
2 0
1 0
0 0
0 0
3 0
0 0
2 0
3 0
0 0
2 0
1 0
0 0
1 0
2 0
3 0
1 0
1 0
3 0
3 0
1 0
1 0
3 0
0 0
0 0
0 0
1 0
1 0
0 0
0 0
0 0
3 0
2 0
2 0
0 0
3 0
0 0
2 0
3 0
1 0
1 0
1 0
3 0
1 0
2 0
1 0
0 0
1 0
1 0
0 0
0 0
0 0
0 0
3 0
0 0
2 0
1 0
0 0
0 0
0 0
3 0
2 0
3 0
2 0
3 0
3 0
1 0
0 0
0 0
1 0
2 0
1 0
0 0
2 0
2 0
3 0
1 0
1 0
1 0
1 0
3 0
0 0
0 0
0 0
2 0
3 0
0 0
3 0


In [128]:
for i, j in zip(y_test, y_pred):
    print(i, j[0])

2 1
1 3
2 3
1 0
3 2
0 3
0 3
2 3
2 0
0 0
0 0
1 3
0 0
0 0
3 2
0 1
2 3
2 2
3 1
1 2
1 1
2 2
2 0
2 0
1 1
2 0
1 1
3 1
3 1
2 1
0 3
3 2
1 2
2 3
3 3
2 3
1 0
0 3
1 0
3 3
1 3
3 2
2 0
0 3
0 0
2 2
2 3
3 0
1 1
2 3
0 3
3 3
1 1
0 1
2 1
0 3
3 0
3 3
1 3
1 3
1 1
0 1
3 1
1 3
2 0
2 3
0 0
3 3
3 3
0 0
2 2
0 3
2 1
1 3
3 3
0 3
1 0
0 2
0 1
0 1
0 2
1 3
1 0
0 1
0 0
2 2
3 3
3 1
1 2
2 1
3 3
2 2
2 3
3 1
2 1
1 0
0 1
0 0
1 0
3 0
0 3
1 1
2 3
2 2
2 3
2 3
2 1
1 0
0 1
0 1
3 2
2 2
1 1
1 1
0 3
0 1
0 3
1 3
3 2
2 1
0 0
1 3
2 2
2 3
2 1
1 1
2 3
1 3
2 3
1 0
3 2
2 3
3 2
0 0
1 1
1 0
2 3
0 0
3 1
1 2
1 3
2 3
3 2
1 2
0 3
1 0
1 2
2 1
1 1
1 3
2 3
2 0
1 3
3 1
3 1
3 0
3 3
1 1
2 1
3 1
0 3
1 1
2 2
2 0
1 3
0 1
0 3
3 3
0 0
2 1
3 3
0 0
2 1
1 0
0 0
1 2
2 3
3 3
1 2
1 1
3 2
3 3
1 1
1 2
3 3
0 0
0 2
0 3
1 1
1 2
0 2
0 0
0 0
3 0
2 2
2 1
0 0
3 0
0 0
2 0
3 3
1 1
1 1
1 2
3 1
1 0
2 1
1 2
0 3
1 3
1 2
0 1
0 2
0 3
0 0
3 0
0 0
2 2
1 2
0 1
0 0
0 1
3 1
2 3
3 3
2 3
3 3
3 2
1 2
0 0
0 0
1 0
2 1
1 1
0 1
2 2
2 1
3 3
1 2
1 2
1 1
1 1
3 2
0 1
0 0
0 0
2 0
3 2
0 0
3 2


In [210]:
X_train

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
5325,0,0,27,0,Healthcare,0.0,Low,5.0,Cat_6
2422,0,0,53,1,Artist,1.0,Low,1.0,Cat_6
796,1,1,25,0,Entertainment,3.0,Average,2.0,Cat_3
6568,0,0,28,1,Engineer,0.0,Low,3.0,Cat_6
4018,0,1,81,1,Lawyer,1.0,Low,1.0,Cat_6
...,...,...,...,...,...,...,...,...,...
5529,0,1,30,0,Engineer,0.0,Average,2.0,Cat_6
5567,1,1,72,1,Executive,8.0,High,2.0,Cat_6
5743,1,1,72,1,Lawyer,1.0,High,2.0,Cat_6
922,1,1,41,0,Artist,1.0,Average,6.0,Cat_6
