In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import pickle

In [21]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

In [22]:
name = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

In [23]:
dataset = pd.read_csv(url, names = name)

In [24]:
dataset

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [25]:
X = dataset[['age', 'education-num']]
y = dataset['income']

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, random_state=42)

In [27]:
categorical_features_indices = np.where(X.dtypes != np.float)[0]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X.dtypes != np.float)[0]


In [28]:
from catboost import CatBoostClassifier, Pool, cv
cbc_model = CatBoostClassifier(custom_loss=['Accuracy'], random_seed=42, logging_level='Silent')

In [29]:
cbc_model.fit(X_train, y_train, 
              cat_features=categorical_features_indices, 
              eval_set=(X_validation, y_validation), plot=True);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [30]:
cbc_model.best_score_

{'learn': {'Accuracy': 0.8072072072072072, 'Logloss': 0.40892711198297155},
 'validation': {'Accuracy': 0.7922859599557794,
  'Logloss': 0.43202154793650654}}

In [31]:
cv_params = cbc_model.get_params()
cv_params.update({'loss_function': 'Logloss'})
print(cv_params)

{'random_seed': 42, 'logging_level': 'Silent', 'custom_loss': ['Accuracy'], 'loss_function': 'Logloss'}


In [32]:
cv_data = cv(Pool(X_train, y_train, cat_features=categorical_features_indices), 
    cv_params,
    fold_count=5,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [33]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.79±0.01 on step 215


In [34]:
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.7922601956292631


In [35]:
predictions = cbc_model.predict(X_train)
predictions_probs = cbc_model.predict_proba(X_train)
print(predictions[:10])
print(predictions_probs[:10])

[' <=50K' ' <=50K' ' <=50K' ' >50K' ' >50K' ' <=50K' ' <=50K' ' <=50K'
 ' <=50K' ' <=50K']
[[0.72955395 0.27044605]
 [0.94744397 0.05255603]
 [0.88162971 0.11837029]
 [0.42864895 0.57135105]
 [0.41180465 0.58819535]
 [0.63870771 0.36129229]
 [0.79822972 0.20177028]
 [0.94257133 0.05742867]
 [0.96210083 0.03789917]
 [0.98203672 0.01796328]]


# Save and Load RF model¶

In [37]:
Pkl_Filename = "cbc_model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(cbc_model, file)

In [38]:
with open(Pkl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)
Pickled_LR_Model

<catboost.core.CatBoostClassifier at 0x7fba1d216340>