Дана статистика пользователей adult.csv.
Получите значения AUC для различных моделей и их параметров.
Используйте как минимум 3 различные модели классификации.

In [559]:
from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.neighbors import KNeighborsClassifier as kn
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from itertools import product
import pandas as pd
import numpy as np
import warnings

In [None]:
warnings.filterwarnings("ignore")

In [629]:
data = pd.read_csv("adult.csv")
target = 'income'

In [563]:
data[data.isnull()].count()
# пропусков нет

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [630]:
data2 = data.copy()

# LE
le = LabelEncoder()
for i in data.columns:
    if (len(du:= data[i].unique()) < 200) and (data[i].dtype == 'O'):
        print(f'LE   {i}\n{du}\n')
        data[i] = le.fit_transform(data[i])
    else:
        print(f'pass {i}\n{du}\n')

pass age
[25 38 28 44 18 34 29 63 24 55 65 36 26 58 48 43 20 37 40 72 45 22 23 54
 32 46 56 17 39 52 21 42 33 30 47 41 19 69 50 31 59 49 51 27 57 61 64 79
 73 53 77 80 62 35 68 66 75 60 67 71 70 90 81 74 78 82 83 85 76 84 89 88
 87 86]

LE   workclass
['Private' 'Local-gov' '?' 'Self-emp-not-inc' 'Federal-gov' 'State-gov'
 'Self-emp-inc' 'Without-pay' 'Never-worked']

pass fnlwgt
[226802  89814 336951 ... 129912 255835 257302]

LE   education
['11th' 'HS-grad' 'Assoc-acdm' 'Some-college' '10th' 'Prof-school'
 '7th-8th' 'Bachelors' 'Masters' 'Doctorate' '5th-6th' 'Assoc-voc' '9th'
 '12th' '1st-4th' 'Preschool']

pass educational-num
[ 7  9 12 10  6 15  4 13 14 16  3 11  5  8  2  1]

LE   marital-status
['Never-married' 'Married-civ-spouse' 'Widowed' 'Divorced' 'Separated'
 'Married-spouse-absent' 'Married-AF-spouse']

LE   occupation
['Machine-op-inspct' 'Farming-fishing' 'Protective-serv' '?'
 'Other-service' 'Prof-specialty' 'Craft-repair' 'Adm-clerical'
 'Exec-managerial' 'Tech-suppo

In [631]:
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,0
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,0
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,1
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39,1
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,0
48838,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,1
48839,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,0
48840,22,4,201490,11,9,4,1,3,4,1,0,0,20,39,0


In [520]:
X = data.drop((Y:= data[target]).name, axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [584]:
models = [lr, rf, kn, dt]
results = pd.DataFrame(columns=['estimator', 'auc_score', 'params'])

In [585]:
# LogisticRegression

m = models[0]
print(estimator:= m.__name__)

solver_s = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
max_iter_s = 10**np.arange(3)

for par in product(solver_s, max_iter_s):
    params = {'solver':par[0], 'max_iter':par[1]}
    Y_pred_prob = m(**params).fit(X_train, Y_train).predict_proba(X_test)
    print('roc_auc_score', (auc_score:= roc_auc_score(Y_test, Y_pred_prob[:,1])), params)
    results = results.append({'estimator':estimator, 'auc_score':auc_score, 'params':params}, ignore_index=True)

LogisticRegression
roc_auc_score 0.5046840525510642 {'solver': 'newton-cg', 'max_iter': 1}
roc_auc_score 0.6272034845661056 {'solver': 'newton-cg', 'max_iter': 10}
roc_auc_score 0.8539178130811511 {'solver': 'newton-cg', 'max_iter': 100}
roc_auc_score 0.5046840525510642 {'solver': 'lbfgs', 'max_iter': 1}
roc_auc_score 0.5925905037424035 {'solver': 'lbfgs', 'max_iter': 10}
roc_auc_score 0.7268682511867268 {'solver': 'lbfgs', 'max_iter': 100}
roc_auc_score 0.5046840525510642 {'solver': 'liblinear', 'max_iter': 1}
roc_auc_score 0.6334106606133966 {'solver': 'liblinear', 'max_iter': 10}
roc_auc_score 0.7153305071720629 {'solver': 'liblinear', 'max_iter': 100}
roc_auc_score 0.5480200298942096 {'solver': 'sag', 'max_iter': 1}
roc_auc_score 0.5949316005028626 {'solver': 'sag', 'max_iter': 10}
roc_auc_score 0.6113766565403118 {'solver': 'sag', 'max_iter': 100}
roc_auc_score 0.5425488790525538 {'solver': 'saga', 'max_iter': 1}
roc_auc_score 0.5826103255257249 {'solver': 'saga', 'max_iter': 10}


In [596]:
# RandomForestClassifier

m = models[1]
print(estimator:= m.__name__)

n_estimators = 10**np.arange(3)
max_depth = 10**np.arange(3)
criterion = ['gini', 'entropy']
max_features = ['auto', 'sqrt', 'log2']
class_weight = ['balanced', 'balanced_subsample']

for par in product(n_estimators, max_depth, criterion, max_features, class_weight):
    params = {'n_estimators':par[0], 
              'max_depth':par[1],
              'criterion':par[2],
              'max_features':par[3],
              'class_weight':par[4]}
    Y_pred_prob = m(**params).fit(X_train, Y_train).predict_proba(X_test)
    print('roc_auc_score', (auc_score:= roc_auc_score(Y_test, Y_pred_prob[:,1])), params,'\n')
    results = results.append({'estimator':estimator, 'auc_score':auc_score, 'params':params}, ignore_index=True)

RandomForestClassifier
roc_auc_score 0.705279987274185 {'n_estimators': 1, 'max_depth': 1, 'criterion': 'gini', 'max_features': 'auto', 'class_weight': 'balanced'} 

roc_auc_score 0.6413988880286547 {'n_estimators': 1, 'max_depth': 1, 'criterion': 'gini', 'max_features': 'auto', 'class_weight': 'balanced_subsample'} 

roc_auc_score 0.6413988880286547 {'n_estimators': 1, 'max_depth': 1, 'criterion': 'gini', 'max_features': 'sqrt', 'class_weight': 'balanced'} 

roc_auc_score 0.6413988880286547 {'n_estimators': 1, 'max_depth': 1, 'criterion': 'gini', 'max_features': 'sqrt', 'class_weight': 'balanced_subsample'} 

roc_auc_score 0.727184864118211 {'n_estimators': 1, 'max_depth': 1, 'criterion': 'gini', 'max_features': 'log2', 'class_weight': 'balanced'} 

roc_auc_score 0.727184864118211 {'n_estimators': 1, 'max_depth': 1, 'criterion': 'gini', 'max_features': 'log2', 'class_weight': 'balanced_subsample'} 

roc_auc_score 0.6316563014012415 {'n_estimators': 1, 'max_depth': 1, 'criterion': 'ent

roc_auc_score 0.9108376608959189 {'n_estimators': 10, 'max_depth': 10, 'criterion': 'entropy', 'max_features': 'auto', 'class_weight': 'balanced_subsample'} 

roc_auc_score 0.9105958990498395 {'n_estimators': 10, 'max_depth': 10, 'criterion': 'entropy', 'max_features': 'sqrt', 'class_weight': 'balanced'} 

roc_auc_score 0.9114392832464053 {'n_estimators': 10, 'max_depth': 10, 'criterion': 'entropy', 'max_features': 'sqrt', 'class_weight': 'balanced_subsample'} 

roc_auc_score 0.91089753925204 {'n_estimators': 10, 'max_depth': 10, 'criterion': 'entropy', 'max_features': 'log2', 'class_weight': 'balanced'} 

roc_auc_score 0.911798779481392 {'n_estimators': 10, 'max_depth': 10, 'criterion': 'entropy', 'max_features': 'log2', 'class_weight': 'balanced_subsample'} 

roc_auc_score 0.8824136677705119 {'n_estimators': 10, 'max_depth': 100, 'criterion': 'gini', 'max_features': 'auto', 'class_weight': 'balanced'} 

roc_auc_score 0.8828562802384443 {'n_estimators': 10, 'max_depth': 100, 'criterio

In [613]:
# KNeighborsClassifier

m = models[2]
print(estimator:= m.__name__)

n_neighbors = range(2, 55, 10)
weights =  ['uniform', 'distance']
algorithm = ['ball_tree', 'kd_tree', 'brute']

for par in product(n_neighbors, weights, algorithm):
    params = {'n_neighbors':par[0],
              'weights':par[1],
              'algorithm':par[2],}
    Y_pred_prob = m(**params).fit(X_train, Y_train).predict_proba(X_test)
    print('roc_auc_score', (auc_score:= roc_auc_score(Y_test, Y_pred_prob[:,1])), params,'\n')

    results = results.append({'estimator':estimator, 'auc_score':auc_score, 'params':params}, ignore_index=True)

KNeighborsClassifier
roc_auc_score 0.6720979572223095 {'n_neighbors': 2, 'weights': 'uniform', 'algorithm': 'ball_tree'} 

roc_auc_score 0.6719661921802058 {'n_neighbors': 2, 'weights': 'uniform', 'algorithm': 'kd_tree'} 

roc_auc_score 0.6716118208391351 {'n_neighbors': 2, 'weights': 'uniform', 'algorithm': 'brute'} 

roc_auc_score 0.6791642312953053 {'n_neighbors': 2, 'weights': 'distance', 'algorithm': 'ball_tree'} 

roc_auc_score 0.6790359707762472 {'n_neighbors': 2, 'weights': 'distance', 'algorithm': 'kd_tree'} 

roc_auc_score 0.6787329991065099 {'n_neighbors': 2, 'weights': 'distance', 'algorithm': 'brute'} 

roc_auc_score 0.6707349238558837 {'n_neighbors': 12, 'weights': 'uniform', 'algorithm': 'ball_tree'} 

roc_auc_score 0.6707349238558837 {'n_neighbors': 12, 'weights': 'uniform', 'algorithm': 'kd_tree'} 

roc_auc_score 0.670801195768385 {'n_neighbors': 12, 'weights': 'uniform', 'algorithm': 'brute'} 

roc_auc_score 0.6972523559564406 {'n_neighbors': 12, 'weights': 'distance'

In [604]:
# DecisionTreeClassifier

m = models[3]
print(estimator:= m.__name__)

max_depth = 10**np.arange(3)
criterion = ['gini', 'entropy']
max_features = ['auto', 'sqrt', 'log2']
splitter = ['best', 'random']

for par in product(splitter, max_depth, criterion, max_features):
    params = {'splitter':par[0],
              'max_depth':par[1],
              'criterion':par[2],
              'max_features':par[3]}
    Y_pred_prob = m(**params).fit(X_train, Y_train).predict_proba(X_test)
    print('roc_auc_score', (auc_score:= roc_auc_score(Y_test, Y_pred_prob[:,1])), params,'\n')

    results = results.append({'estimator':estimator, 'auc_score':auc_score, 'params':params}, ignore_index=True)

DecisionTreeClassifier
roc_auc_score 0.705279987274185 {'splitter': 'best', 'max_depth': 1, 'criterion': 'gini', 'max_features': 'auto'} 

roc_auc_score 0.6569114269106471 {'splitter': 'best', 'max_depth': 1, 'criterion': 'gini', 'max_features': 'sqrt'} 

roc_auc_score 0.5870908519588476 {'splitter': 'best', 'max_depth': 1, 'criterion': 'gini', 'max_features': 'log2'} 

roc_auc_score 0.5614289128728341 {'splitter': 'best', 'max_depth': 1, 'criterion': 'entropy', 'max_features': 'auto'} 

roc_auc_score 0.727184864118211 {'splitter': 'best', 'max_depth': 1, 'criterion': 'entropy', 'max_features': 'sqrt'} 

roc_auc_score 0.727184864118211 {'splitter': 'best', 'max_depth': 1, 'criterion': 'entropy', 'max_features': 'log2'} 

roc_auc_score 0.8885188735020929 {'splitter': 'best', 'max_depth': 10, 'criterion': 'gini', 'max_features': 'auto'} 

roc_auc_score 0.8795262422453949 {'splitter': 'best', 'max_depth': 10, 'criterion': 'gini', 'max_features': 'sqrt'} 

roc_auc_score 0.8880087455812236 

In [614]:
results.sort_values(by='auc_score', ascending=False)

Unnamed: 0,estimator,auc_score,params
212,RandomForestClassifier,0.915551,"{'n_estimators': 100, 'max_depth': 10, 'criter..."
208,RandomForestClassifier,0.915356,"{'n_estimators': 100, 'max_depth': 10, 'criter..."
218,RandomForestClassifier,0.915274,"{'n_estimators': 100, 'max_depth': 10, 'criter..."
217,RandomForestClassifier,0.915194,"{'n_estimators': 100, 'max_depth': 10, 'criter..."
207,RandomForestClassifier,0.915174,"{'n_estimators': 100, 'max_depth': 10, 'criter..."
...,...,...,...
253,DecisionTreeClassifier,0.547428,"{'splitter': 'random', 'max_depth': 1, 'criter..."
12,LogisticRegression,0.542549,"{'solver': 'saga', 'max_iter': 1}"
6,LogisticRegression,0.504684,"{'solver': 'liblinear', 'max_iter': 1}"
3,LogisticRegression,0.504684,"{'solver': 'lbfgs', 'max_iter': 1}"


In [622]:
best_result = pd.DataFrame()
for es in results.estimator.unique():
    best_result = best_result.append(results[results.estimator == es].sort_values(by='auc_score', ascending=False).head(1), ignore_index = True)

In [627]:
best_result.sort_values(by='auc_score', ascending=False)

Unnamed: 0,estimator,auc_score,params
1,RandomForestClassifier,0.915551,"{'n_estimators': 100, 'max_depth': 10, 'criter..."
2,DecisionTreeClassifier,0.893103,"{'splitter': 'best', 'max_depth': 10, 'criteri..."
0,LogisticRegression,0.853918,"{'solver': 'newton-cg', 'max_iter': 100}"
3,KNeighborsClassifier,0.698599,"{'n_neighbors': 8, 'weights': 'distance', 'alg..."
