Дана статистика пользователей adult.csv.  
Получите значения AUC для различных моделей и их параметров.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw data; the path is relative to the current working directory.
df = pd.read_csv('adult.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print a per-column summary of the frame.
df.info()

In [None]:
def make_features_cat(df, cat_features):
    """Append one-hot indicator columns for every listed categorical column.

    Each column named in ``cat_features`` is expanded with
    ``pd.get_dummies`` (prefix = the column name, no level dropped) and the
    resulting indicator columns are appended to the right of the frame, in
    the order the names are given. The source columns are kept; dropping
    them is left to the caller.

    Returns the widened frame, or the input object itself when
    ``cat_features`` is empty. The input frame is not modified.
    """
    frames = [df]
    frames.extend(
        pd.get_dummies(df[column], prefix=column, drop_first=False)
        for column in cat_features
    )
    if len(frames) == 1:
        # Nothing to encode: hand the input back untouched.
        return df
    return pd.concat(frames, axis=1)

Значения признака native-country сильно скошены в сторону United-States, поэтому заменим его бинарным флагом is_from_usa.

In [None]:
# Binary flag: 1 when native-country is 'United-States', otherwise 0.
# A vectorized comparison replaces the row-wise apply, which called a Python
# lambda once per row.
df['is_from_usa'] = (df['native-country'] == 'United-States').astype(int)
df = df.drop(labels='native-country', axis=1)
# Share of each flag value.
df['is_from_usa'].value_counts(normalize=True)

In [None]:
# Encode the target as 0/1: '<=50K' -> 0, '>50K' -> 1.
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
# Class counts after encoding.
df['income'].value_counts()

In [None]:
# Replace the text 'gender' column with a 0/1 'is_male' flag.
df['is_male'] = df['gender'].map({'Female': 0, 'Male': 1})
df = df.drop(columns='gender')
# Counts of each flag value.
df['is_male'].value_counts()

У нас есть два близких признака: education и educational-num. Второй является по сути LabelEncoding для первого, поэтому уберем education

In [None]:
# 'education' is dropped; 'educational-num' stays in the frame.
df = df.drop(columns='education')

In [None]:
# Text columns to expand into one-hot indicator columns.
categorical_features = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]
# NOTE(review): not referenced anywhere else in the visible notebook.
numeric_features = ['fnlwgt', 'capital-gain', 'capital-loss']

# Append the indicator columns, then remove the original text columns.
df = make_features_cat(df, categorical_features).drop(columns=categorical_features)

In [None]:
# Separate the target; from here on df holds the feature columns only.
y = df['income']
df = df.drop(columns='income')

# Hold out 20% of the rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize every feature column. The scaler is fitted on the training part
# only and then re-used, unchanged, for the test part.
scaler = StandardScaler()

# The scaled values are re-wrapped as DataFrames. The original row index is
# passed back in so that X_train / X_test stay aligned with y_train / y_test
# (previously the index was silently reset to 0..n-1).
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=df.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=df.columns,
    index=X_test.index,
)

X_train.head()

Обучим модель в лоб и посмотрим на качество

In [None]:
# Baseline: logistic regression with C=0.1, fitted on the training part.
model = LogisticRegression(C=0.1).fit(X_train, y_train)
# Score of the baseline on the held-out part.
model.score(X_test, y_test)

In [None]:
# ROC AUC on the held-out part. roc_auc_score takes (y_true, y_score) in that
# order, and the score should be continuous, so pass the predicted probability
# of class 1 instead of hard 0/1 labels.
roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

Использование модели в лоб дало низкое качество. Теперь поиграемся с параметрами и моделями.  
В качестве моделей возьмем логистическую регрессию, случайный лес и SVM.  
Проверять будем по метрике ROC AUC с использованием GridSearch и Cross Validation.  
Тестовую выборку будем использовать в качестве валидационной.

#### Логистическая регрессия

In [95]:
# The grid below contains both 'l1' and 'l2'. liblinear supports both; the
# current default solver (lbfgs) rejects 'l1', so half of the grid would fail
# to fit (and the failure is hidden because warnings are suppressed).
clf_logreg = LogisticRegression(solver='liblinear')
logreg_params = {
    'C': [10 ** x for x in range(-2, 3)],  # 0.01, 0.1, 1, 10, 100
    'penalty': ['l1', 'l2'],
}

#### RandomForest

In [134]:
# Random forest with default settings; the grid varies forest size and depth.
clf_rf = RandomForestClassifier()
rf_params = dict(
    n_estimators=[30, 50, 100, 300],
    max_depth=[5, 10, 15],
)

#### SVM

In [138]:
# SVM with a deliberately tiny grid (one value per parameter). The wider
# candidate lists are kept as trailing comments.
# NOTE(review): 'precomputed' presumably needs a kernel matrix rather than raw
# features - confirm before re-enabling it.
clf_svm = SVC()
svm_params = dict(
    C=[0.1],            # , 1, 10
    kernel=['linear'],  # , 'poly', 'rbf', 'sigmoid', 'precomputed'
    gamma=[1],          # , 2, 3, 'auto'
)

Функция, которая для каждой модели подбирает гиперпараметры через GridSearchCV и печатает лучшие параметры и значения метрики

In [143]:
def calculate_models(models, cv=10, scoring='roc_auc'):
    """Grid-search every (estimator, param_grid) pair and print a report.

    Parameters
    ----------
    models : iterable
        Items whose element 0 is an estimator and element 1 is its
        parameter grid.
    cv
        Passed through to ``GridSearchCV(cv=...)``.
    scoring
        Passed through to ``GridSearchCV(scoring=...)``.

    The data is not passed in: ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` are read from the enclosing (notebook) scope. Results are
    printed only; nothing is returned.
    """
    for entry in models:
        estimator, param_grid = entry[0], entry[1]

        print('-----------------------------------------------------------')
        print('Model : {}'.format(estimator.__class__.__name__))
        print('Parameters : {}'.format(param_grid))

        search = GridSearchCV(
            estimator,
            param_grid=param_grid,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
        )
        search.fit(X_train, y_train)

        # 'Train score' is the search's best_score_; 'Validation score' is
        # the fitted search scored on the held-out part.
        held_out_score = search.score(X_test, y_test)
        print('Best params      : {}'.format(search.best_params_))
        print('Train score      : {}'.format(search.best_score_))
        print('Validation score : {}'.format(held_out_score))

In [None]:
# Estimator/grid pairs to evaluate; only the SVM pair is enabled here.
models = [
    # (clf_logreg, logreg_params)
    # (clf_rf, rf_params)
    (clf_svm, svm_params),
]

calculate_models(models=models, cv=10, scoring='roc_auc')

-----------------------------------------------------------
Model : SVC
Parameters : {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], 'gamma': [1, 2, 3, 'auto']}
