In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

In [2]:
RANDOM_STATE = 42

results_regression = pd.DataFrame(columns = ['model', 'task', 'R2'])
results_classification = pd.DataFrame(columns = ['model', 'task', 'f1', 'accuracy'])

https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

In [4]:
data = pd.read_csv('boston.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


1. Разделите выборку на обучающую и тестовую в отношении 80%/20%, предварительно выделив целевую переменную (колонка 'MEDV').

In [6]:
### Ваш код ###

features = list(data.columns)
features.remove('MEDV')

X, y = data[features], data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

2. Обучите стандартную регрессию, а также Ridge и  Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

In [45]:
### Ваш код ###

def get_r2_score(model):  
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)    
    return r2_score(y_test, y_pred)


r2_lr = get_r2_score(LinearRegression())
r2_ridge = get_r2_score(Ridge())
r2_lasso = get_r2_score(Lasso())


results_regression.loc[0] = ['LR', 'task2', r2_lr]
results_regression.loc[1] = ['Ridge', 'task2', r2_ridge]
results_regression.loc[2] = ['Lasso', 'task2', r2_lasso]

print(
    f'r2 LinearRegression - {r2_lr}',
    f'r2 Ridge - {r2_ridge}',
    f'r2 Lasso - {r2_lasso}',
    sep='\n'
)


r2 LinearRegression - 0.6684825753971597
r2 Ridge - 0.6659608075261694
r2 Lasso - 0.6668687223368214


3. Для Ridge и Lasso подберите коэффициент регуляризации двумя способами 1) GridSearchCV, 2) RidgeCV и LassoCV, в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по всем моделям и сравните с предыдущими результатами.

In [46]:
alpha_params = {'alpha': list(np.logspace(-5, 5, num=11))}

alpha_params

{'alpha': [1e-05,
  0.0001,
  0.001,
  0.01,
  0.1,
  1.0,
  10.0,
  100.0,
  1000.0,
  10000.0,
  100000.0]}

In [47]:
### Ваш код ###


# GridSearchCV
model = GridSearchCV(Ridge(), alpha_params)
model.fit(X_train, y_train)
print(f'ridge GS best params {model.best_params_}')

y_pred = model.best_estimator_.predict(X_test)    
r2_ridge_grid_search = r2_score(y_test, y_pred)

model = GridSearchCV(Lasso(), alpha_params)
model.fit(X_train, y_train)
print(f'lasso GS best params {model.best_params_}')

y_pred = model.best_estimator_.predict(X_test)    
r2_lasso_grid_search = r2_score(y_test, y_pred)

#RidgeCv
model = RidgeCV(alphas=np.logspace(-5, 5, num=11))
model.fit(X_train, y_train)
print(f'RidgeCV best params {model.alpha_}')

y_pred = model.predict(X_test)    
r2_ridge_cv = r2_score(y_test, y_pred)

#LassoCv
model = LassoCV(alphas=np.logspace(-5, 5, num=11))
model.fit(X_train, y_train)
print(f'LassoCV best params {model.alpha_}\n')

y_pred = model.predict(X_test)    
r2_lasso_cv = r2_score(y_test, y_pred)

print(
    f'r2 ridge grid - {r2_ridge_grid_search}', 
    f'r2 lasso grid - {r2_lasso_grid_search}',
    f'r2 ridge RidgeCV - {r2_ridge_cv}',
    f'r2 lasso LassoCV - {r2_lasso_cv}', 
    sep='\n'
)

results_regression.loc[3] = ['Ridge_GridSearchCV', 'task3', r2_ridge_grid_search]
results_regression.loc[4] = ['RidgeCV', 'task3', r2_ridge_cv]
results_regression.loc[5] = ['Lasso_GridSearchCV', 'task3', r2_lasso_grid_search]
results_regression.loc[6] = ['LassoCV', 'task3', r2_lasso_cv]
results_regression

ridge GS best params {'alpha': 1e-05}
lasso GS best params {'alpha': 1e-05}
RidgeCV best params 0.01
LassoCV best params 1e-05

r2 ridge grid - 0.6684825680074256
r2 lasso grid - 0.6684829595885678
r2 ridge RidgeCV - 0.6684746105287571
r2 lasso LassoCV - 0.6684829595885678


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668475
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483


*в сравнении с обучением с дефолтными параметрами при использовании подбора гипер параметров наблюдается небольшое улучшение r2 score моделей Ridge и Lasso, при этом простая модель линейной регерссии показала этот же результат*

4. Проведите масштабирование выборки (используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [48]:
### Ваш код ###

def get_pipeline_scaler_r2_score(scaler, model):  

    pipe = make_pipeline(scaler, model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    return r2_score(y_test, y_pred)

r2_ridge_standart_scaler = get_pipeline_scaler_r2_score(StandardScaler(), Ridge())
r2_ridge_min_max_scaler = get_pipeline_scaler_r2_score(MinMaxScaler(), Ridge())
r2_lasso_standart_scaler = get_pipeline_scaler_r2_score(StandardScaler(), Lasso())
r2_lasso_min_max_scaler = get_pipeline_scaler_r2_score(MinMaxScaler(), Lasso())

print(
    f'r2 ridge standart scaler - {r2_ridge_standart_scaler}', 
    f'r2 lasso standart scaler - {r2_lasso_standart_scaler}',
    f'r2 ridge min max scaler - {r2_ridge_min_max_scaler}',
    f'r2 lasso min max scaler - {r2_lasso_min_max_scaler}', 
    sep='\n'
)

results_regression.loc[7] = ['Ridge_StandardScaler', 'task4', r2_ridge_standart_scaler]
results_regression.loc[8] = ['Ridge_MinMaxScaler', 'task4', r2_ridge_min_max_scaler]
results_regression.loc[9] = ['Lasso_StandardScaler', 'task4', r2_lasso_standart_scaler]
results_regression.loc[10] = ['Lasso_MinMaxScaler', 'task4', r2_lasso_min_max_scaler]
results_regression

r2 ridge standart scaler - 0.6681901076774428
r2 lasso standart scaler - 0.6240447523478461
r2 ridge min max scaler - 0.6762207658974593
r2 lasso min max scaler - 0.2573921442545195


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668475
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


*масштабирование с использованием MinMaxScaler без подбора коэфф регуляризации плохо работает с Lasso, наблюдаем сильное падение качества модели, с StandardScaler Lasso также ухудшил качество. Ridge c использованием масштабирования не показал сколько нибудь значимого прироста или падения качества*

5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами.

In [49]:
### Ваш код ###

alphas = np.logspace(-5, 5, num=11)

def get_scaler_params_r2_score(scaler, model): 
    pipe = make_pipeline(scaler, model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    return r2_score(y_test, y_pred)

r2_ridge_standart_scaler_cv = get_pipeline_scaler_r2_score(StandardScaler(), RidgeCV(alphas=alphas))
r2_ridge_min_max_scaler_cv = get_pipeline_scaler_r2_score(MinMaxScaler(), RidgeCV(alphas=alphas))
r2_lasso_standart_scaler_cv = get_pipeline_scaler_r2_score(StandardScaler(), LassoCV(alphas=alphas))
r2_lasso_min_max_scaler_cv = get_pipeline_scaler_r2_score(MinMaxScaler(), LassoCV(alphas=alphas))

print(
    f'r2 ridge regular standart scaler - {r2_ridge_standart_scaler_cv}', 
    f'r2 lasso regular standart scaler - {r2_lasso_standart_scaler_cv}',
    f'r2 ridge regular min max scaler - {r2_ridge_min_max_scaler_cv}',
    f'r2 lasso regular min max scaler - {r2_lasso_min_max_scaler_cv}', 
    sep='\n'
)

results_regression.loc[11] = ['Ridge_StandardScaler_CV', 'task5', r2_ridge_standart_scaler_cv]
results_regression.loc[12] = ['Ridge_MinMaxScaler_CV', 'task5', r2_ridge_min_max_scaler_cv]
results_regression.loc[13] = ['Lasso_StandardScaler_CV', 'task5', r2_lasso_standart_scaler_cv]
results_regression.loc[14] = ['Lasso_MinMaxScaler_CV', 'task5', r2_lasso_min_max_scaler_cv]
results_regression

r2 ridge regular standart scaler - 0.6657305340616751
r2 lasso regular standart scaler - 0.6684821312777706
r2 ridge regular min max scaler - 0.6697651112858871
r2 lasso regular min max scaler - 0.668483656442819


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668475
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


*подбор коэффициента регуляризации исправил снижение качества Lasso в предыдущем примере, но значимого прироста качества не дал ни по одной модели*

6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [50]:
### Ваш код ###
 
def poly_scaler_r2_score(scaler, model):    
    poly = PolynomialFeatures()
    
    pipe = make_pipeline(scaler, poly, model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    return r2_score(y_test, y_pred)


r2_ridge_standart_scaler_poly = poly_scaler_r2_score(StandardScaler(), Ridge())
r2_ridge_min_max_scaler_poly = poly_scaler_r2_score(MinMaxScaler(), Ridge())
r2_lasso_standart_scaler_poly = poly_scaler_r2_score(StandardScaler(), Lasso())
r2_lasso_min_max_scaler_poly = poly_scaler_r2_score(MinMaxScaler(), Lasso())

print(
    f'r2 ridge regular standart scaler polynomial - {r2_ridge_standart_scaler_poly}', 
    f'r2 lasso regular standart scaler polynomial - {r2_lasso_standart_scaler_poly}',
    f'r2 ridge regular min max scaler polynomial - {r2_ridge_min_max_scaler_poly}',
    f'r2 lasso regular min max scaler polynomial - {r2_lasso_min_max_scaler_poly}', 
    sep='\n'
)

results_regression.loc[15] = ['Ridge_StandardScaler_Poly', 'task6', r2_ridge_standart_scaler_poly]
results_regression.loc[16] = ['Ridge_MinMaxScaler_Poly', 'task6', r2_ridge_min_max_scaler_poly]
results_regression.loc[17] = ['Lasso_StandardScaler_Poly', 'task6', r2_lasso_standart_scaler_poly]
results_regression.loc[18] = ['Lasso_MinMaxScaler_Poly', 'task6', r2_lasso_min_max_scaler_poly]
results_regression

r2 ridge regular standart scaler polynomial - 0.8171359789657121
r2 lasso regular standart scaler polynomial - 0.7322738282708989
r2 ridge regular min max scaler polynomial - 0.8298862467502748
r2 lasso regular min max scaler polynomial - 0.2611262741735658


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668475
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


*снова сильно просело качество на комбинации Lasso и MinMaxScaler. В остальных комбинациях видим значительный прирост качества. Наибольший прирост дала комбинация Ridge и MinMaxScaler. Прирост качества при использовании PolynomialFeatures по большинству комбинаций может говорить о нелинейной зависимости признаков. Возможно этот факт не давал нам повысить качество в предыдущих экспериментах*

7. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, добавив PolynomialFeatures, посчитайте R2 и сравните с предыдущими результатами.

In [51]:
### Ваш код ###

r2_ridge_standart_scaler_poly_cv = poly_scaler_r2_score(StandardScaler(), RidgeCV(alphas=alphas))
r2_ridge_min_max_scaler_poly_cv = poly_scaler_r2_score(MinMaxScaler(), RidgeCV(alphas=alphas))
r2_lasso_standart_scaler_poly_cv = poly_scaler_r2_score(StandardScaler(), LassoCV(alphas=alphas))
r2_lasso_min_max_scaler_poly_cv = poly_scaler_r2_score(MinMaxScaler(), LassoCV(alphas=alphas))

print(
    f'r2 ridge regular standart scaler polynomial cv - {r2_ridge_standart_scaler_poly_cv}', 
    f'r2 lasso regular standart scaler polynomial cv - {r2_lasso_standart_scaler_poly_cv}',
    f'r2 ridge regular min max scaler polynomial cv - {r2_ridge_min_max_scaler_poly_cv}',
    f'r2 lasso regular min max scaler polynomial cv - {r2_lasso_min_max_scaler_poly_cv}', 
    sep='\n'
)

results_regression.loc[19] = ['Ridge_StandardScaler_Poly_CV', 'task7', r2_ridge_standart_scaler_poly_cv]
results_regression.loc[20] = ['Ridge_MinMaxScaler_Poly_CV', 'task7', r2_ridge_min_max_scaler_poly_cv]
results_regression.loc[21] = ['Lasso_StandardScaler_Poly_CV', 'task7', r2_lasso_standart_scaler_poly_cv]
results_regression.loc[22] = ['Lasso_MinMaxScaler_Poly_CV', 'task7', r2_lasso_min_max_scaler_poly_cv]
results_regression

r2 ridge regular standart scaler polynomial cv - 0.8187344606117388
r2 lasso regular standart scaler polynomial cv - 0.8128138856150267
r2 ridge regular min max scaler polynomial cv - 0.8501402125961249
r2 lasso regular min max scaler polynomial cv - 0.8397022991682287


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668475
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


*подбор параметров регуляризации снова улучшил качество моделли Lasso, при том MinMaxScaler отработал лучше StandartScaler. Так же MinMaxScaler показал лучшее чем StandartScaler качество на моделе Ridge. По итогу эксперимента можно сделать вывод, что подбор параметров сильно улучшает качество Lasso. Лучший результат дает комбинация - MinMaxScaler + PolynomialFeatures + Ridge*

8. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2.

In [52]:
%%time
### Ваш код ###

pipe = Pipeline([
            ('scaler', StandardScaler()), 
            ('polynom', PolynomialFeatures()), 
            ('model', Lasso())
        ])


params = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'polynom__degree': [1, 2, 3, 4, 5],
    'model__alpha': alphas,  
    'model': [Ridge(), Lasso()] 
}

grid = GridSearchCV(pipe, params)

grid.fit(X_train, y_train)

y_pred = grid.best_estimator_.predict(X_test)
r2_score(y_pred, y_test)

best_params = grid.best_params_
print('Параметры лучшей модели:\n', best_params)
r2_best_model = r2_score(y_test, y_pred)
print(r2_best_model)
results_regression.loc[23] = ['Best_Model', 'task8', r2_best_model]

Параметры лучшей модели:
 {'model': Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001), 'model__alpha': 1.0, 'polynom__degree': 5, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}
0.85988059063973
Wall time: 5min 49s


In [53]:
%%time
### Ваш код ###
# Подбор коэффициента регуляризации с помощью LassoCV и RidgeCV работает значительно быстрее GreadSearchCV

pipe = Pipeline([
            ('scaler', StandardScaler()), 
            ('polynom', PolynomialFeatures()), 
            ('model', LassoCV(alphas))
        ])


params = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'polynom__degree': [1, 2, 3, 4, 5],
    'model': [RidgeCV(alphas), LassoCV(alphas)] 
}

grid = GridSearchCV(pipe, params)

grid.fit(X_train, y_train)

y_pred = grid.best_estimator_.predict(X_test)
r2_score(y_pred, y_test)

best_params = grid.best_params_
print('Параметры лучшей модели:\n', best_params)
r2_best_model = r2_score(y_test, y_pred)
print(r2_best_model)

Параметры лучшей модели:
 {'model': RidgeCV(alphas=array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05]),
        cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
        scoring=None, store_cv_values=False), 'polynom__degree': 5, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}
0.8598805906397202
Wall time: 5.33 s


*реализованный pipeline и подбор параметров подтвердил вывод о лучшей комбинации, полученный в предыдущем эксперименте. Качество модели при этом немного улучшено, тк добавлен подбор степени полинома (он отличается от степени по умолчанию). Путем эксперимента так же выявлено, что подбор параметра регуляризации с помощью RidgeCV и LassoCV работает значительно быстрее, чем подбор с помощью GreadSearchCV, а качество дает такое же, поэтому предпочтительнее использовать их.*

In [54]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668475
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


http://archive.ics.uci.edu/ml/datasets/Adult

In [3]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from concurrent.futures import ThreadPoolExecutor

In [4]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


9. Разделите выборку на признаки и целевую переменную(колонка class). Замените целевую переменную на числовые значения ('<=50K' - 1, '>50K' - 0).

In [5]:
### Ваш код ###
X = data.drop('class', axis=1)
y = data['class'].map({'<=50K': 1, '>50K': 0}).astype(int)
y.unique()

array([1, 0])

In [6]:
y.shape

(48842,)

In [7]:
y.value_counts()

class
1    37155
0    11687
Name: count, dtype: int64

10. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [8]:
### Ваш код ###
y_true = y

y_pred = [y.mode().iloc[0]] * len(y_true)

assert(len(y_true) == len(y_pred))

f1_most_frequent = f1_score(y_true, y_pred)
acc_most_frequent = accuracy_score(y_true, y_pred)

results_classification.loc[0] = ['Most Frequent class', 'task10', f1_most_frequent, acc_most_frequent]
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718


11. Выясните, присутствуют ли в данных пропуски. Если присутствуют, заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [9]:
### Ваш код ###
X.isnull().sum()


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

In [11]:
dtypes = X.dtypes
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)
X = pd.DataFrame(imputer.transform(X), columns=X.columns).astype(dtypes)
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


12. Выберите колонки с числовыми и категориальными переменными (используя возможности pandas).

In [12]:
X.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [13]:
tasks = 0
tasks+=1
### Ваш код ###
number_columns =  X.select_dtypes('int64').columns
category_columns = X.select_dtypes('object').columns

assert(len(number_columns) + len(category_columns) == X.shape[1])

13. Создайте пайплайн по обработке числовых и категориальных значений колонок (используйте OneHotEncoder,MinMaxScaler) и посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [13]:
def get_cv_f1_accuracy_score(model, x_data, y_data):
    pipe_categ = Pipeline([
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ])

    pipe_number = Pipeline([
                ('scaler', MinMaxScaler())
            ])

    transformer = ColumnTransformer([
        ('category', pipe_categ, category_columns),
        ('number', pipe_number, number_columns)
    ])

    pipe_process = Pipeline([
        ('processor', transformer),
        ('model', model)
    ])

    f1_score = cross_val_score(pipe_process, x_data, y_data, scoring='f1').mean()
    accuracy_score = cross_val_score(pipe_process, x_data, y_data, scoring='accuracy').mean()
    
    return accuracy_score, f1_score

In [16]:
%%time
### Ваш код ###

with ThreadPoolExecutor(max_workers=3) as executor:
    lr_future = executor.submit(get_cv_f1_accuracy_score, LogisticRegression(), X, y)
    knn_future = executor.submit(get_cv_f1_accuracy_score, KNeighborsClassifier(), X, y)
    svc_future = executor.submit(get_cv_f1_accuracy_score, LinearSVC(), X, y)

    lr_scores = lr_future.result()
    knn_scores = knn_future.result()
    svc_scores = svc_future.result()

f1_LR = lr_scores[1]
acc_LR = lr_scores[0]
f1_KNN = knn_scores[1]
acc_KNN = knn_scores[0]
f1_SVM = svc_scores[1]
acc_SVM = svc_scores[0]
results_classification.loc[1] = ['LogisticRegression', 'task13', f1_LR, acc_LR]
results_classification.loc[2] = ['KNeighborsClassifier', 'task13', f1_KNN, acc_KNN]
results_classification.loc[3] = ['LinearSVC', 'task13', f1_SVM, acc_SVM]


Wall time: 3min 47s


In [17]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.905,0.851153
2,KNeighborsClassifier,task13,0.886993,0.824782
3,LinearSVC,task13,0.906322,0.852914


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями, (испольуйте SimpleImputer). Посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [18]:
def get_si_cv_f1_accuracy_score(model, x_data, y_data):
    pipe_categ = Pipeline([  
                ('imputer', SimpleImputer(missing_values='?', strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ])

    pipe_number = Pipeline([
                ('scaler', MinMaxScaler())
            ])

    transformer = ColumnTransformer([
        
        ('category', pipe_categ, category_columns),
        ('number', pipe_number, number_columns)
    ])

    pipe_process = Pipeline([
       
        ('processor', transformer),
        ('model', model)
    ])

    f1_score = cross_val_score(pipe_process, x_data, y_data, scoring='f1').mean()
    accuracy_score = cross_val_score(pipe_process, x_data, y_data, scoring='accuracy').mean()
    
    return accuracy_score, f1_score

In [19]:
%%time
### Ваш код ###

with ThreadPoolExecutor(max_workers=3) as executor:
    lr_future = executor.submit(get_si_cv_f1_accuracy_score, LogisticRegression(), X, y)
    knn_future = executor.submit(get_si_cv_f1_accuracy_score, KNeighborsClassifier(), X, y)
    svc_future = executor.submit(get_si_cv_f1_accuracy_score, LinearSVC(), X, y)

    lr_scores = lr_future.result()
    knn_scores = knn_future.result()
    svc_scores = svc_future.result()

f1_LR = lr_scores[1]
acc_LR = lr_scores[0]
f1_KNN = knn_scores[1]
acc_KNN = knn_scores[0]
f1_SVM = svc_scores[1]
acc_SVM = svc_scores[0]

results_classification.loc[4] = ['LogisticRegression_impute', 'task14', f1_LR, acc_LR]
results_classification.loc[5] = ['KNeighborsClassifier_impute', 'task14', f1_KNN, acc_KNN]
results_classification.loc[6] = ['LinearSVC_impute', 'task14', f1_SVM, acc_SVM]

Wall time: 4min 55s


In [21]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.905,0.851153
2,KNeighborsClassifier,task13,0.886993,0.824782
3,LinearSVC,task13,0.906322,0.852914
4,LogisticRegression_impute,task14,0.904876,0.850866
5,KNeighborsClassifier_impute,task14,0.887343,0.825253
6,LinearSVC_impute,task14,0.905422,0.851255


15. Посчитайте cross_val_score по тем же алгоритмам и метрикам, если просто удалить значения '?'.

In [24]:
%%time
### Ваш код ###
data1 = data.copy()
data1.replace('?', np.nan, inplace=True)
data1.dropna(inplace=True)

assert(data1.isnull().sum().sum() == 0)

X_clear = data1.drop('class', axis=1)
y_clear = data1['class'].map({'<=50K': 1, '>50K': 0}).astype(int)


with ThreadPoolExecutor(max_workers=3) as executor:
    lr_future = executor.submit(get_cv_f1_accuracy_score, LogisticRegression(), X_clear, y_clear)
    knn_future = executor.submit(get_cv_f1_accuracy_score, KNeighborsClassifier(), X_clear, y_clear)
    svc_future = executor.submit(get_cv_f1_accuracy_score, LinearSVC(), X_clear, y_clear)

    lr_scores = lr_future.result()
    knn_scores = knn_future.result()
    svc_scores = svc_future.result()


f1_LR_del_missings = lr_scores[1]
acc_LR_del_missings = lr_scores[0]
f1_KNN_del_missings = knn_scores[1]
acc_KNN_del_missings = knn_scores[0]
f1_SVM_del_missings = svc_scores[1]
acc_SVM_del_missings = svc_scores[0]

results_classification.loc[7] = ['LogisticRegression_delete_missings', 'task15', f1_LR_del_missings, acc_LR_del_missings]
results_classification.loc[8] = ['KNeighborsClassifier_delete_missings', 'task15', f1_KNN_del_missings, acc_KNN_del_missings]
results_classification.loc[9] = ['LinearSVC_delete_missings', 'task15', f1_SVM_del_missings, acc_SVM_del_missings]


Wall time: 3min 11s


In [25]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.905,0.851153
2,KNeighborsClassifier,task13,0.886993,0.824782
3,LinearSVC,task13,0.906322,0.852914
4,LogisticRegression_impute,task14,0.904876,0.850866
5,KNeighborsClassifier_impute,task14,0.887343,0.825253
6,LinearSVC_impute,task14,0.905422,0.851255
7,LogisticRegression_delete_missings,task15,0.901149,0.846845
8,KNeighborsClassifier_delete_missings,task15,0.882951,0.820574
9,LinearSVC_delete_missings,task15,0.902403,0.848503


 16. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier на данных с замененными значениями '?' на самые частые значения.

In [26]:
%%time
### Ваш код ###

with ThreadPoolExecutor(max_workers=2) as executor:
    rf_future = executor.submit(get_si_cv_f1_accuracy_score, RandomForestClassifier(), X, y)
    gb_future = executor.submit(get_si_cv_f1_accuracy_score, GradientBoostingClassifier(), X, y)

    rf_scores = rf_future.result()
    gb_scores = gb_future.result()


f1_RF = rf_scores[1]
acc_RF = rf_scores[0]
f1_GB = gb_scores[1]
acc_GB = gb_scores[0]
results_classification.loc[10] = ['RandomForestClassifier', 'task16', f1_RF, acc_RF]
results_classification.loc[11] = ['GradientBoostingClassifier', 'task16', f1_GB, acc_GB]


Wall time: 7min 50s


In [27]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.905,0.851153
2,KNeighborsClassifier,task13,0.886993,0.824782
3,LinearSVC,task13,0.906322,0.852914
4,LogisticRegression_impute,task14,0.904876,0.850866
5,KNeighborsClassifier_impute,task14,0.887343,0.825253
6,LinearSVC_impute,task14,0.905422,0.851255
7,LogisticRegression_delete_missings,task15,0.901149,0.846845
8,KNeighborsClassifier_delete_missings,task15,0.882951,0.820574
9,LinearSVC_delete_missings,task15,0.902403,0.848503


17. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [20]:
%%time
### Ваш код ###

models = [
    LogisticRegression(), 
    KNeighborsClassifier(),
    LinearSVC(), 
    RandomForestClassifier(), 
    GradientBoostingClassifier()
]

encoders = [
    OneHotEncoder(handle_unknown='ignore'),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
]

scalers = [
    StandardScaler(), 
    MinMaxScaler()
]

pipe_categ = Pipeline([  
                ('imputer', SimpleImputer(missing_values='?', strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ])

pipe_number = Pipeline([
            ('scaler', MinMaxScaler())
        ])

transformer = ColumnTransformer([
    ('category_pipe', pipe_categ, category_columns),
    ('number_pipe', pipe_number, number_columns)
])

pipe_process = Pipeline([

    ('processor', transformer),
    ('model', LogisticRegression())
])


params = {
    'processor__category_pipe__imputer__strategy': ['most_frequent', 'mean', 'median'],
    'processor__number_pipe__scaler': [StandardScaler(), MinMaxScaler()],
    'processor__category_pipe__encoder': encoders,
    'model': models
}

grid = GridSearchCV(pipe_process, params)

grid.fit(X, y)

f1_score = cross_val_score(grid.best_estimator_, X, y, scoring='f1').mean()
accuracy_score = cross_val_score(grid.best_estimator_, X, y, scoring='accuracy').mean()

best_params = grid.best_params_

print('Параметры лучшей модели:\n', best_params)
f1_best = f1_score
acc_best = accuracy_score
results_classification.loc[12] = ['Best_Model', 'task17', f1_best, acc_best]

Параметры лучшей модели:
 {'model': GradientBoostingClassifier(), 'processor__category_pipe__encoder': OneHotEncoder(handle_unknown='ignore'), 'processor__category_pipe__imputer__strategy': 'most_frequent', 'processor__number_pipe__scaler': StandardScaler()}
CPU times: total: 30min 20s
Wall time: 23min 17s


In [21]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
12,Best_Model,task17,0.915502,0.86659


*по результату выбора лучшей модели классификации наилучшие результаты показал градиентный бустинг, значительно улучшив результат предсказания самого частого класса*