In [4]:
import pandas as pd
import numpy as np

In [5]:
RANDOM_STATE = 42

results_regression = pd.DataFrame(columns = ['model', 'task', 'R2'])
results_classification = pd.DataFrame(columns = ['model', 'task', 'f1', 'accuracy'])

https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

In [6]:
data = pd.read_csv('boston.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


1. Разделите выборку на обучающую и тестовую в отношении 80%/20%, предварительно выделив целевую переменную (колонка 'MEDV').

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
target_variable = data['MEDV'].to_numpy()
train_df = data.drop(columns=['MEDV'])

X_train, X_test, Y_train, Y_test = train_test_split(train_df.values, target_variable, test_size=0.2)

2. Обучите стандартную регрессию, а также Ridge и  Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

In [9]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score

In [10]:
lin_regression = LinearRegression()
lasso = Lasso()
ridge = Ridge()

for model in [lin_regression, ridge, lasso]:
  model.fit(X_train, Y_train)

lr_pred = lin_regression.predict(X_test)
ridge_pred = ridge.predict(X_test)
lasso_pred = lasso.predict(X_test)

r2_lr = r2_score(Y_test, lr_pred)
r2_ridge = r2_score(Y_test, ridge_pred)
r2_lasso = r2_score(Y_test, lasso_pred)
results_regression.loc[0] = ['LR', 'task2', r2_lr]
results_regression.loc[1] = ['Ridge', 'task2', r2_ridge]
results_regression.loc[2] = ['Lasso', 'task2', r2_lasso]

3. Для Ridge и Lasso подберите коэффициент регуляризации двумя способами 1) GridSearchCV, 2) RidgeCV и LassoCV, в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по всем моделям и сравните с предыдущими результатами.

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

In [12]:
# create list of all possible params
arr = np.arange(-5, 6, dtype='float64')
grid = 10 ** arr

def GridSearcher(estimator):
  gridCV = GridSearchCV(estimator, {'alpha': grid})
  gridCV.fit(train_df.values, target_variable)

  # let's make decision according to 'mean_test_score'
  res_dict = gridCV.cv_results_
  idx = res_dict['mean_test_score'].argmax()
  return grid[idx]

# we need raw models to estimate params
lasso = Lasso()
ridge = Ridge()

lasso_grid_alpha = GridSearcher(lasso)
ridge_grid_alpha = GridSearcher(ridge)

# other estimators
clf = RidgeCV(alphas=grid).fit(train_df.values, target_variable)
ridge_alpha = clf.score(train_df.values, target_variable)

clf = LassoCV(alphas=grid).fit(train_df.values, target_variable)
lasso_alpha = clf.score(train_df.values, target_variable)

# then we fit models with optimal params and compte r2
lasso = Lasso(alpha=lasso_grid_alpha)
ridge = Ridge(alpha=ridge_grid_alpha)
for model in [ridge, lasso]:
  model.fit(X_train, Y_train)

ridge_pred = ridge.predict(X_test)
lasso_pred = lasso.predict(X_test)

r2_ridge_grid_search = r2_score(Y_test, ridge_pred)
r2_lasso_grid_search = r2_score(Y_test, lasso_pred)

# the same with ridgeCV and lassoCV
lasso = Lasso(alpha=lasso_alpha)
ridge = Ridge(alpha=ridge_alpha)
for model in [ridge, lasso]:
  model.fit(X_train, Y_train)

ridge_pred = ridge.predict(X_test)
lasso_pred = lasso.predict(X_test)

r2_ridge_cv = r2_score(Y_test, ridge_pred)
r2_lasso_cv = r2_score(Y_test, lasso_pred)

results_regression.loc[3] = ['Ridge_GridSearchCV', 'task3', r2_ridge_grid_search]
results_regression.loc[4] = ['RidgeCV', 'task3', r2_ridge_cv]
results_regression.loc[5] = ['Lasso_GridSearchCV', 'task3', r2_lasso_grid_search]
results_regression.loc[6] = ['LassoCV', 'task3', r2_lasso_cv]

In [13]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.708127
1,Ridge,task2,0.699761
2,Lasso,task2,0.663423
3,Ridge_GridSearchCV,task3,0.685485
4,RidgeCV,task3,0.701217
5,Lasso_GridSearchCV,task3,0.663423
6,LassoCV,task3,0.674833


Т.о. результаты почти не изменились. Для LassoCV даже стали чуть лучшу

4. Проведите масштабирование выборки (используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [14]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

In [15]:
lasso_standard = Pipeline([('scaler', StandardScaler()), ('lasso', Lasso())])
lasso_minmax = Pipeline([('scaler', MinMaxScaler()), ('lasso', Lasso())])

lasso_standard.fit(X_train, Y_train)
lasso_minmax.fit(X_train, Y_train)

y_pred = lasso_standard.predict(X_test)
r2_lasso_standart_scaler = r2_score(Y_test, y_pred)

y_pred = lasso_minmax.predict(X_test)
r2_lasso_min_max_scaler = r2_score(Y_test, y_pred)

# the same with Ridge()
ridge_standard = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())])
ridge_minmax = Pipeline([('scaler', MinMaxScaler()), ('ridge', Ridge())])

ridge_standard.fit(X_train, Y_train)
ridge_minmax.fit(X_train, Y_train)

y_pred = ridge_standard.predict(X_test)
r2_ridge_standart_scaler = r2_score(Y_test, y_pred)

y_pred = ridge_minmax.predict(X_test)
r2_ridge_min_max_scaler = r2_score(Y_test, y_pred)

results_regression.loc[7] = ['Ridge_StandardScaler', 'task4', r2_ridge_standart_scaler]
results_regression.loc[8] = ['Ridge_MinMaxScaler', 'task4', r2_ridge_min_max_scaler]
results_regression.loc[9] = ['Lasso_StandardScaler', 'task4', r2_lasso_standart_scaler]
results_regression.loc[10] = ['Lasso_MinMaxScaler', 'task4', r2_lasso_min_max_scaler]

In [16]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.708127
1,Ridge,task2,0.699761
2,Lasso,task2,0.663423
3,Ridge_GridSearchCV,task3,0.685485
4,RidgeCV,task3,0.701217
5,Lasso_GridSearchCV,task3,0.663423
6,LassoCV,task3,0.674833
7,Ridge_StandardScaler,task4,0.707793
8,Ridge_MinMaxScaler,task4,0.703846
9,Lasso_StandardScaler,task4,0.635302


Для ridge результаты стали чуть лучше, для Lasso значительно лучше (по сравнению с ухуджением метрики r2 для ridge)

5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами.

In [17]:
arr = np.arange(-5, 6, dtype='float64')
grid = 10 ** arr

scaler_std = StandardScaler().fit(train_df.values)
scaler_minmax = MinMaxScaler().fit(train_df.values)

X_train_std = scaler_std.transform(X_train)
X_train_minmax = scaler_minmax.transform(X_train)
X_test_std = scaler_std.transform(X_test)
X_test_minmax = scaler_minmax.transform(X_test)
train_df_std = scaler_std.transform(train_df.values)
train_df_minmax = scaler_minmax.transform(train_df.values)

# let's use searchers CV to find optimal params
ridge_grid_alpha_std = RidgeCV(alphas=grid).fit(train_df_std, target_variable).score(train_df_std, target_variable)
ridge_grid_alpha_minmax = RidgeCV(alphas=grid).fit(train_df_minmax, target_variable).score(train_df_minmax, target_variable)

lasso_grid_alpha_std = LassoCV(alphas=grid).fit(train_df_std, target_variable).score(train_df_std, target_variable)
lasso_grid_alpha_minmax = LassoCV(alphas=grid).fit(train_df_minmax, target_variable).score(train_df_minmax, target_variable)

lasso_std = Lasso(alpha=lasso_grid_alpha_std)
lasso_minmax = Lasso(alpha=lasso_grid_alpha_minmax)
ridge_std = Ridge(alpha=ridge_grid_alpha_std)
ridge_minmax = Ridge(alpha=ridge_grid_alpha_minmax)

for model in [ridge_std, lasso_std]:
  model.fit(X_train_std, Y_train)

for model in [ridge_minmax, lasso_minmax]:
  model.fit(X_train_minmax, Y_train)

# finally estimate our models

ridge_pred_std = ridge_std.predict(X_test_std)
ridge_pred_minmax = ridge_minmax.predict(X_test_minmax)
lasso_pred_std = lasso_std.predict(X_test_std)
lasso_pred_minmax = lasso_minmax.predict(X_test_minmax)

r2_ridge_standart_scaler_cv = r2_score(Y_test, ridge_pred_std)
r2_ridge_min_max_scaler_cv =  r2_score(Y_test, ridge_pred_minmax)
r2_lasso_standart_scaler_cv =  r2_score(Y_test, lasso_pred_std)
r2_lasso_min_max_scaler_cv =  r2_score(Y_test, lasso_pred_minmax)
results_regression.loc[11] = ['Ridge_StandardScaler_CV', 'task5', r2_ridge_standart_scaler_cv]
results_regression.loc[12] = ['Ridge_MinMaxScaler_CV', 'task5', r2_ridge_min_max_scaler_cv]
results_regression.loc[13] = ['Lasso_StandardScaler_CV', 'task5', r2_lasso_standart_scaler_cv]
results_regression.loc[14] = ['Lasso_MinMaxScaler_CV', 'task5', r2_lasso_min_max_scaler_cv]

In [18]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.708127
1,Ridge,task2,0.699761
2,Lasso,task2,0.663423
3,Ridge_GridSearchCV,task3,0.685485
4,RidgeCV,task3,0.701217
5,Lasso_GridSearchCV,task3,0.663423
6,LassoCV,task3,0.674833
7,Ridge_StandardScaler,task4,0.707793
8,Ridge_MinMaxScaler,task4,0.703846
9,Lasso_StandardScaler,task4,0.635302


Интересные результаты. Если для ridge результаты почти не изменились, то для lasso стали значительно хуже

6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [19]:
from sklearn.preprocessing import PolynomialFeatures

In [20]:
poly_std = PolynomialFeatures(2)
poly_minmax = PolynomialFeatures(2)

poly_std.fit(train_df_std)
poly_minmax.fit(train_df_minmax)

X_train_poly_std = poly_std.transform(X_train_std)
X_train_poly_minmax = poly_minmax.transform(X_train_minmax)
X_test_poly_std = poly_std.transform(X_test_std)
X_test_poly_minmax = poly_minmax.transform(X_test_minmax)

lasso_std = Lasso()
lasso_minmax = Lasso()
ridge_std = Ridge()
ridge_minmax = Ridge()

for model in [ridge_std, lasso_std]:
  model.fit(X_train_poly_std, Y_train)

for model in [ridge_minmax, lasso_minmax]:
  model.fit(X_train_poly_minmax, Y_train)

# finally estimate our models

ridge_pred_std = ridge_std.predict(X_test_poly_std)
ridge_pred_minmax = ridge_minmax.predict(X_test_poly_minmax)
lasso_pred_std = lasso_std.predict(X_test_poly_std)
lasso_pred_minmax = lasso_minmax.predict(X_test_poly_minmax)

r2_ridge_standart_scaler_poly_cv = 0
r2_ridge_min_max_scaler_poly_cv = 0
r2_lasso_standart_scaler_poly_cv = 0
r2_lasso_min_max_scaler_poly_cv = 0

r2_ridge_standart_scaler_poly = r2_score(Y_test, ridge_pred_std)
r2_ridge_min_max_scaler_poly = r2_score(Y_test, ridge_pred_minmax)
r2_lasso_standart_scaler_poly = r2_score(Y_test, lasso_pred_std)
r2_lasso_min_max_scaler_poly = r2_score(Y_test, lasso_pred_minmax)

results_regression.loc[15] = ['Ridge_StandardScaler_Poly', 'task6', r2_ridge_standart_scaler_poly]
results_regression.loc[16] = ['Ridge_MinMaxScaler_Poly', 'task6', r2_ridge_min_max_scaler_poly]
results_regression.loc[17] = ['Lasso_StandardScaler_Poly', 'task6', r2_lasso_standart_scaler_poly]
results_regression.loc[18] = ['Lasso_MinMaxScaler_Poly', 'task6', r2_lasso_min_max_scaler_poly]

In [21]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.708127
1,Ridge,task2,0.699761
2,Lasso,task2,0.663423
3,Ridge_GridSearchCV,task3,0.685485
4,RidgeCV,task3,0.701217
5,Lasso_GridSearchCV,task3,0.663423
6,LassoCV,task3,0.674833
7,Ridge_StandardScaler,task4,0.707793
8,Ridge_MinMaxScaler,task4,0.703846
9,Lasso_StandardScaler,task4,0.635302


Метрики очень сильно выросли. За исключением Lasso для min-max scale.

7. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, добавив PolynomialFeatures, посчитайте R2 и сравните с предыдущими результатами.

In [22]:
poly_std = PolynomialFeatures(2)
poly_minmax = PolynomialFeatures(2)

poly_std.fit(train_df_std)
poly_minmax.fit(train_df_minmax)

X_train_poly_std = poly_std.transform(X_train_std)
X_train_poly_minmax = poly_minmax.transform(X_train_minmax)
X_test_poly_std = poly_std.transform(X_test_std)
X_test_poly_minmax = poly_minmax.transform(X_test_minmax)
train_df_poly_std = poly_std.transform(train_df_std)
train_df_poly_minmax = poly_std.transform(train_df_minmax)

# let's use searchers CV to find optimal params
ridge_grid_alpha_poly_std = RidgeCV(alphas=grid).fit(train_df_poly_std, target_variable).score(train_df_poly_std, target_variable)
ridge_grid_alpha_poly_minmax = RidgeCV(alphas=grid).fit(train_df_poly_minmax, target_variable).score(train_df_poly_minmax, target_variable)

lasso_grid_alpha_poly_std = LassoCV(alphas=grid).fit(train_df_poly_std, target_variable).score(train_df_poly_std, target_variable)
lasso_grid_alpha_poly_minmax = LassoCV(alphas=grid).fit(train_df_poly_minmax, target_variable).score(train_df_poly_minmax, target_variable)

lasso_std = Lasso(alpha=lasso_grid_alpha_poly_std)
lasso_minmax = Lasso(alpha=lasso_grid_alpha_poly_minmax)
ridge_std = Ridge(alpha=ridge_grid_alpha_poly_std)
ridge_minmax = Ridge(alpha=ridge_grid_alpha_poly_minmax)

for model in [ridge_std, lasso_std]:
  model.fit(X_train_poly_std, Y_train)

for model in [ridge_minmax, lasso_minmax]:
  model.fit(X_train_poly_minmax, Y_train)

# finally estimate our models

ridge_pred_std = ridge_std.predict(X_test_poly_std)
ridge_pred_minmax = ridge_minmax.predict(X_test_poly_minmax)
lasso_pred_std = lasso_std.predict(X_test_poly_std)
lasso_pred_minmax = lasso_minmax.predict(X_test_poly_minmax)

r2_ridge_standart_scaler_poly_cv = r2_score(Y_test, ridge_pred_std)
r2_ridge_min_max_scaler_poly_cv = r2_score(Y_test, ridge_pred_minmax)
r2_lasso_standart_scaler_poly_cv = r2_score(Y_test, lasso_pred_std)
r2_lasso_min_max_scaler_poly_cv = r2_score(Y_test, lasso_pred_minmax)

results_regression.loc[19] = ['Ridge_StandardScaler_Poly_CV', 'task7', r2_ridge_standart_scaler_poly_cv]
results_regression.loc[20] = ['Ridge_MinMaxScaler_Poly_CV', 'task7', r2_ridge_min_max_scaler_poly_cv]
results_regression.loc[21] = ['Lasso_StandardScaler_Poly_CV', 'task7', r2_lasso_standart_scaler_poly_cv]
results_regression.loc[22] = ['Lasso_MinMaxScaler_Poly_CV', 'task7', r2_lasso_min_max_scaler_poly_cv]

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [23]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.708127
1,Ridge,task2,0.699761
2,Lasso,task2,0.663423
3,Ridge_GridSearchCV,task3,0.685485
4,RidgeCV,task3,0.701217
5,Lasso_GridSearchCV,task3,0.663423
6,LassoCV,task3,0.674833
7,Ridge_StandardScaler,task4,0.707793
8,Ridge_MinMaxScaler,task4,0.703846
9,Lasso_StandardScaler,task4,0.635302


Снова видим, что параметры незначительно изменили качество модели. Зато у Lasso на MinMax результаты получше, но всё равно такую модель лучше не использовать

8. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2.

Из прошлых экспериментов ивдно, что Ridge явно лучше, чем Lasso на нших данных. Чтобы не делать линшнюю работу - будем рассматривать только эту модель и подбирать признаки для неё. Эта модель использует l2-регуляризацию. Таким образом нам просто осталось проверить степень полинома PolyTransform с которым достигается лучшая метрика r2

In [24]:
best_params = {}

In [25]:
def poly_transform(n, raw_data_std, raw_data_minmax):
  poly_std = PolynomialFeatures(n)
  poly_minmax = PolynomialFeatures(n)
  poly_std.fit(train_df_std)
  poly_minmax.fit(train_df_minmax)
  return poly_std.transform(raw_data_std), poly_minmax.transform(raw_data_minmax)

degrees = np.arange(2, 5)
r2_res_std = []
r2_res_minmax = []
alphas_std = []
alphas_minmax = []

for i, deg in enumerate(degrees):
  X_train_poly_std, X_train_poly_minmax = poly_transform(i, X_train_std, X_train_minmax)
  X_test_poly_std, X_test_poly_minmax = poly_transform(i, X_test_std, X_test_minmax)
  train_df_poly_std, train_df_poly_minmax = poly_transform(i, train_df_std, train_df_minmax)

  ridge_grid_alpha_poly_std = RidgeCV(alphas=grid).fit(train_df_poly_std, target_variable).score(train_df_poly_std, target_variable)
  ridge_grid_alpha_poly_minmax = RidgeCV(alphas=grid).fit(train_df_poly_minmax, target_variable).score(train_df_poly_minmax, target_variable)

  ridge_std = Ridge(alpha=ridge_grid_alpha_poly_std)
  ridge_minmax = Ridge(alpha=ridge_grid_alpha_poly_minmax)
  alphas_std.append(ridge_grid_alpha_poly_std)
  alphas_minmax.append(ridge_grid_alpha_poly_minmax)

  for model in [ridge_std, lasso_std]:
    model.fit(X_train_poly_std, Y_train)

  for model in [ridge_minmax, lasso_minmax]:
    model.fit(X_train_poly_minmax, Y_train)


  ridge_pred_std = ridge_std.predict(X_test_poly_std)
  ridge_pred_minmax = ridge_minmax.predict(X_test_poly_minmax)

  r2_res_std.append(r2_score(Y_test, ridge_pred_std))
  r2_res_minmax.append(r2_score(Y_test, ridge_pred_minmax))

r2_res_std = np.array(r2_res_std) 
r2_res_minmax = np.array(r2_res_minmax)

idx_std = r2_res_std.argmax()
idx_minmax = r2_res_minmax.argmax()

best_params['scaler'] = 'MinMaxScaler'
best_params['TestR2'] = r2_res_minmax[idx_minmax]
true_idx = idx_minmax
best_params['alpha'] = alphas_minmax[true_idx]
if r2_res_std[idx_std] > r2_res_minmax[idx_minmax]:
  best_params['scaler'] = 'StandardScaler'
  best_params['TestR2'] = r2_res_std[idx_std]
  true_idx = idx_std
  best_params['alpha'] = alphas_std[true_idx]

best_params['PolyDegree'] = degrees[true_idx]
print('Параметры лучшей модели:\n', best_params)
r2_best_model = best_params['TestR2']
results_regression.loc[23] = ['Best_Model', 'task8', r2_best_model]

Параметры лучшей модели:
 {'scaler': 'MinMaxScaler', 'TestR2': 0.8343362672417425, 'alpha': 0.9224825946440818, 'PolyDegree': 4}


In [26]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.708127
1,Ridge,task2,0.699761
2,Lasso,task2,0.663423
3,Ridge_GridSearchCV,task3,0.685485
4,RidgeCV,task3,0.701217
5,Lasso_GridSearchCV,task3,0.663423
6,LassoCV,task3,0.674833
7,Ridge_StandardScaler,task4,0.707793
8,Ridge_MinMaxScaler,task4,0.703846
9,Lasso_StandardScaler,task4,0.635302


http://archive.ics.uci.edu/ml/datasets/Adult

In [27]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


9. Разделите выборку на признаки и целевую переменную(колонка class). Замените целевую переменную на числовые значения ('<=50K' - 1, '>50K' - 0).

In [28]:
target = data['class'].replace({'<=50K': 1, '>50K': 0})
df = data.drop(columns='class')

10. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [29]:
from sklearn.metrics import f1_score, accuracy_score

In [30]:
idx = target.value_counts().argmax()

In [31]:
label = target.value_counts().keys()[idx]

In [32]:
prediction = label*np.ones(df.shape[0])

f1_most_frequent = f1_score(target, prediction)
acc_most_frequent = accuracy_score(target, prediction)
results_classification.loc[0] = ['Most Frequent class', 'task10', f1_most_frequent, acc_most_frequent]

In [33]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718


11. Выясните, присутствуют ли в данных пропуски. Если присутствуют, заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [34]:
df.isna().mean()

age               0.0
workclass         0.0
fnlwgt            0.0
education         0.0
education-num     0.0
marital-status    0.0
occupation        0.0
relationship      0.0
race              0.0
sex               0.0
capital-gain      0.0
capital-loss      0.0
hours-per-week    0.0
native-country    0.0
dtype: float64

Пропусков нет


12. Выберите колонки с числовыми и категориальными переменными (используя возможности pandas).

In [35]:
numeric_cols= list(df.select_dtypes(include='number').columns)
object_cols = list(df.select_dtypes(include='object').columns)

13. Создайте пайплайн по обработке числовых и категориальных значений колонок (используйте OneHotEncoder,MinMaxScaler) и посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [36]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC 
from sklearn.neighbors import KNeighborsClassifier

Применяем min-max scale

In [38]:
tmp_df = df.copy()

In [39]:
scaler = MinMaxScaler()
scaler.fit(tmp_df[numeric_cols])
tmp_df[numeric_cols] = scaler.transform(tmp_df[numeric_cols])

Потом one-hot encoder

In [40]:
enc = OneHotEncoder(handle_unknown='ignore')

In [41]:
enc.fit(tmp_df[object_cols])
res = enc.transform(tmp_df[object_cols]).toarray()

In [42]:
columns = list(enc.get_feature_names_out())

Добавляем колонки и дропаем прошлые

In [43]:
tmp_df = tmp_df.drop(columns=object_cols)

In [44]:
tmp_df = pd.concat([tmp_df, pd.DataFrame(data=res, columns=columns)], axis=1)

Уберем ворнинги

In [45]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [46]:
f1_LR = cross_val_score(LogisticRegression(), tmp_df, target, scoring='f1', cv=2).mean()
acc_LR = cross_val_score(LogisticRegression(), tmp_df, target, scoring='accuracy', cv=2).mean()
f1_KNN = cross_val_score(KNeighborsClassifier(), tmp_df, target, scoring='f1', cv=2).mean()
acc_KNN = cross_val_score(KNeighborsClassifier(), tmp_df, target, scoring='accuracy', cv=2).mean()
f1_SVM = cross_val_score(LinearSVC(), tmp_df, target, scoring='f1', cv=2).mean()
acc_SVM = cross_val_score(LinearSVC(), tmp_df, target, scoring='f1', cv=2).mean()
results_classification.loc[1] = ['LogisticRegression', 'task13', f1_LR, acc_LR]
results_classification.loc[2] = ['KNeighborsClassifier', 'task13', f1_KNN, acc_KNN]
results_classification.loc[3] = ['LinearSVC', 'task13', f1_SVM, acc_SVM]

In [47]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.90445,0.85017
2,KNeighborsClassifier,task13,0.887447,0.825335
3,LinearSVC,task13,0.905537,0.905537


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями, (испольуйте SimpleImputer). Посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [69]:
tmp_df = df.copy()

Теперь по колонкам заполним значения "?"

In [70]:
from sklearn.impute import SimpleImputer

In [71]:
imp = SimpleImputer(missing_values='?', strategy='most_frequent')
imp.fit(tmp_df.values)

In [72]:
res = imp.transform(tmp_df.values)

In [73]:
tmp_df = pd.DataFrame(res, columns=tmp_df.columns)

In [74]:
tmp_df_most_freq = tmp_df.copy()

Теперь можно использовать MinMaxScale и OneHotEncoding

In [75]:
scaler = MinMaxScaler()
scaler.fit(tmp_df[numeric_cols])
tmp_df[numeric_cols] = scaler.transform(tmp_df[numeric_cols])

In [76]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(tmp_df[object_cols])
res = enc.transform(tmp_df[object_cols]).toarray()
columns = list(enc.get_feature_names_out())
tmp_df = tmp_df.drop(columns=object_cols)
tmp_df = pd.concat([tmp_df, pd.DataFrame(data=res, columns=columns)], axis=1)

In [77]:
tmp_df_scaled = tmp_df.copy()

In [56]:
f1_LR = cross_val_score(LogisticRegression(), tmp_df, target, scoring='f1', cv=2).mean()
acc_LR = cross_val_score(LogisticRegression(), tmp_df, target, scoring='accuracy', cv=2).mean()
f1_KNN = cross_val_score(KNeighborsClassifier(), tmp_df, target, scoring='f1', cv=2).mean()
acc_KNN = cross_val_score(KNeighborsClassifier(), tmp_df, target, scoring='accuracy', cv=2).mean()
f1_SVM = cross_val_score(LinearSVC(), tmp_df, target, scoring='f1', cv=2).mean()
results_classification.loc[4] = ['LogisticRegression_impute', 'task14', f1_LR, acc_LR]
results_classification.loc[5] = ['KNeighborsClassifier_impute', 'task14', f1_KNN, acc_KNN]
results_classification.loc[6] = ['LinearSVC_impute', 'task14', f1_SVM, acc_SVM]

In [57]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.90445,0.85017
2,KNeighborsClassifier,task13,0.887447,0.825335
3,LinearSVC,task13,0.905537,0.905537
4,LogisticRegression_impute,task14,0.904273,0.849781
5,KNeighborsClassifier_impute,task14,0.887144,0.824905
6,LinearSVC_impute,task14,0.905184,0.905537


15. Посчитайте cross_val_score по тем же алгоритмам и метрикам, если просто удалить значения '?'.

Здесь проще начать с самого начала и убрать все строчки, содержащие ?

In [58]:
row_idxs_to_remove = []
for col in object_cols:
  for val in data.index[data[col] == '?'].tolist():
    row_idxs_to_remove.append(val)

Убираем повторения

In [59]:
row_idxs_to_remove = list(set(row_idxs_to_remove))

Новый датафрейм

In [60]:
new_data = data.drop(index=row_idxs_to_remove)

In [61]:
target = new_data['class'].replace({'<=50K': 1, '>50K': 0})
tmp_df = new_data.drop(columns='class')

Теперь добавим minmaxscaler и one-hot-encoder

In [62]:
scaler = MinMaxScaler()
scaler.fit(tmp_df[numeric_cols])
tmp_df[numeric_cols] = scaler.transform(tmp_df[numeric_cols])

In [63]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(tmp_df[object_cols])
res = enc.transform(tmp_df[object_cols]).toarray()

columns = list(enc.get_feature_names_out())
tmp_df = tmp_df.drop(columns=object_cols)
tmp_df = pd.concat([tmp_df.reset_index(drop=True), pd.DataFrame(data=res, columns=columns)], axis=1, ignore_index=True)

In [64]:
f1_LR_del_missings = cross_val_score(LogisticRegression(), tmp_df, target, scoring='f1', cv=2).mean()
acc_LR_del_missings = cross_val_score(LogisticRegression(), tmp_df, target, scoring='accuracy', cv=2).mean()
f1_KNN_del_missings = cross_val_score(KNeighborsClassifier(), tmp_df, target, scoring='f1', cv=2).mean()
acc_KNN_del_missings = cross_val_score(KNeighborsClassifier(), tmp_df, target, scoring='accuracy', cv=2).mean()
f1_SVM_del_missings = cross_val_score(LinearSVC(), tmp_df, target, scoring='f1', cv=2).mean()
acc_SVM_del_missings = cross_val_score(LinearSVC(), tmp_df, target, scoring='accuracy', cv=2).mean()
results_classification.loc[7] = ['LogisticRegression_delete_missings', 'task15', f1_LR_del_missings, acc_LR_del_missings]
results_classification.loc[8] = ['KNeighborsClassifier_delete_missings', 'task15', f1_KNN_del_missings, acc_KNN_del_missings]
results_classification.loc[9] = ['LinearSVC_delete_missings', 'task15', f1_SVM_del_missings, acc_SVM_del_missings]

 16. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier на данных с замененными значениями '?' на самые частые значения.

In [65]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [67]:
target = data['class'].replace({'<=50K': 1, '>50K': 0})

In [79]:
f1_RF = cross_val_score(RandomForestClassifier(), tmp_df_scaled, target, scoring='f1', cv=2).mean()
acc_RF = cross_val_score(RandomForestClassifier(), tmp_df_scaled, target, scoring='accuracy', cv=2).mean()
f1_GB = cross_val_score(GradientBoostingClassifier(), tmp_df_scaled, target, scoring='f1', cv=2).mean()
acc_GB = cross_val_score(GradientBoostingClassifier(), tmp_df_scaled, target, scoring='accuracy', cv=2).mean()
results_classification.loc[10] = ['RandomForestClassifier', 'task16', f1_RF, acc_RF]
results_classification.loc[11] = ['GradientBoostingClassifier', 'task16', f1_GB, acc_GB]

In [80]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.90445,0.85017
2,KNeighborsClassifier,task13,0.887447,0.825335
3,LinearSVC,task13,0.905537,0.905537
4,LogisticRegression_impute,task14,0.904273,0.849781
5,KNeighborsClassifier_impute,task14,0.887144,0.824905
6,LinearSVC_impute,task14,0.905184,0.905537
7,LogisticRegression_delete_missings,task15,0.900525,0.845783
8,KNeighborsClassifier_delete_missings,task15,0.883032,0.820596
9,LinearSVC_delete_missings,task15,0.901964,0.847795


17. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

Опыт прошлых экспериментов показал, что по метрика accuracy и f1_score лучше всего справляется модель LinearSVC() на данных с most-frequent imputing

Попробуем применить StandartScale помимо min-max-a

In [89]:
results_classification[results_classification.model == 'LinearSVC_impute'].iloc[:, 2:4]

Unnamed: 0,f1,accuracy
6,0.905184,0.905537


Помимо most-frequent imputer-а можно было использовать EndTailImputer чтобы подчеркнуть отсутствие данных по этим признакам

In [87]:
tmp_df = tmp_df_most_freq.copy()

In [90]:
scaler = StandardScaler()
scaler.fit(tmp_df[numeric_cols])
tmp_df[numeric_cols] = scaler.transform(tmp_df[numeric_cols])

In [91]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(tmp_df[object_cols])
res = enc.transform(tmp_df[object_cols]).toarray()
columns = list(enc.get_feature_names_out())
tmp_df = tmp_df.drop(columns=object_cols)
tmp_df = pd.concat([tmp_df, pd.DataFrame(data=res, columns=columns)], axis=1)

In [92]:
best_params = {}

In [99]:
f1_SVM = cross_val_score(LinearSVC(), tmp_df, target, scoring='f1', cv=2).mean()
acc_SVM = cross_val_score(LinearSVC(), tmp_df, target, scoring='accuracy', cv=2).mean()
f1_best = 0
acc_best = 0

best_params['Imputer'] = 'MostFrequent'
best_params['Scale'] = 'MinMaxScaler'
mean_best_svm = results_classification[results_classification.model == 'LinearSVC_impute'].iloc[:, 2:4].to_numpy().mean()
if (f1_SVM + acc_SVM)/2 > (mean_best_svm):
  best_params['Scale'] = 'StandardScaler'
  f1_best = f1_SVM 
  acc_best = acc_SVM
else:
  f1_best, acc_best = list(results_classification[results_classification.model == 'LinearSVC_impute'].iloc[:, 2:4].to_numpy()[0])

print('Параметры лучшей модели:\n', best_params)
results_classification.loc[12] = ['Best_Model', 'task17', f1_best, acc_best]

Параметры лучшей модели:
 {'Imputer': 'MostFrequent', 'Scale': 'MinMaxScaler'}


In [101]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.90445,0.85017
2,KNeighborsClassifier,task13,0.887447,0.825335
3,LinearSVC,task13,0.905537,0.905537
4,LogisticRegression_impute,task14,0.904273,0.849781
5,KNeighborsClassifier_impute,task14,0.887144,0.824905
6,LinearSVC_impute,task14,0.905184,0.905537
7,LogisticRegression_delete_missings,task15,0.900525,0.845783
8,KNeighborsClassifier_delete_missings,task15,0.883032,0.820596
9,LinearSVC_delete_missings,task15,0.901964,0.847795
