In [189]:
import warnings

warnings.filterwarnings('ignore')

from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

In [190]:
RANDOM_STATE = 42

In [191]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)

In [192]:
X.columns = dataset.feature_names
y = dataset.target

1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [193]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
print(f'X_train size: {X_train.shape}\nX_test size: {X_test.shape}\n')

X_train size: (404, 13)
X_test size: (102, 13)



2. Обучите стандартную регрессию, а также Ridge и  Lasso и параметрами по умолчанию и выведите их R2 на тестовой выборке

In [194]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

models = [
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso())
]

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    print(f'r2 score for {name}: {score}')

r2 score for LinearRegression: 0.6687594935356307
r2 score for Ridge: 0.666222167016852
r2 score for Lasso: 0.6671453631686304


3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

In [195]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

alphas = [pow(10, i) for i in range(-5, 6)]
CV = 10

models = [
    ('GridSearchCV Ridge', Ridge()),
    ('GridSearchCV Lasso', Lasso())
]

for name, model in models:
    model.fit(X_train, y_train)
    grid_search = GridSearchCV(estimator=model,
                               param_grid=[{'alpha': alphas}],
                               scoring='r2',
                               cv=CV,
                               )
    grid_search.fit(X_train, y_train)
    score = grid_search.score(X_test, y_test)
    print(f"{name}: alpha_opt = {grid_search.best_params_['alpha']}, r2 score = {score}")

models_cv = [('RidgeCV', RidgeCV(alphas=alphas, cv=CV)),
             ('LassoCV', LassoCV(alphas=alphas, cv=CV))]

for name, model in models_cv:
    model.fit(X_train, y_train)
    score = r2_score(y_test, model.predict(X_test))
    print(f'{name}: alpha_opt = {model.alpha_}, r2 score = {score}')

GridSearchCV Ridge: alpha_opt = 1e-05, r2 score = 0.6687594856409733
GridSearchCV Lasso: alpha_opt = 1e-05, r2 score = 0.6687598638315153
RidgeCV: alpha_opt = 1e-05, r2 score = 0.6687594856409733
LassoCV: alpha_opt = 1e-05, r2 score = 0.6687598638315153


Вывод: результат немного улучшился

4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [196]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

scalers = [('MinMax', MinMaxScaler()),
           ('Standard', StandardScaler())]
models = [('Ridge', Ridge()),
          ('Lasso', Lasso())]

for scaler_name, scaler in scalers:
    for model_name, model in models:
        pipe = Pipeline([('scaler', scaler), ('model', model)])
        pipe.fit(X_train, y_train)
        score = pipe.score(X_test, y_test)
        print(f"{scaler_name} + {model_name}:  r2 score = {score}")

MinMax + Ridge:  r2 score = 0.6764100365423598
MinMax + Lasso:  r2 score = 0.2573921442545195
Standard + Ridge:  r2 score = 0.6684624359643558
Standard + Lasso:  r2 score = 0.6239428734251422


Вывод: результат улучшился для MinMax + Ridge. Для остальных комбинации улучшений не видно, даже видно ухудшение для MinMax + Lasso

5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [197]:
scalers = [('MinMax', MinMaxScaler()),
           ('Standard', StandardScaler())]
models = [('Ridge', Ridge()),
          ('Lasso', Lasso())]

for scaler_name, scaler in scalers:
    for model_name, model in models:
        pipe = Pipeline([('scaler', scaler), ('model', model)])
        grid_search = GridSearchCV(pipe, param_grid=[{'model__alpha': alphas}], scoring='r2', cv=CV)
        grid_search.fit(X_train, y_train)
        score = grid_search.score(X_test, y_test)
        print(
            f"{scaler_name} + {model_name}: alpha_opt = {grid_search.best_params_['model__alpha']}, r2 score = {score}")

MinMax + Ridge: alpha_opt = 1, r2 score = 0.6764100365423598
MinMax + Lasso: alpha_opt = 0.01, r2 score = 0.6676993404117642
Standard + Ridge: alpha_opt = 10, r2 score = 0.6659677905050342
Standard + Lasso: alpha_opt = 0.01, r2 score = 0.6681815922762606


Вывод: результат улучшился для MinMax + Ridge. Для остальных комбинации улучшений не видно.

6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [198]:
from sklearn.preprocessing import PolynomialFeatures

scalers = [('MinMax', MinMaxScaler()),
           ('Standard', StandardScaler())]
models = [('Ridge', Ridge()),
          ('Lasso', Lasso())]
polyfeatures = [('feature mults without feature squares', PolynomialFeatures(degree=2, interaction_only=True)),
                ('feature mults + feature squares', PolynomialFeatures(degree=2))]

for scaler_name, scaler in scalers:
    for model_name, model in models:
        for polyf_name, polyfeature in polyfeatures:
            pipe = Pipeline([('scaler', scaler), ('poly', polyfeature), ('model', model)])
            grid_search = GridSearchCV(pipe, param_grid=[{'model__alpha': alphas}], scoring='r2', cv=CV)
            grid_search.fit(X_train, y_train)
            score = grid_search.score(X_test, y_test)
            print(
                f"{scaler_name} + {model_name} + {polyf_name}: alpha_opt = {grid_search.best_params_['model__alpha']}, r2 score = {score}")

MinMax + Ridge + feature mults without feature squares: alpha_opt = 0.01, r2 score = 0.8572252837848734
MinMax + Ridge + feature mults + feature squares: alpha_opt = 0.01, r2 score = 0.833535855503777
MinMax + Lasso + feature mults without feature squares: alpha_opt = 0.0001, r2 score = 0.840169780674503
MinMax + Lasso + feature mults + feature squares: alpha_opt = 0.001, r2 score = 0.8390581680518306
Standard + Ridge + feature mults without feature squares: alpha_opt = 10, r2 score = 0.8496468217328311
Standard + Ridge + feature mults + feature squares: alpha_opt = 10, r2 score = 0.818046587724366
Standard + Lasso + feature mults without feature squares: alpha_opt = 0.01, r2 score = 0.8509794967776149
Standard + Lasso + feature mults + feature squares: alpha_opt = 0.01, r2 score = 0.8138518691835619


Вывод: результат существенно улучшился для всех комбинаций

7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [199]:
param_grid = dict(scaler=[None, MinMaxScaler(), StandardScaler()],
                  poly=[None, PolynomialFeatures(degree=2, interaction_only=True), PolynomialFeatures(degree=2)],
                  model=[Ridge(), Lasso()],
                  model__alpha=alphas)

pipe = Pipeline([('scaler', 'passthrough'), ('poly', 'passthrough'), ('model', 'passthrough')])
grid_search = GridSearchCV(pipe, param_grid, scoring='r2', cv=CV)
grid_search.fit(X_train, y_train)
score = grid_search.score(X_test, y_test)
print(f"opt_scaler: {grid_search.best_params_['scaler']}\n"
      f"opt_poly:{grid_search.best_params_['poly']}\n"
      f"opt_model:{grid_search.best_params_['model']}\n"
      f"opt_score:{score}"
      )

opt_scaler: StandardScaler()
opt_poly:PolynomialFeatures(interaction_only=True)
opt_model:Ridge(alpha=10)
opt_score:0.8496468217328311


http://archive.ics.uci.edu/ml/datasets/Adult

In [200]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [201]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [None]:
df = data.copy()
N = len(df.columns)
X = df.iloc[:, :N - 1]
y = df.iloc[:, -1].apply(lambda x: 1.0 if x == '>50K' else 0.0)

9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [203]:
df.shape == df.dropna().shape

True

Вывод: пропуски отсутствуют

10. Выберите колонки с числовыми и категориальными переменными.

In [204]:
from pandas.api.types import is_string_dtype

max_n_unique = 0.000005 * df.shape[0]
cat, non_cat = [], []
for col in X.columns:
    if df[col].nunique() < max_n_unique or is_string_dtype(df[col]):
        cat.append(col)
    else:
        non_cat.append(col)

print(f'categorical:{cat},\nnon-categorical:{non_cat}')

categorical:[1, 3, 5, 6, 7, 8, 9, 13],
non-categorical:[0, 2, 4, 10, 11, 12]


11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [205]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

t = [('num', MinMaxScaler(), non_cat), ('cat', OneHotEncoder(), cat)]
transformer = ColumnTransformer(transformers=t)
X_train1_sparse = transformer.fit_transform(X.copy())
X1 = pd.DataFrame(X_train1_sparse.toarray())
X1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.30137,0.044131,0.8,0.02174,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.452055,0.048052,0.8,0.0,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.287671,0.137581,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.493151,0.150486,0.4,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.150685,0.220635,0.8,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [206]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.2, random_state=RANDOM_STATE)
most_freq_val = y.value_counts().idxmax()
X1_most_freq_train = X1.loc[y == most_freq_val]
y1_most_freq = y.loc[y == most_freq_val]

log_reg = LogisticRegression()
log_reg.fit(X1_train, y1_train)
y1_pred = log_reg.predict(X1_most_freq_train)
accuracy = accuracy_score(y1_most_freq, y1_pred)
f1_score_w = f1_score(y1_most_freq, y1_pred, average='weighted')
f1_score_b = f1_score(y1_most_freq, y1_pred, average='binary')
f1_score_m = f1_score(y1_most_freq, y1_pred, average='micro')
print(
    f'accuracy: {accuracy}\nf1 score(weighted):{f1_score_w}\nf1 score(binary):{f1_score_b}\nf1 score(micro):{f1_score_m}')

accuracy: 0.9335486475575293
f1 score(weighted):0.9656324383012485
f1 score(binary):0.0
f1 score(micro):0.9335486475575293


13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [118]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC, LinearSVC

models = [LogisticRegression(), SVC(), LinearSVC()]
metrics = ['accuracy', 'f1', 'f1_weighted']

for model in models:
    for metric in metrics:
        score = cross_val_score(model, X1, y, cv=CV, scoring=metric).mean()
        print(f'model: {model}, metric: {metric}, score: {score}')

model: LogisticRegression(), metric: accuracy, score: 0.8512346961272244
model: LogisticRegression(), metric: f1, score: 0.6565786295140361
model: LogisticRegression(), metric: f1_weighted, score: 0.8455935113447877
model: SVC(), metric: accuracy, score: 0.8410180549023947
model: SVC(), metric: f1, score: 0.6225568402859734
model: SVC(), metric: f1_weighted, score: 0.8330788126531405
model: LinearSVC(), metric: accuracy, score: 0.852667876306566
model: LinearSVC(), metric: f1, score: 0.6571268426576358
model: LinearSVC(), metric: f1_weighted, score: 0.8465790827177206


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [207]:
from sklearn.impute import SimpleImputer

inputer = SimpleImputer(missing_values='?', strategy='most_frequent')
inputer = inputer.fit_transform(X.copy())
X_cleared = pd.DataFrame(inputer)
X_cleared.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [208]:
X_cleared_sparse = transformer.fit_transform(X_cleared.copy())
X2 = pd.DataFrame(X_cleared_sparse.toarray())

models = [LogisticRegression(), LinearSVC()]
metrics = ['accuracy', 'f1', 'f1_weighted']

for model in models:
    for metric in metrics:
        score = cross_val_score(model, X2, y, cv=CV, scoring=metric).mean()
        print(f'model: {model}, metric: {metric}, score: {score}')

model: LogisticRegression(), metric: accuracy, score: 0.8506613578312656
model: LogisticRegression(), metric: f1, score: 0.6538921995821159
model: LogisticRegression(), metric: f1_weighted, score: 0.844751954713187
model: LinearSVC(), metric: accuracy, score: 0.8511322874935976
model: LinearSVC(), metric: f1, score: 0.6517974174872275
model: LinearSVC(), metric: f1_weighted, score: 0.8446589797120246


Вывод: результат немного ухудшился

16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [209]:
X_dropped = df.copy().replace('?', np.nan).dropna()
X1_dropped = X_dropped.iloc[:, :N - 1]
y = X_dropped.iloc[:, -1].apply(lambda x: 1.0 if x == '>50K' else 0.0)

X1_dropped_sparse = transformer.fit_transform(X1_dropped)
X2_dropped = pd.DataFrame(X1_dropped_sparse.toarray())

for model in models:
    for metric in metrics:
        score = cross_val_score(model, X2_dropped, y, cv=CV, scoring=metric).mean()
        print(f'model: {model}, metric: {metric}, score: {score}')

model: LogisticRegression(), metric: accuracy, score: 0.8471099407099377
model: LogisticRegression(), metric: f1, score: 0.6604622598607446
model: LogisticRegression(), metric: f1_weighted, score: 0.8416385190841975
model: LinearSVC(), metric: accuracy, score: 0.8483703520157381
model: LinearSVC(), metric: f1, score: 0.6613169761301343
model: LinearSVC(), metric: f1_weighted, score: 0.8425843851728692


Вывод: результат почти не поменялся, немного ухудшился

 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

In [133]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

models = [RandomForestClassifier(), GradientBoostingClassifier()]
for model in models:
    for metric in metrics:
        score = cross_val_score(model, X2_dropped, y, cv=CV, scoring=metric).mean()
        print(f'model: {model}, metric: {metric}, score: {score}')

model: RandomForestClassifier(), metric: accuracy, score: 0.849829785411494
model: RandomForestClassifier(), metric: f1, score: 0.6732926772674385
model: RandomForestClassifier(), metric: f1_weighted, score: 0.8451934096014349
model: GradientBoostingClassifier(), metric: accuracy, score: 0.8630314878898486
model: GradientBoostingClassifier(), metric: f1, score: 0.6880056062412362
model: GradientBoostingClassifier(), metric: f1_weighted, score: 0.8566734586197544


Вывод: по всем метрикам GradientBoostingClassifier превосходит остальные модели

18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [162]:
def clearData(data, opt):
    N = len(data.columns)
    df = data.copy()
    if opt == 'clear':
        inputer = SimpleImputer(missing_values='?', strategy='most_frequent')
        X = df.iloc[:, :N - 1]
        X_to_split = pd.DataFrame(inputer.fit_transform(X))
        y = df.iloc[:, -1].apply(lambda x: 1.0 if x == '>50K' else 0.0)
    else:
        X_dr = df.copy().replace('?', np.nan).dropna()
        X_to_split = X_dr.iloc[:, :N - 1]
        y = X_dr.iloc[:, -1].apply(lambda x: 1.0 if x == '>50K' else 0.0)

    return X_to_split, y


t = [('cat', OneHotEncoder(), cat)]
scalers = [MinMaxScaler(), StandardScaler()]
models = [LogisticRegression(), LinearSVC(), RandomForestClassifier(), GradientBoostingClassifier()]
param_grid = dict(scaler=scalers, model=models)
pipe = Pipeline([('scaler', 'passthrough'), ('model', 'passthrough')])
opts = ['clear', 'drop']
scoring = ['accuracy', 'f1']
opt_list = []

for opt in opts:
    X, y = clearData(df, opt)
    X_sparse = ColumnTransformer(transformers=t, remainder="passthrough").fit_transform(X.copy())
    X1 = pd.DataFrame(X_sparse.toarray())
    X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=RANDOM_STATE)

    grid_search = GridSearchCV(pipe, param_grid, scoring='f1_weighted', cv=CV)
    grid_search.fit(X_train, y_train)
    score = grid_search.score(X_test, y_test)

    opt_list.append({'opt': opt, 'score': score, 'param': grid_search.best_params_})
    print(f"opt_scaler: {grid_search.best_params_['scaler']}\n"
          f"opt_model:{grid_search.best_params_['model']}\n"
          f"opt_score:{score}"
          )
opt_params = opt_list[0] if opt_list[0]['score'] > opt_list[1]['score'] else opt_list[1]
print(opt_params)

opt_scaler: MinMaxScaler()
opt_model:GradientBoostingClassifier()
opt_score:0.8574716304043987
opt_scaler: MinMaxScaler()
opt_model:GradientBoostingClassifier()
opt_score:0.8585161877534656
{'opt': 'drop', 'score': 0.8585161877534656, 'param': {'model': GradientBoostingClassifier(), 'scaler': MinMaxScaler()}}


In [228]:
opt_opt = opt_params['opt']
scaler_opt = opt_params['param']['scaler']
model_opt = opt_params['param']['model']

X, y = clearData(df, opt_opt)
X_sparse = ColumnTransformer(transformers=t, remainder="passthrough").fit_transform(X.copy())
X1 = pd.DataFrame(X_sparse.toarray())
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=RANDOM_STATE)
pipe_opt = Pipeline([('scaler', scaler_opt), ('model', model_opt)])
pipe_opt.fit(X_train, y_train)

y_pred = pipe_opt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'accuracy: {accuracy}, f1: {f1}')

accuracy: 0.8650082918739636, f1: 0.6981458590852905
