In [1]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

In [None]:
RANDOM_STATE = 42

In [None]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target

### Задание
#### 1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    X
    ,y
    ,test_size = 0.2
    ,random_state = RANDOM_STATE
)

### Задание
#### 2. Обучите стандартную регрессию, а также Ridge и  Lasso и параметрами по умолчанию и выведите их R2 на тестовой выборке

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

In [None]:
models = {
    'Linear regression': LinearRegression().fit(x_train, y_train),
    'Ridge regression': Ridge().fit(x_train, y_train),
    'Lasso regression': Lasso().fit(x_train, y_train)
}
print(f'Model\t\t'+'| R2 SCORE\n'+'-----'*10)
for key in models.keys():
    model_predict = models[key].predict(x_test)
    r2 = r2_score(model_predict, y_test)
    print(f'{key} | {r2}\n'+'-----'*10)

### Задание
#### 3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

In [None]:
from sklearn.linear_model import RidgeCV, LassoCV

In [None]:
coeffs = [10**i for i in range(-5,6)]
models = {
    'Ridge regression': RidgeCV(alphas = coeffs).fit(x_train, y_train),
    'Lasso regression': LassoCV(alphas = coeffs).fit(x_train, y_train)
}
print(f'Model\t\t'+'| Estimated regularization parameter | R2 SCORE\n'+'-----'*16)
for key in models.keys():
    print(f'{key} |\t\t{models[key].alpha_}\t\t| {models[key].score(x_test, y_test)}\n'+'-----'*15)

### Ответ
После подбора и применения оптимального коэффициента регуляризации в моделях для тестовой выборки увеличился коэффициент детерминации (R2)

### Задание
#### 4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
models = {
    'Ridge regression': Ridge(),
    'Lasso regression': Lasso()
}
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}

print('Model\t\t\t|\tScaler\t\t|\tR2 score\n'+'-----'*15)
for model_name in models.keys():
    for scaler_name in scalers.keys():
        r2 = Pipeline([('Scaler',scalers[scaler_name]),('Model',models[model_name])]).fit(x_train, y_train).score(x_test, y_test)
        print(f'{model_name}\t|\t{scaler_name}\t|\t{r2}\n'+'-----'*15)

## Ответ
Модель      | Метод масштабирования | Коэффициент детерминации (R2)
:-------- |:-----:| -------:
Ridge  | StandardScaler  | Уменьшается
Ridge     | MinMaxScaler    | Увеличивается
Lasso      | StandardScaler     | Уменьшается
Lasso  | MinMaxScaler  | Уменьшается

### Задание
#### 5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [None]:
coeffs = [10**i for i in range(-5,6)]
models = {
    'Ridge regression': RidgeCV(alphas = coeffs),
    'Lasso regression': LassoCV(alphas = coeffs)
}
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}

print('Model\t\t\t|\tScaler\t\t|\tR2 score\n'+'-----'*15)
for model_name in models.keys():
    for scaler_name in scalers.keys():
        r2 = Pipeline([('Scaler',scalers[scaler_name]),('Model',models[model_name])]).fit(x_train, y_train).score(x_test, y_test)
        print(f'{model_name}\t|\t{scaler_name}\t|\t{r2}\n'+'-----'*15)

## Ответ
Модель      | Метод масштабирования | Коэффициент детерминации (R2)
:-------- |:-----:| -------:
Ridge  | StandardScaler  | Уменьшается
Ridge     | MinMaxScaler    | Уменьшается
Lasso      | StandardScaler     | Увеличивается
Lasso  | MinMaxScaler  | Увеличивается


### Задание
#### 6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [None]:
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")

In [None]:
poly_x_train = PolynomialFeatures(2).fit_transform(x_train)
poly_x_test = PolynomialFeatures(2).fit_transform(x_test)

models = {
          'Ridge regression': RidgeCV(alphas=coeffs),
          'Lasso regression': LassoCV(alphas=coeffs)
}
scalers = {
    'StandardScaler' : StandardScaler(),
    'MinMaxScaler' : MinMaxScaler()
}

print('Model\t\t\t|\tScaler\t\t|\tR2 score\n'+'-----'*15)
for model_name in models:
    for scaler_name in scalers:
        r2 = Pipeline([('Scaler', scalers[scaler_name]),('Model', models[model_name])]).fit(poly_x_train, y_train).score(poly_x_test, y_test)
        print(f'{model_name}\t|\t{scaler_name}\t|\t{r2}\n'+'-----'*15)

### Ответ
Модель      | Метод масштабирования | Коэффициент детерминации (R2)
:-------- |:-----:| -------:
Ridge  | StandardScaler  | Увеличивается
Ridge     | MinMaxScaler    | Увеличивается
Lasso      | StandardScaler     | Увеличивается
Lasso  | MinMaxScaler  | Увеличивается

### Задание
#### 7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
def search_best_model(polynom_degree, scaler, model, coeff):
    
    poly_x_train = PolynomialFeatures(polynom_degree).fit_transform(x_train)
    poly_x_test = PolynomialFeatures(polynom_degree).fit_transform(x_test)
    
    pipeline =  Pipeline([('Scaler',scaler),('Model',model)])
    parameters = {'Model__alpha': [coeff]}
    
    grid = GridSearchCV(pipeline, parameters,n_jobs=-1, cv=6)
    r2 = grid.fit(poly_x_train, y_train).score(poly_x_test, y_test)
    return r2

In [None]:
polynom_degrees = [i for i in range(1,5)]

models = {
    'Ridge regression': Ridge(),
    'Lasso regression': Lasso()
}

scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}

coeffs = [10**i for i in range(-5,6)]

res_dict = {'Model':[],'Scaler':[], 'Polynom_deegre':[],'Regularization_coeff':[], 'R2':[] }
for model_name in models:
    for scaler_name in scalers:
        for polynom_degree in polynom_degrees:
            for coeff in coeffs:
                r2 = search_best_model(polynom_degree, scalers[scaler_name], models[model_name], coeff)
                res_dict['Model'].append(model_name)
                res_dict['Scaler'].append(scaler_name)
                res_dict['Polynom_deegre'].append(polynom_degree)
                res_dict['Regularization_coeff'].append(coeff)
                res_dict['R2'].append(r2)

In [None]:
df = pd.DataFrame(res_dict)
df.sample(15)

In [None]:
res_df = df.sort_values('R2', ascending=False).drop_duplicates(['Model'])
res_df.head()

http://archive.ics.uci.edu/ml/datasets/Adult

In [2]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Задание
#### 8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [4]:
x, y = data.loc[:,:13], data[14]

In [32]:
y.value_counts()

<=50K    37155
>50K     11687
Name: 14, dtype: int64

### Задание
#### 9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       48842 non-null  int64 
 1   1       48842 non-null  object
 2   2       48842 non-null  int64 
 3   3       48842 non-null  object
 4   4       48842 non-null  int64 
 5   5       48842 non-null  object
 6   6       48842 non-null  object
 7   7       48842 non-null  object
 8   8       48842 non-null  object
 9   9       48842 non-null  object
 10  10      48842 non-null  int64 
 11  11      48842 non-null  int64 
 12  12      48842 non-null  int64 
 13  13      48842 non-null  object
 14  14      48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.9+ MB


### Ответ
#### В данных отсутствуют пропуски. 

### Задание
#### 10. Выберите колонки с числовыми и категориальными переменными.

In [7]:
number_columns = data.select_dtypes(include='number')
number_columns[:2]

Unnamed: 0,0,2,4,10,11,12
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13


In [8]:
categorical_columns = data.select_dtypes(include = 'object')
categorical_columns[:2]

Unnamed: 0,1,3,5,6,7,8,9,13,14
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K


### Задание
#### 11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [9]:
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler

In [50]:
def pipeline(df):
    numeric_matrix = MinMaxScaler().fit_transform(
        df.select_dtypes(include = 'number')
    )    
    
    categorical_matrix = OneHotEncoder().fit_transform(
        df.select_dtypes(include = 'object')
    ).toarray()
    
    res_matrix = pd.DataFrame(np.concatenate((numeric_matrix, categorical_matrix),axis=1))
    return res_matrix

In [11]:
df = pd.DataFrame(pipeline(data))

In [12]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,100,101,102,103,104,105,106,107,108,109
0,0.30137,0.044131,0.8,0.02174,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.452055,0.048052,0.8,0.0,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.287671,0.137581,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.493151,0.150486,0.4,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.150685,0.220635,0.8,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

### Задание
### 13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [44]:
import warnings
warnings.filterwarnings("ignore")

In [45]:
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(gamma='auto'),
    'LinearSVC': LinearSVC()
}

x, y = data.loc[:,:13], data[14]
pipl_x = pipeline(x)
y2 = ['<=50K']*int((len(pipl_x)/2))

def f1_scorer(model,x,y):
    return f1_score(y, y2, average='binary',pos_label ='<=50K')


for model_key in models.keys():
    cross_val_accuracy_score = cross_val_score(models[model_key],pipl_x, y, scoring= 'accuracy', cv=2)
    print(f'Model:{model_key}, Cross val accuracy score:{cross_val_accuracy_score}')
    cross_val_f1_score =  cross_val_score(models[model_key], pipl_x, y, scoring=f1_scorer, cv=2)
    print(f'Model: {model_key}, Cross val f1 score:{cross_val_f1_score}')


Model:LogisticRegression, Cross val accuracy score:[0.84963761 0.85070226]
Model: LogisticRegression, Cross val f1 score:[0.86411312 0.8640867 ]
Model:SVC, Cross val accuracy score:[0.83624749 0.83518283]
Model: SVC, Cross val f1 score:[0.86411312 0.8640867 ]
Model:LinearSVC, Cross val accuracy score:[0.85143933 0.85188977]
Model: LinearSVC, Cross val f1 score:[0.86411312 0.8640867 ]


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [13]:
from sklearn.impute import SimpleImputer

In [52]:
imp = SimpleImputer(missing_values='?', strategy='most_frequent')
filled_data = imp.fit_transform(data)

15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [54]:
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(gamma='auto'),
    'LinearSVC': LinearSVC()
}

x, y = filled_data[filled_data.columns[:-1]], filled_data[14]
pipl_x = pipeline(x)
y2 = ['<=50K']*int((len(pipl_x)/2))

def f1_scorer(model,x,y):
    return f1_score(y, y2, average='binary',pos_label ='<=50K')


for model_key in models.keys():
    cross_val_accuracy_score = cross_val_score(models[model_key],pipl_x, y, scoring= 'accuracy', cv=2)
    print(f'Model:{model_key}, Cross val accuracy score:{cross_val_accuracy_score}')
    cross_val_f1_score =  cross_val_score(models[model_key], pipl_x, y, scoring=f1_scorer, cv=2)
    print(f'Model: {model_key}, Cross val f1 score:{cross_val_f1_score}')

Model:LogisticRegression, Cross val accuracy score:[0.84963761 0.85070226]
Model: LogisticRegression, Cross val f1 score:[0.86411312 0.8640867 ]
Model:SVC, Cross val accuracy score:[0.83624749 0.83518283]
Model: SVC, Cross val f1 score:[0.86411312 0.8640867 ]
Model:LinearSVC, Cross val accuracy score:[0.85143933 0.85188977]
Model: LinearSVC, Cross val f1 score:[0.86411312 0.8640867 ]


16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [55]:
rows_with_questions = (data == '?').any(axis=1)
free_data = data.drop(rows_with_questions[rows_with_questions].index, axis=0)

x, y = free_data[free_data.columns[:-1]], free_data[14]
pipl_x = pipeline(x)

y2 = ['<=50K']*int((len(pipl_x)/2))

def f1_scorer(model,x,y):
    return f1_score(y, y2, average='binary',pos_label ='<=50K')

for model_key in models.keys():
    cross_val_accuracy_score = cross_val_score(models[model_key],pipl_x, y, scoring= 'accuracy', cv=2)
    print(f'Model:{model_key}, Cross val accuracy score:{cross_val_accuracy_score}')
    cross_val_f1_score =  cross_val_score(models[model_key], pipl_x, y, scoring=f1_scorer, cv=2)
    print(f'Model: {model_key}, Cross val f1 score:{cross_val_f1_score}')

Model:LogisticRegression, Cross val accuracy score:[0.8457388 0.8459157]
Model: LogisticRegression, Cross val f1 score:[0.85854914 0.85854914]
Model:SVC, Cross val accuracy score:[0.83109991 0.83003848]
Model: SVC, Cross val f1 score:[0.85854914 0.85854914]
Model:LinearSVC, Cross val accuracy score:[0.84839238 0.84728672]
Model: LinearSVC, Cross val f1 score:[0.85854914 0.85854914]


 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

In [57]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

RANDOM_STATE = 42
models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE)
}


def f1_scorer(model,x,y):
    return f1_score(y, y2, average='binary',pos_label ='<=50K')

for model_key in models.keys():
    cross_val_accuracy_score = cross_val_score(models[model_key],pipl_x, y, scoring= 'accuracy', cv=2)
    print(f'Model:{model_key}, Cross val accuracy score:{cross_val_accuracy_score}')
    cross_val_f1_score =  cross_val_score(models[model_key], pipl_x, y, scoring=f1_scorer, cv=2)
    print(f'Model: {model_key}, Cross val f1 score:{cross_val_f1_score}')


# Результаты сравнимы. Вывод - применение ансамблевых моделей без
# настройки гиперпараметров не дает значительного прироста качества
# классификации на данном датасете

Model:RandomForestClassifier, Cross val accuracy score:[0.84746362 0.84883464]
Model: RandomForestClassifier, Cross val f1 score:[0.85854914 0.85854914]
Model:GradientBoostingClassifier, Cross val accuracy score:[0.86086418 0.86519835]
Model: GradientBoostingClassifier, Cross val f1 score:[0.85854914 0.85854914]


18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [58]:
RANDOM_STATE = 42
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(gamma='auto'),
    'LinearSVC': LinearSVC(),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

def f1_scorer(model,x,y):
    return f1_score(y, y2, average='binary',pos_label ='<=50K')

pipl_x = pipeline(x)
y2 = ['<=50K']*int((len(pipl_x)/2))

datasets = {
    'x':x,
    'pipelened' :pipeline(x)
}

for dataset_key in datasets:
    for model_key in models:
        print()
        c_val_accuracy_score = cross_val_score(models[model_key],
                                               datasets[dataset_key],
                                               y,
                                               scoring='accuracy',
                                               cv = 2)
        c_val_f1_score = cross_val_score(models[model_key],
                                         datasets[dataset_key],
                                         y,
                                         scoring = f1_scorer,
                                         cv = 2)
        print(f'Dataset:{dataset_key}, Model: {model_key}, Accuracy score: {c_val_accuracy_score}, F1 score: {c_val_f1_score}\n')



Dataset:x, Model: LogisticRegression, Accuracy score: [nan nan], F1 score: [nan nan]

Dataset:x, Model: SVC, Accuracy score: [nan nan], F1 score: [nan nan]

Dataset:x, Model: LinearSVC, Accuracy score: [nan nan], F1 score: [nan nan]

Dataset:x, Model: RandomForestClassifier, Accuracy score: [nan nan], F1 score: [nan nan]

Dataset:x, Model: GradientBoostingClassifier, Accuracy score: [nan nan], F1 score: [nan nan]

Dataset:pipelened, Model: LogisticRegression, Accuracy score: [0.8457388 0.8459157], F1 score: [0.85854914 0.85854914]



KeyboardInterrupt: 