In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression
from sklearn.metrics import r2_score,precision_score, recall_score, f1_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC, LinearSVC
from sklearn.impute import SimpleImputer

In [2]:
RANDOM_STATE = 42

In [3]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target

### Задание
#### 1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [4]:
x_train, x_test, y_train, y_test = train_test_split(
    X
    ,y
    ,test_size = 0.2
    ,random_state = RANDOM_STATE
)

### Задание
#### 2. Обучите стандартную регрессию, а также Ridge и  Lasso и параметрами по умолчанию и выведите их R2 на тестовой выборке

In [5]:
models = {
    'Linear regression': LinearRegression().fit(x_train, y_train),
    'Ridge regression': Ridge().fit(x_train, y_train),
    'Lasso regression': Lasso().fit(x_train, y_train)
}
print(f'Model\t\t'+'| R2 SCORE\n'+'-----'*10)
for key in models.keys():
    model_predict = models[key].predict(x_test)
    r2 = r2_score(model_predict, y_test)
    print(f'{key} | {r2}\n'+'-----'*10)

Model		| R2 SCORE
--------------------------------------------------
Linear regression | 0.6333247469014311
--------------------------------------------------
Ridge regression | 0.6316692350060941
--------------------------------------------------
Lasso regression | 0.5579020708855842
--------------------------------------------------


In [9]:
models = {
    'Linear regression': LinearRegression().fit(x_train, y_train),
    'Ridge regression': Ridge().fit(x_train, y_train),
    'Lasso regression': Lasso().fit(x_train, y_train)
}
data_list = []
for key in models.keys():
    model_predict = models[key].predict(x_test)
    r2 = r2_score(model_predict, y_test)
    res_dict = {
        'Model': key,
        'R2 score': r2
    }
    data_list.append(res_dict)
res_df = pd.DataFrame(data_list)

In [10]:
res_df

Unnamed: 0,Model,R2 score
0,Linear regression,0.633325
1,Ridge regression,0.631669
2,Lasso regression,0.557902


### Задание
#### 3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

In [12]:
coeffs = [10**i for i in range(-5,6)]
models = {
    'Ridge regression': RidgeCV(alphas = coeffs).fit(x_train, y_train),
    'Lasso regression': LassoCV(alphas = coeffs).fit(x_train, y_train)
}
data_list = []
for key in models.keys():
    res_dict = {
        'Model': key,
        'Estimated reqularization parameter': models[key].alpha_,
        'R2 score': models[key].score(x_test, y_test)
    }
    data_list.append(res_dict)
res_df = pd.DataFrame(data_list)

In [13]:
res_df.head()

Unnamed: 0,Model,Estimated reqularization parameter,R2 score
0,Ridge regression,0.01,0.668751
1,Lasso regression,1e-05,0.66876


### Ответ
После подбора и применения оптимального коэффициента регуляризации в моделях для тестовой выборки увеличился коэффициент детерминации (R2)

### Задание
#### 4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [14]:
models = {
    'Ridge regression': Ridge(),
    'Lasso regression': Lasso()
}
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}

data_list = []
for model_name in models.keys():
    for scaler_name in scalers.keys():
        r2 = Pipeline([('Scaler',scalers[scaler_name]),('Model',models[model_name])]).fit(x_train, y_train).score(x_test, y_test)
        res_dict = {
            'Model': model_name,
            'Scaler': scaler_name,
            'R2 score': r2
        }
        data_list.append(res_dict)
res_df = pd.DataFrame(data_list)

In [15]:
res_df

Unnamed: 0,Model,Scaler,R2 score
0,Ridge regression,StandardScaler,0.668462
1,Ridge regression,MinMaxScaler,0.67641
2,Lasso regression,StandardScaler,0.623943
3,Lasso regression,MinMaxScaler,0.257392


## Ответ
Модель      | Метод масштабирования | Коэффициент детерминации (R2)
:-------- |:-----:| -------:
Ridge  | StandardScaler  | Уменьшается
Ridge     | MinMaxScaler    | Увеличивается
Lasso      | StandardScaler     | Уменьшается
Lasso  | MinMaxScaler  | Уменьшается

### Задание
#### 5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [16]:
coeffs = [10**i for i in range(-5,6)]
models = {
    'Ridge regression': RidgeCV(alphas = coeffs),
    'Lasso regression': LassoCV(alphas = coeffs)
}
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}
data_list = []

for model_name in models.keys():
    for scaler_name in scalers.keys():
        r2 = Pipeline([('Scaler',scalers[scaler_name]),('Model',models[model_name])]).fit(x_train, y_train).score(x_test, y_test)
        res_dict = {
            'Model': model_name,
            'Scaler': scaler_name,
            'R2 score': r2
        }
        data_list.append(res_dict)
res_df = pd.DataFrame(data_list)

In [17]:
res_df

Unnamed: 0,Model,Scaler,R2 score
0,Ridge regression,StandardScaler,0.665968
1,Ridge regression,MinMaxScaler,0.670031
2,Lasso regression,StandardScaler,0.668759
3,Lasso regression,MinMaxScaler,0.668761


## Ответ
Модель      | Метод масштабирования | Коэффициент детерминации (R2)
:-------- |:-----:| -------:
Ridge  | StandardScaler  | Уменьшается
Ridge     | MinMaxScaler    | Уменьшается
Lasso      | StandardScaler     | Увеличивается
Lasso  | MinMaxScaler  | Увеличивается


### Задание
#### 6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [18]:
poly_x_train = PolynomialFeatures(2).fit_transform(x_train)
poly_x_test = PolynomialFeatures(2).fit_transform(x_test)

models = {
          'Ridge regression': RidgeCV(alphas=coeffs),
          'Lasso regression': LassoCV(alphas=coeffs)
}
scalers = {
    'StandardScaler' : StandardScaler(),
    'MinMaxScaler' : MinMaxScaler()
}


for model_name in models.keys():
    for scaler_name in scalers.keys():
        r2 = Pipeline([('Scaler', scalers[scaler_name]),('Model', models[model_name])]).fit(poly_x_train, y_train).score(poly_x_test, y_test)
        res_dict = {
            'Model': model_name,
            'Scaler': scaler_name,
            'R2 score': r2
        }
        data_list.append(res_dict)
res_df = pd.DataFrame(data_list)

In [19]:
res_df

Unnamed: 0,Model,Scaler,R2 score
0,Ridge regression,StandardScaler,0.665968
1,Ridge regression,MinMaxScaler,0.670031
2,Lasso regression,StandardScaler,0.668759
3,Lasso regression,MinMaxScaler,0.668761
4,Ridge regression,StandardScaler,0.847179
5,Ridge regression,MinMaxScaler,0.845857
6,Lasso regression,StandardScaler,0.83468
7,Lasso regression,MinMaxScaler,0.825979


### Ответ
Модель      | Метод масштабирования | Коэффициент детерминации (R2)
:-------- |:-----:| -------:
Ridge  | StandardScaler  | Увеличивается
Ridge     | MinMaxScaler    | Увеличивается
Lasso      | StandardScaler     | Увеличивается
Lasso  | MinMaxScaler  | Увеличивается

### Задание
#### 7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [20]:
def search_best_model(polynom_degree, scaler, model, coeff):
    
    poly_x_train = PolynomialFeatures(polynom_degree).fit_transform(x_train)
    poly_x_test = PolynomialFeatures(polynom_degree).fit_transform(x_test)
    
    pipeline =  Pipeline([('Scaler',scaler),('Model',model)])
    parameters = {'Model__alpha': [coeff]}
    
    grid = GridSearchCV(pipeline, parameters,n_jobs=-1, cv=6)
    r2 = grid.fit(poly_x_train, y_train).score(poly_x_test, y_test)
    return r2

In [22]:
polynom_degrees = [i for i in range(1,5)]

models = {
    'Ridge regression': Ridge(),
    'Lasso regression': Lasso()
}

scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}

coeffs = [10**i for i in range(-5,6)]

data_list = []
for model_name in models:
    for scaler_name in scalers:
        for polynom_degree in polynom_degrees:
            for coeff in coeffs:
                r2 = search_best_model(polynom_degree, scalers[scaler_name], models[model_name], coeff)
                res_dict = {
                    'Model': model_name,
                    'Scaler': scaler_name,
                    'Polynom deegree': polynom_degree,
                    'Reqularization coeff': coeff,
                    'R2 score': r2
                }
                data_list.append(res_dict)
res_df = pd.DataFrame(data_list).sort_values(by = 'R2 score', ascending = False)

In [23]:
res_df

Unnamed: 0,Model,Scaler,Polynom deegree,Reqularization coeff,R2 score
167,Lasso regression,MinMaxScaler,4,0.00100,0.876281
81,Ridge regression,MinMaxScaler,4,0.10000,0.874223
124,Lasso regression,StandardScaler,4,0.01000,0.873650
82,Ridge regression,MinMaxScaler,4,1.00000,0.868482
166,Lasso regression,MinMaxScaler,4,0.00010,0.867584
...,...,...,...,...,...
140,Lasso regression,MinMaxScaler,1,1000.00000,-0.023341
175,Lasso regression,MinMaxScaler,4,100000.00000,-0.023341
77,Ridge regression,MinMaxScaler,4,0.00001,-0.443837
34,Ridge regression,StandardScaler,4,0.00010,-1.881078


http://archive.ics.uci.edu/ml/datasets/Adult

In [24]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [25]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Задание
#### 8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [26]:
x, y = data.loc[:,:13], data[14]

In [27]:
y.value_counts()

<=50K    37155
>50K     11687
Name: 14, dtype: int64

### Задание
#### 9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       48842 non-null  int64 
 1   1       48842 non-null  object
 2   2       48842 non-null  int64 
 3   3       48842 non-null  object
 4   4       48842 non-null  int64 
 5   5       48842 non-null  object
 6   6       48842 non-null  object
 7   7       48842 non-null  object
 8   8       48842 non-null  object
 9   9       48842 non-null  object
 10  10      48842 non-null  int64 
 11  11      48842 non-null  int64 
 12  12      48842 non-null  int64 
 13  13      48842 non-null  object
 14  14      48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.9+ MB


### Ответ
#### В данных отсутствуют пропуски. 

### Задание
#### 10. Выберите колонки с числовыми и категориальными переменными.

In [29]:
number_columns = data.select_dtypes(include='number')
number_columns[:2]

Unnamed: 0,0,2,4,10,11,12
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13


In [30]:
categorical_columns = data.select_dtypes(include = 'object')
categorical_columns[:2]

Unnamed: 0,1,3,5,6,7,8,9,13,14
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K


### Задание
#### 11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [31]:
def pipeline(df):
    numeric_matrix = MinMaxScaler().fit_transform(
        df.select_dtypes(include = 'number')
    )    
    
    categorical_matrix = OneHotEncoder().fit_transform(
        df.select_dtypes(include = 'object')
    ).toarray()
    
    res_matrix = pd.DataFrame(np.concatenate((numeric_matrix, categorical_matrix),axis=1))
    return res_matrix

In [32]:
df = pipeline(data)

In [33]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,100,101,102,103,104,105,106,107,108,109
0,0.30137,0.044131,0.8,0.02174,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.452055,0.048052,0.8,0.0,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.287671,0.137581,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.493151,0.150486,0.4,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.150685,0.220635,0.8,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Задание
#### 12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [34]:
df = data.copy()
y = df.iloc[:,14].replace(['<=50K','>50K'],[0,1])
acc = accuracy_score(y, np.array([0 for _ in range(len(y))]))
f1 = f1_score(y, np.array([0 for _ in range(len(y))]))
print(f'Accuracy: {acc}, F1 score: {f1}')

Accuracy: 0.7607182343065395, F1 score: 0.0


### Задание
### 13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [35]:
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(gamma='auto'),
    'LinearSVC': LinearSVC()
}

def f1_scorer(model,x,y):
    return f1_score(y, y2, average='binary',pos_label ='<=50K')

metrics = {
    'Accuracy metric':'accuracy',
    'F1 metric' : f1_scorer
}

In [36]:
x, y = df.loc[:,:13], data[14]
pipl_x = pipeline(x)
y2 = ['<=50K']*int((len(pipl_x)/2))

data_list = []
for model_key in models.keys():
    for metric_key in metrics.keys():
        cross_vl_score = cross_val_score(
            models[model_key],
            pipl_x,
            y,
            scoring= metrics[metric_key],
            cv=2
        )
        param_dict = {
            'Model' : model_key,
            'Metric' : metric_key,
            'Metric score' : cross_vl_score[0],
        }
        data_list.append(param_dict)
res_frame = pd.DataFrame(data_list)

In [37]:
res_frame.head(10)

Unnamed: 0,Model,Metric,Metric score
0,LogisticRegression,Accuracy metric,0.849638
1,LogisticRegression,F1 metric,0.864113
2,SVC,Accuracy metric,0.836247
3,SVC,F1 metric,0.864113
4,LinearSVC,Accuracy metric,0.851439
5,LinearSVC,F1 metric,0.864113


### Задание
#### 14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [38]:
df2 = data.copy()
imp = SimpleImputer(missing_values='?', strategy='most_frequent')
filled_data = pd.DataFrame(imp.fit_transform(df2))

### Задание
#### 16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [39]:
df3  = data.copy()
rows_with_questions = (df3 == '?').any(axis=1)
free_data = df3.drop(rows_with_questions[rows_with_questions].index, axis=0)

x, y = free_data[free_data.columns[:-1]], free_data[14]
pipl_x = pipeline(x)

y2 = ['<=50K']*int((len(pipl_x)/2))

data_list = []
for model_key in models.keys():
    for metric_key in metrics.keys():
        cross_score = cross_val_score(
            models[model_key],
            pipl_x,
            y,
            scoring=metrics[metric_key],
            cv=2
        )
        param_dict = {
            'Model': model_key,
            'Metric': metric_key,
            'Score': cross_score[0]
        }
        data_list.append(param_dict)
res_df = pd.DataFrame(data_list)

In [40]:
res_df

Unnamed: 0,Model,Metric,Score
0,LogisticRegression,Accuracy metric,0.845739
1,LogisticRegression,F1 metric,0.858549
2,SVC,Accuracy metric,0.8311
3,SVC,F1 metric,0.858549
4,LinearSVC,Accuracy metric,0.848392
5,LinearSVC,F1 metric,0.858549


### Задание
#### 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

In [45]:
RANDOM_STATE = 42
models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

data_list = []
for model_key in models.keys():
    for metric_key in metrics.keys():
        cross_score = cross_val_score(
            models[model_key],
            pipl_x,
            y,
            scoring=metrics[metric_key],
            cv=2
        )
        param_dict = {
            'Model': model_key,
            'Metric': metric_key,
            'Score': cross_score[0]
        }
        data_list.append(param_dict)
res_df = pd.DataFrame(data_list)

In [46]:
res_df

Unnamed: 0,Model,Metric,Score
0,RandomForestClassifier,Accuracy metric,0.847464
1,RandomForestClassifier,F1 metric,0.858549
2,GradientBoostingClassifier,Accuracy metric,0.860864
3,GradientBoostingClassifier,F1 metric,0.858549


### Вывод
Для данного набора данных применение представленных моделей  без настройки параметров не дает прироста в качестве
Результаты сравнимы. Вывод - применение ансамблевых моделей без


### Задание
#### 18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [43]:
RANDOM_STATE = 42
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(gamma='auto'),
    'LinearSVC': LinearSVC(),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE)
}


pipl_x = pipeline(x)
y2 = ['<=50K']*int((len(pipl_x)/2))

datasets = {
    'x':x,
    'pipelened' :pipeline(x)
}
data = []
for dataset_key in datasets:
    for model_key in models:
        c_val_accuracy_score = cross_val_score(models[model_key],
                                               pipeline(x),
                                               y,
                                               scoring='accuracy',
                                               cv = 2)
        c_val_f1_score = cross_val_score(models[model_key],
                                         pipeline(x),
                                         y,
                                         scoring = f1_scorer,
                                         cv = 2)
        param_dict = {
            'Dataset': dataset_key,
            'Model': model_key,
            'Accuracy_score': c_val_accuracy_score[0],
            'F1_score': c_val_f1_score[0]
        }
        data.append(param_dict)
res_frame = pd.DataFrame(data)

In [44]:
res_frame.sort_values(by = 'Accuracy_score', ascending=False).head(1)

Unnamed: 0,Dataset,Model,Accuracy_score,F1_score
4,x,GradientBoostingClassifier,0.860864,0.858549
