In [1]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

In [2]:
RANDOM_STATE = 42

In [3]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [4]:
from sklearn.model_selection import train_test_split
X_train , X_test, y_train , y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

2. Обучите стандартную регрессию, а также Ridge и  Lasso и параметрами по умолчанию и выведите их R2 на тестовой выборке

In [5]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred)

#print("Coefficients:\n", lr.coef_)
print(f"R2: {r2:.3f}")

R2: 0.669


In [7]:
ridge = Ridge()
ridge.fit(X_train, y_train)

y_pred = ridge.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R2: {r2:.3f}")


R2: 0.666


In [8]:
ls = Lasso()
ls.fit(X_train, y_train)

y_pred = ls.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R2: {r2:.3f}")

R2: 0.667


3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV , LassoCV

In [10]:
parameters = [10**i for i in range(-5,6,1)]
rgs = GridSearchCV(ridge, {'alpha':parameters}, scoring='r2')
rcv = RidgeCV(alphas=parameters, scoring='r2')
lgs = GridSearchCV(ls, {'alpha':parameters}, scoring='r2')
lcv = LassoCV(alphas=parameters)

In [11]:
rgs.fit(X_train,y_train)
rcv.fit(X_train,y_train)
lgs.fit(X_train,y_train)
lcv.fit(X_train,y_train)


In [12]:
print(rgs.best_params_)
print(rgs.best_estimator_.score(X_test,y_test))

rcv.score(X_test,y_test)

{'alpha': 1e-05}
0.6687594856409733


0.668750999876409

In [13]:
print(lgs.best_params_)
print(lgs.best_estimator_.score(X_test,y_test))

lcv.score(X_test,y_test)

{'alpha': 1e-05}
0.6687598638315153


0.6687598638315153

4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler , MinMaxScaler

std_scale = StandardScaler()
mm_scale = MinMaxScaler()

pipe_std = Pipeline([('std', StandardScaler()), ('lr', LinearRegression()) ])
pipe_mm =  Pipeline([('min_max', MinMaxScaler()), ('lr', LinearRegression()) ])

pipe_std.fit(X_train, y_train)
r2_std = pipe_std.score(X_test,y_test)
print(f"StandardScaler R2 : {r2_std:.12f}")

pipe_mm.fit(X_train, y_train)
r2_mm = pipe_mm.score(X_test,y_test)
print(f"MinMax Scaler R2 : {r2_mm:.12f}")

StandardScaler R2 : 0.668759493536
MinMax Scaler R2 : 0.668759493536


5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [15]:
for i in [rgs,lgs,rcv,lcv] :
    pipe = Pipeline([('std', StandardScaler()), ('min_max',MinMaxScaler()), ('lgs', i) ])
    pipe.fit(X_train, y_train)
    r2 = pipe.score(X_test,y_test)
    print(f"For {i:} \n R2 : {r2:.8f}")
    

For GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                   1000, 10000, 100000]},
             scoring='r2') 
 R2 : 0.67003100
For GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                   1000, 10000, 100000]},
             scoring='r2') 
 R2 : 0.66876051
For RidgeCV(alphas=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000,
                100000],
        scoring='r2') 
 R2 : 0.67003100
For LassoCV(alphas=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000,
                100000]) 
 R2 : 0.66876051




6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [16]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)

for i in [rgs,lgs,rcv,lcv] :
    pipe = Pipeline([('std', StandardScaler()), ('min_max',MinMaxScaler()), ('poly', poly), ('lgs', i) ])
    pipe.fit(X_train, y_train)
    r2 = pipe.score(X_test,y_test)
    print(f"For {i:} \n R2 : {r2:.8f}")

For GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                   1000, 10000, 100000]},
             scoring='r2') 
 R2 : 0.85006304


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


For GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                   1000, 10000, 100000]},
             scoring='r2') 
 R2 : 0.83905817


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


For RidgeCV(alphas=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000,
                100000],
        scoring='r2') 
 R2 : 0.85006304


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


For LassoCV(alphas=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000,
                100000]) 
 R2 : 0.83905817


  model = cd_fast.enet_coordinate_descent(


7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [17]:
import itertools

p_degree = [ i for i in range(1,5)]
parameters = [10**i for i in range(-5,6,1)]
rgs = GridSearchCV(ridge, {'alpha':parameters, 'tol' : [0.1,0.5]}, scoring='r2')
lgs = GridSearchCV(ls, {'alpha':parameters, 'tol' : [0.1,0.5]}, scoring='r2')


for i, k in list(itertools.product([rgs,lgs], p_degree)) :
    pipe = Pipeline([('std', StandardScaler()), ('poly', PolynomialFeatures(k)), ('lgs', i) ])
    pipe.fit(X_train, y_train)
    r2 = pipe.score(X_test,y_test)
    print(f"For degree = {k:} and {i.best_params_:} \n R2 : {r2:.8f}")

For degree = 1 and {'alpha': 1, 'tol': 0.1} 
 R2 : 0.66846244
For degree = 2 and {'alpha': 10, 'tol': 0.1} 
 R2 : 0.81804659
For degree = 3 and {'alpha': 100, 'tol': 0.1} 
 R2 : 0.78630077
For degree = 4 and {'alpha': 1000, 'tol': 0.1} 
 R2 : 0.73421217
For degree = 1 and {'alpha': 0.0001, 'tol': 0.1} 
 R2 : 0.66868364
For degree = 2 and {'alpha': 0.1, 'tol': 0.1} 
 R2 : 0.78642922
For degree = 3 and {'alpha': 0.1, 'tol': 0.1} 
 R2 : 0.84458053
For degree = 4 and {'alpha': 0.1, 'tol': 0.5} 
 R2 : 0.82763107


In [None]:
# Best results  with  poly degree = 3   and  alpha = 0.1 with L1 regularisation

http://archive.ics.uci.edu/ml/datasets/Adult

In [18]:
import pandas as pd
import numpy as np

In [19]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [20]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [21]:
cols = data.columns
X = data[cols[:-1]]
y = data[cols[-1:]].applymap(lambda x: 0 if str(x) == "<=50K" else 1)

9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [22]:
from sklearn.impute import SimpleImputer
imp_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')


10. Выберите колонки с числовыми и категориальными переменными.

In [23]:
int_cols = []
cat_cols = []
for i in X.columns :
    if X[i].dtypes == 'int64' :
        int_cols.append(i)
    else:
        cat_cols.append(i) 

In [24]:
print('int_cols_idx :' + str(int_cols))
print('cat_cols_idx :' + str(cat_cols))

int_cols_idx :[0, 2, 4, 10, 11, 12]
cat_cols_idx :[1, 3, 5, 6, 7, 8, 9, 13]


11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
imp_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

ohe_scale = OneHotEncoder(dtype='object')
mm_scale = MinMaxScaler()
ct = ColumnTransformer([('int_t', mm_scale, int_cols),('cat_t', ohe_scale , cat_cols)])

pipe = Pipeline([('imp_freq', imp_freq),('ct', ct)])

12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [26]:
from sklearn.metrics import f1_score , accuracy_score
# assume that prediction is based on most frequent class = 0 
y_pred = pd.DataFrame(np.zeros(y.shape[0]))

print(f"f1_score: {f1_score(y_pred, y)}")
print(f"accuracy_score: {accuracy_score(y_pred,y)}")

f1_score: 0.0
accuracy_score: 0.7607182343065395


13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [27]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(max_iter=200, penalty="elasticnet", solver="saga", l1_ratio=0.5)
l_svc = LinearSVC()
svc = SVC()

pipe_lr = Pipeline([('imp_freq', imp_freq),('ct', ct),('lr', lr)])
pipe_l_svc = Pipeline([('imp_freq', imp_freq),('ct', ct),('l_svc', l_svc)])
pipe_svc = Pipeline([('imp_freq', imp_freq),('ct', ct),('svc', svc)])

In [28]:
from sklearn.model_selection import train_test_split
X_train , X_test, y_train , y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)


In [29]:
cvs_lr_f1 = cross_val_score(pipe_lr, X_train, y_train[14],scoring="f1")
cvs_lr_acc = cross_val_score(pipe_lr, X_train, y_train[14],scoring="accuracy")

In [30]:
cvs_svc_f1 = cross_val_score(pipe_svc, X_train, y_train[14],scoring="f1",n_jobs=4, cv=3)
cvs_svc_acc = cross_val_score(pipe_svc, X_train, y_train[14],scoring="accuracy",n_jobs=4, cv=3)

In [31]:
cvs_l_svc_f1 = cross_val_score(pipe_l_svc, X_train, y_train[14],scoring="f1",n_jobs=4, cv=3)
cvs_l_svc_acc = cross_val_score(pipe_l_svc, X_train, y_train[14],scoring="accuracy",n_jobs=4,cv=3)

In [32]:
print( f'LogisticRegression accuracy mean is : {cvs_lr_acc.mean()}')
print( f'LogisticRegression f1 mean is : {cvs_lr_f1.mean()}')

print( f'SVC accuracy mean is : {cvs_svc_acc.mean()}')
print( f'SVC f1 mean is : {cvs_svc_f1.mean()}')

print( f'Linear SVC accuracy mean is : {cvs_l_svc_acc.mean()}')
print( f'Linear SVC f1 mean is : {cvs_l_svc_f1.mean()}')

LogisticRegression accuracy mean is : 0.851764555997315
LogisticRegression f1 mean is : 0.6574162649227585
SVC accuracy mean is : 0.8395822074037045
SVC f1 mean is : 0.6150577610294997
Linear SVC accuracy mean is : 0.8524555994661561
Linear SVC f1 mean is : 0.6559364561295021


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [33]:
imp_freq_enh = SimpleImputer(missing_values={np.nan,'?'}, strategy='most_frequent')

In [34]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(max_iter=200, penalty="elasticnet", solver="saga", l1_ratio=0.5)
l_svc = LinearSVC()
svc = SVC()

pipe_lr_enh = Pipeline([('imp_freq_enh', imp_freq_enh),('ct', ct),('lr', lr)])
pipe_l_svc_enh = Pipeline([('imp_freq_enh', imp_freq_enh),('ct', ct),('l_svc', l_svc)])
pipe_svc_enh = Pipeline([('imp_freq_enh', imp_freq_enh),('ct', ct),('svc', svc)])

15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [35]:
cvs_lr_f1_enh = cross_val_score(pipe_lr_enh, X_train, y_train[14],scoring="f1")
cvs_lr_acc_enh = cross_val_score(pipe_lr_enh, X_train, y_train[14],scoring="accuracy")

cvs_svc_f1_enh = cross_val_score(pipe_svc_enh, X_train, y_train[14],scoring="f1",n_jobs=4, cv=3)
cvs_svc_acc_enh = cross_val_score(pipe_svc_enh, X_train, y_train[14],scoring="accuracy",n_jobs=4, cv=3)

cvs_l_svc_f1_enh = cross_val_score(pipe_l_svc_enh, X_train, y_train[14],scoring="f1",n_jobs=4, cv=3)
cvs_l_svc_acc_enh = cross_val_score(pipe_l_svc_enh, X_train, y_train[14],scoring="accuracy",n_jobs=4,cv=3)

print( f'LogisticRegression accuracy mean is : {cvs_lr_acc_enh.mean()}')
print( f'LogisticRegression f1 mean is : {cvs_lr_f1_enh.mean()}')

print( f'SVC accuracy mean is : {cvs_svc_acc_enh.mean()}')
print( f'SVC f1 mean is : {cvs_svc_f1_enh.mean()}')

print( f'Linear SVC accuracy mean is : {cvs_l_svc_acc_enh.mean()}')
print( f'Linear SVC f1 mean is : {cvs_l_svc_f1_enh.mean()}')

LogisticRegression accuracy mean is : 0.851764555997315
LogisticRegression f1 mean is : 0.6574162649227585
SVC accuracy mean is : 0.8395822074037045
SVC f1 mean is : 0.6150577610294997
Linear SVC accuracy mean is : 0.8524555994661561
Linear SVC f1 mean is : 0.6559364561295021


In [36]:
# score hasn't changed

16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [37]:
import pandas as pd
import numpy as np
RANDOM_STATE = 42

link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

data_cls = data[data.eq("?").any(axis=1) == False].copy(deep=True)

In [38]:
data_cls.shape

(45222, 15)

In [39]:
from sklearn.model_selection import train_test_split

cols = data_cls.columns
X_cls = data_cls[cols[:-1]]
y_cls = data_cls[cols[-1:]].applymap(lambda x: 0 if str(x) == "<=50K" else 1)

X_train , X_test, y_train , y_test = train_test_split(X_cls, y_cls, test_size=0.2, random_state=RANDOM_STATE)


In [40]:
int_cols_cls = []
cat_cols_cls = []
for i in X_cls.columns :
    if X_cls[i].dtypes == 'int64' :
        int_cols_cls.append(i)
    else:
        cat_cols_cls.append(i) 

In [43]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer


imp_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

ohe_scale = OneHotEncoder(handle_unknown='ignore')
mm_scale = MinMaxScaler()
ct_cls = ColumnTransformer([('int_t', mm_scale, int_cols_cls),('cat_t', ohe_scale , cat_cols_cls)])

lr = LogisticRegression(max_iter=200, penalty="elasticnet", solver="saga", l1_ratio=0.5)
l_svc = LinearSVC()
svc = SVC()

pipe_lr= Pipeline([('imp_freq', imp_freq),('ct', ct_cls),('lr', lr)])
pipe_l_svc = Pipeline([('imp_freq', imp_freq),('ct', ct_cls),('l_svc', l_svc)])
pipe_svc= Pipeline([('imp_freq', imp_freq),('ct', ct_cls),('svc', svc)])

In [45]:
cvs_lr_f1_cls = cross_val_score(pipe_lr, X_train, y_train[14], scoring= 'f1')
cvs_lr_acc_cls = cross_val_score(pipe_lr, X_train, y_train[14], scoring = 'accuracy')

cvs_svc_f1_cls = cross_val_score(pipe_svc, X_train, y_train[14],scoring="f1",n_jobs=4, cv=3)
cvs_svc_acc_cls = cross_val_score(pipe_svc, X_train, y_train[14],scoring="accuracy",n_jobs=4, cv=3)

cvs_l_svc_f1_cls = cross_val_score(pipe_l_svc, X_train, y_train[14],scoring="f1",n_jobs=4, cv=3)
cvs_l_svc_acc_cls = cross_val_score(pipe_l_svc, X_train, y_train[14],scoring="accuracy",n_jobs=4,cv=3)

print( f'LogisticRegression accuracy mean is : {cvs_lr_acc_cls.mean()}')
print( f'LogisticRegression f1 mean is : {cvs_lr_f1_cls.mean()}')

print( f'SVC accuracy mean is : {cvs_svc_acc_cls.mean()}')
print( f'SVC f1 mean is : {cvs_svc_f1_cls.mean()}')

print( f'Linear SVC accuracy mean is : {cvs_l_svc_acc_cls.mean()}')
print( f'Linear SVC f1 mean is : {cvs_l_svc_f1_cls.mean()}')

LogisticRegression accuracy mean is : 0.8462836053931373
LogisticRegression f1 mean is : 0.6557271370173845
SVC accuracy mean is : 0.8344528291455897
SVC f1 mean is : 0.6189288089503716
Linear SVC accuracy mean is : 0.8466705365287336
Linear SVC f1 mean is : 0.654709321521242


 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

In [46]:
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

In [47]:
pipe_rfc= Pipeline([('imp_freq', imp_freq),('ct', ct_cls),('rfc', rfc)])
pipe_gbc = Pipeline([('imp_freq', imp_freq),('ct', ct_cls),('gbc', gbc)])

In [48]:
rfc_f1 = cross_val_score(pipe_rfc, X_train, y_train[14], scoring= 'f1', n_jobs=4, cv=3)
rfc_acc = cross_val_score(pipe_rfc, X_train, y_train[14], scoring = 'accuracy', n_jobs=4, cv=3 )

gbc_f1 = cross_val_score(pipe_gbc, X_train, y_train[14], scoring= 'f1', n_jobs=4, cv=3)
gbc_acc = cross_val_score(pipe_gbc, X_train, y_train[14], scoring = 'accuracy', n_jobs=4, cv=3)

print( f'RandomForest accuracy mean is : {rfc_acc.mean()}')
print( f'RandomForest f1 mean is : {rfc_f1.mean()}')

print( f'GradienBoosting accuracy mean is : {gbc_acc.mean()}')
print( f'GradientBoosting f1 mean is : {gbc_f1.mean()}')

RandomForest accuracy mean is : 0.8489095281532465
RandomForest f1 mean is : 0.6674716554602472
GradienBoosting accuracy mean is : 0.862177626668878
GradientBoosting f1 mean is : 0.6833578570282954


18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [52]:
# from testing performed it seems that better results are achieved with 
# GadientBoostingClassifier .
# 
# 
gbc.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [57]:
print( f'GradienBoosting accuracy mean is : {gbc_acc.mean()}')
print( f'GradientBoosting f1 mean is : {gbc_f1.mean()}')

GradienBoosting accuracy mean is : 0.862177626668878
GradientBoosting f1 mean is : 0.6833578570282954
