In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, f1_score, accuracy_score, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


import warnings
warnings.filterwarnings("ignore")
np.set_printoptions(suppress=True)

In [225]:
RANDOM_STATE = 42

results_regression = pd.DataFrame(columns = ['model', 'task', 'R2'])
results_classification = pd.DataFrame(columns = ['model', 'task', 'f1', 'accuracy'])

https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

1. Разделите выборку на обучающую и тестовую в отношении 80%/20%, предварительно выделив целевую переменную (колонка 'MEDV').

In [226]:
df = pd.read_csv('boston.csv')
df
X = df.drop(columns='MEDV')
y = df['MEDV']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2 , random_state= RANDOM_STATE)

2. Обучите стандартную регрессию, а также Ridge и  Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

In [227]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_lr = r2_score(y_test,y_pred)

rid = Ridge()
rid.fit(X_train, y_train)
y_pred = rid.predict(X_test)
r2_ridge = r2_score(y_test,y_pred)

las = Lasso()
las.fit(X_train, y_train)
y_pred = las.predict(X_test)
r2_lasso = r2_score(y_test,y_pred)


### Ваш код ###


results_regression.loc[0] = ['LR', 'task2', r2_lr]
results_regression.loc[1] = ['Ridge', 'task2', r2_ridge]
results_regression.loc[2] = ['Lasso', 'task2', r2_lasso]

3. Для Ridge и Lasso подберите коэффициент регуляризации двумя способами 1) GridSearchCV, 2) RidgeCV и LassoCV, в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по всем моделям и сравните с предыдущими результатами.

In [228]:
# ridge GridSearchCV
parameters = {'alpha': [10**i for i in range(-5, 6)]}
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, parameters, scoring ='r2', cv = 5)
ridge_grid.fit(X_train,y_train)
ridge_best = ridge_grid.best_params_['alpha']
ridge_total = Ridge(alpha = ridge_best)
ridge_total.fit(X_train,y_train)
y_pred = ridge_total.predict(X_test)
r2_ridge_grid_search = r2_score(y_test, y_pred)

# lasso 
parameters = {'alpha': [10**i for i in range(-5, 6)]}
lasso = Lasso()
lasso_grid = GridSearchCV(lasso, parameters, scoring ='r2', cv = 5)
lasso_grid.fit(X_train,y_train)
lasso_best = lasso_grid.best_params_['alpha']
lasso_total = Lasso(alpha = lasso_best)
lasso_total.fit(X_train,y_train)
y_pred = lasso_total.predict(X_test)
r2_lasso_grid_search = r2_score(y_test, y_pred)

#RidgeCV
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
r2_ridge_cv = r2_score(y_test, y_pred)

#LassoCV
parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
r2_lasso_cv = r2_score(y_test, y_pred)


results_regression.loc[3] = ['Ridge_GridSearchCV', 'task3', r2_ridge_grid_search]
results_regression.loc[4] = ['RidgeCV', 'task3', r2_ridge_cv]
results_regression.loc[5] = ['Lasso_GridSearchCV', 'task3', r2_lasso_grid_search]
results_regression.loc[6] = ['LassoCV', 'task3', r2_lasso_cv]

results_regression


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483


4. Проведите масштабирование выборки (используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [229]:
ridge_pipeline_standard = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# Создание пайплайна с MinMaxScaler и моделью Ridge
ridge_pipeline_minmax = Pipeline([
    ('scaler', MinMaxScaler()),
    ('ridge', Ridge())
])

# Создание пайплайна с StandardScaler и моделью Lasso
lasso_pipeline_standard = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso())
])

# Создание пайплайна с MinMaxScaler и моделью Lasso
lasso_pipeline_minmax = Pipeline([
    ('scaler', MinMaxScaler()),
    ('lasso', Lasso())
])


# Ridge с StandardScaler
ridge_pipeline_standard.fit(X_train, y_train)
ridge_standard_pred = ridge_pipeline_standard.predict(X_test)
r2_ridge_standart_scaler = r2_score(y_test, ridge_standard_pred)

# Ridge с MinMaxScaler
ridge_pipeline_minmax.fit(X_train, y_train)
ridge_minmax_pred = ridge_pipeline_minmax.predict(X_test)
r2_ridge_min_max_scaler = r2_score(y_test, ridge_minmax_pred)

# Lasso с StandardScaler
lasso_pipeline_standard.fit(X_train, y_train)
lasso_standard_pred = lasso_pipeline_standard.predict(X_test)
r2_lasso_standart_scaler = r2_score(y_test, lasso_standard_pred)

# Lasso с MinMaxScaler
lasso_pipeline_minmax.fit(X_train, y_train)
lasso_minmax_pred = lasso_pipeline_minmax.predict(X_test)
r2_lasso_min_max_scaler = r2_score(y_test, lasso_minmax_pred)

results_regression.loc[7] = ['Ridge_StandardScaler', 'task4', r2_ridge_standart_scaler]
results_regression.loc[8] = ['Ridge_MinMaxScaler', 'task4', r2_ridge_min_max_scaler]
results_regression.loc[9] = ['Lasso_StandardScaler', 'task4', r2_lasso_standart_scaler]
results_regression.loc[10] = ['Lasso_MinMaxScaler', 'task4', r2_lasso_min_max_scaler]

5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами.

In [230]:
mms = MinMaxScaler()
mms.fit(X_train)

X_train_norm = pd.DataFrame(mms.transform(X_train), columns = X_train.columns)
X_test_norm = pd.DataFrame(mms.transform(X_test), columns = X_train.columns)

scaler = StandardScaler()
scaler.fit(X_train)

X_train_standart = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test_standart = pd.DataFrame(scaler.transform(X_test), columns = X_train.columns)


#RidgeCV MinMax
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(X_train_norm,y_train)
y_pred = model.predict(X_test_norm)
r2_ridge_min_max_scaler_cv = r2_score(y_test, y_pred)

#RidgeCV Standart
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(X_train_standart,y_train)
y_pred = model.predict(X_test_standart)
r2_ridge_standart_scaler_cv = r2_score(y_test, y_pred)


#LassoCV MinMax
parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(X_train_norm,y_train)
y_pred = model.predict(X_test_norm)
r2_lasso_min_max_scaler_cv = r2_score(y_test, y_pred)

#LassoCV Standart
parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(X_train_standart,y_train)
y_pred = model.predict(X_test_standart)
r2_lasso_standart_scaler_cv = r2_score(y_test, y_pred)

results_regression.loc[11] = ['Ridge_StandardScaler_CV', 'task5', r2_ridge_standart_scaler_cv]
results_regression.loc[12] = ['Ridge_MinMaxScaler_CV', 'task5', r2_ridge_min_max_scaler_cv]
results_regression.loc[13] = ['Lasso_StandardScaler_CV', 'task5', r2_lasso_standart_scaler_cv]
results_regression.loc[14] = ['Lasso_MinMaxScaler_CV', 'task5', r2_lasso_min_max_scaler_cv]

results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [231]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

model = Ridge()
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
r2_ridge_standart_scaler_poly = r2_score(y_test, y_pred)

model = Lasso()
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
r2_lasso_standart_scaler_poly = r2_score(y_test, y_pred)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

model = Ridge()
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
r2_ridge_min_max_scaler_poly = r2_score(y_test, y_pred)

model = Lasso()
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
r2_lasso_min_max_scaler_poly = r2_score(y_test, y_pred)

results_regression.loc[15] = ['Ridge_StandardScaler_Poly', 'task6', r2_ridge_standart_scaler_poly]
results_regression.loc[16] = ['Ridge_MinMaxScaler_Poly', 'task6', r2_ridge_min_max_scaler_poly]
results_regression.loc[17] = ['Lasso_StandardScaler_Poly', 'task6', r2_lasso_standart_scaler_poly]
results_regression.loc[18] = ['Lasso_MinMaxScaler_Poly', 'task6', r2_lasso_min_max_scaler_poly]

In [232]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


7. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, добавив PolynomialFeatures, посчитайте R2 и сравните с предыдущими результатами.

In [233]:

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# ridge standart
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
r2_ridge_standart_scaler_poly_cv = r2_score(y_test, y_pred)

# Lasso standart
parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
r2_lasso_standart_scaler_poly_cv = r2_score(y_test, y_pred)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Ridge minmax
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
r2_ridge_min_max_scaler_poly_cv = r2_score(y_test, y_pred)

# Lasso minmax

parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
r2_lasso_min_max_scaler_poly_cv = r2_score(y_test, y_pred)

### Ваш код ###

results_regression.loc[19] = ['Ridge_StandardScaler_Poly_CV', 'task7', r2_ridge_standart_scaler_poly_cv]
results_regression.loc[20] = ['Ridge_MinMaxScaler_Poly_CV', 'task7', r2_ridge_min_max_scaler_poly_cv]
results_regression.loc[21] = ['Lasso_StandardScaler_Poly_CV', 'task7', r2_lasso_standart_scaler_poly_cv]
results_regression.loc[22] = ['Lasso_MinMaxScaler_Poly_CV', 'task7', r2_lasso_min_max_scaler_poly_cv]
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


8. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2.

In [234]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

# Создание пайплайна
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Метод масштабирования
    ('poly', PolynomialFeatures()),  # Полиномиальные признаки
    ('regression', Ridge())  # Модель Ridge по умолчанию
])

# Параметры для поиска
parameters = {
    'scaler': [StandardScaler(), MinMaxScaler()],  # Метод масштабирования
    'poly__degree': [1, 2],  # Степень полинома
    'regression': [Ridge(), Lasso()],  # Тип регуляризации (L1 - Lasso, L2 - Ridge)
    'regression__alpha': [10**i for i in range(-5, 6)],  # Коэффициент регуляризации
}


grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='r2')


grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print('Параметры лучшей модели:\n', best_params)
r2_best_model = grid_search.best_score_
results_regression.loc[23] = ['Best_Model', 'task8', r2_best_model]
results_regression


KeyboardInterrupt: 

In [None]:


### Ваш код ###

best_params = grid_search.best_params_
print('Параметры лучшей модели:\n', best_params)
r2_best_model = grid_search.best_score_
results_regression.loc[23] = ['Best_Model', 'task8', r2_best_model]
results_regression

Параметры лучшей модели:
 {'regression': Ridge(alpha=0.1), 'regression__alpha': 0.1, 'scaler': MinMaxScaler()}


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


In [None]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


http://archive.ics.uci.edu/ml/datasets/Adult

In [None]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


9. Разделите выборку на признаки и целевую переменную(колонка class). Замените целевую переменную на числовые значения ('<=50K' - 1, '>50K' - 0).

In [None]:


### Ваш код ###

class_map = {'<=50K': 1, '>50K': 0}
df['class'] = df['class'].map(class_map)

X = df.drop(columns='class')
y = pd.DataFrame(df['class'])

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

10. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [None]:

most = Y.mode()[0]
y_pred_most = [most] * len(Y)
acc_most_frequent = accuracy_score(Y, y_pred_most)
f1_most_frequent = f1_score(Y, y_pred_most)
### Ваш код ###

# f1_most_frequent = 0
# acc_most_frequent = 0
results_classification.loc[0] = ['Most Frequent class', 'task10', f1_most_frequent, acc_most_frequent]

11. Выясните, присутствуют ли в данных пропуски. Если присутствуют, заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [None]:
X.isnull().sum()

### Ваш код ###

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

12. Выберите колонки с числовыми и категориальными переменными (используя возможности pandas).


In [None]:
### Ваш код ###

numeric_columns = X.select_dtypes(include=['number']).columns.tolist()

categorical_columns = X.select_dtypes(include=['object']).columns.tolist()


13. Создайте пайплайн по обработке числовых и категориальных значений колонок (используйте OneHotEncoder,MinMaxScaler) и посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [None]:


### Ваш код ###
df = pd.read_csv('adult.csv')
class_map = {'<=50K': 1, '>50K': 0}
df['class'] = df['class'].map(class_map)

X = df.drop(columns='class')
y = pd.DataFrame(df['class'])

numeric_columns = X.select_dtypes(include=['number']).columns.tolist()

categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# пайплайн для обработки числовых значений
def pipe (X,y,numeric_columns,categorical_columns):
    numeric_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler())
    ])

    #  пайплайн для обработки категориальных значений
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    #  оба пайплайна в ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_columns),
            ('cat', categorical_transformer, categorical_columns)
        ])

    #модели для оценки
    models = [
        ('Logistic Regression', LogisticRegression()),
        ('KNN', KNeighborsClassifier()),
        ('Linear SVC', LinearSVC())
    ]


    for name, model in models:
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
        accuracy_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
        f1_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1')
        
        
        print(f"Модель: {name}")
        print(f"accuracy: {accuracy_scores.mean()}")
        print(f"F1: {f1_scores.mean()}")
        print()

pipe(X, y, numeric_columns, categorical_columns)

f1_LR = 0.905011767101741
acc_LR = 0.8511732716999377
f1_KNN = 0.8869810221624912
acc_KNN = 0.8247614719848316
f1_SVM = 0.9063223083526415
acc_SVM = 0.8529135478362626
results_classification.loc[1] = ['LogisticRegression', 'task13', f1_LR, acc_LR]
results_classification.loc[2] = ['KNeighborsClassifier', 'task13', f1_KNN, acc_KNN]
results_classification.loc[3] = ['LinearSVC', 'task13', f1_SVM, acc_SVM]

Модель: Logistic Regression
accuracy: 0.8511732716999377
F1: 0.905011767101741

Модель: KNN
accuracy: 0.8247614719848316
F1: 0.8869810221624912

Модель: Linear SVC
accuracy: 0.8529135478362626
F1: 0.9063223083526415



14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями, (испольуйте SimpleImputer). Посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [None]:

df = pd.read_csv('adult.csv')
class_map = {'<=50K': 1, '>50K': 0}
df['class'] = df['class'].map(class_map)

X = df.drop(columns = 'class')
y = df['class']
X.replace('?', np.nan, inplace = True)

simp = SimpleImputer(strategy='most_frequent')
X_transform = pd.DataFrame(simp.fit_transform(X))


numeric_columns = X_transform.select_dtypes(include=['number']).columns.tolist()

categorical_columns = X_transform.select_dtypes(include=['object']).columns.tolist()

pipe(X_transform, y, numeric_columns, categorical_columns )

Модель: Logistic Regression
accuracy: 0.8700299649168521
F1: 0.9169237036324105

Модель: KNN
accuracy: 0.8242292241524508
F1: 0.8875246053044942

Модель: Linear SVC
accuracy: 0.8562098228287193
F1: 0.9071515566729869



15. Посчитайте cross_val_score по тем же алгоритмам и метрикам, если просто удалить значения '?'.

In [None]:
df = pd.read_csv('adult.csv')
class_map = {'<=50K': 1, '>50K': 0}
df['class'] = df['class'].map(class_map)

X = df.drop(columns = 'class')
y = df['class']
X.replace('?', np.nan).dropna()

pipe(X, y, numeric_columns, categorical_columns )

### Ваш код ###

f1_LR_del_missings = 0
acc_LR_del_missings = 0
f1_KNN_del_missings = 0
acc_KNN_del_missings = 0
f1_SVM_del_missings = 0
acc_SVM_del_missings = 0
results_classification.loc[7] = ['LogisticRegression_delete_missings', 'task15', f1_LR_del_missings, acc_LR_del_missings]
results_classification.loc[8] = ['KNeighborsClassifier_delete_missings', 'task15', f1_KNN_del_missings, acc_KNN_del_missings]
results_classification.loc[9] = ['LinearSVC_delete_missings', 'task15', f1_SVM_del_missings, acc_SVM_del_missings]

Модель: Logistic Regression
accuracy: 0.8706851404210397
F1: 0.917352835031414

Модель: KNN
accuracy: 0.8247615746848013
F1: 0.8878952058958145

Модель: Linear SVC
accuracy: 0.8564759897112237
F1: 0.907260120667708



 16. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier на данных с замененными значениями '?' на самые частые значения.

 

In [239]:
# пайплайн для обработки числовых значений
df = pd.read_csv('adult.csv')
class_map = {'<=50K': 1, '>50K': 0}
df['class'] = df['class'].map(class_map)

X = df.drop(columns = 'class')
y = df['class']
X.replace('?', np.nan, inplace = True)

simp = SimpleImputer(strategy='most_frequent')
X_transform = pd.DataFrame(simp.fit_transform(X))

def pipe_1 (X,y,numeric_columns,categorical_columns):
    numeric_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler())
    ])

    #  пайплайн для обработки категориальных значений
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    #  оба пайплайна в ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_columns),
            ('cat', categorical_transformer, categorical_columns)
        ])

    #модели для оценки
    models = [
        ('Forest', RandomForestClassifier()),
        ('GradientBoosting', GradientBoostingClassifier())
        ]


    for name, model in models:
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
        accuracy_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
        f1_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1')
        
        
        print(f"Модель: {name}")
        print(f"accuracy: {accuracy_scores.mean()}")
        print(f"F1: {f1_scores.mean()}")
        print()

pipe_1(X_transform, y, numeric_columns, categorical_columns)

In [236]:
df = pd.read_csv('adult.csv')
class_map = {'<=50K': 1, '>50K': 0}
df['class'] = df['class'].map(class_map)

X = df.drop(columns = 'class')
y = df['class']
X.replace('?', np.nan, inplace = True)

simp = SimpleImputer(strategy='most_frequent')
X_transform = pd.DataFrame(simp.fit_transform(X))

model = GradientBoostingClassifier()
model.fit()

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

df = pd.read_csv('adult.csv')
class_map = {'<=50K': 1, '>50K': 0}
df['class'] = df['class'].map(class_map)

X = df.drop(columns = 'class')
y = df['class']
X.replace('?', np.nan, inplace = True)

simp = SimpleImputer(strategy='most_frequent')
X_transform = pd.DataFrame(simp.fit_transform(X))

# Определение модели RandomForestClassifier
rf_classifier = RandomForestClassifier()

# Определение модели GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier()

# Оцениваем производительность моделей с использованием кросс-валидации
rf_accuracy_scores = cross_val_score(rf_classifier, X_transform, y, cv=5, scoring='accuracy')
rf_f1_scores = cross_val_score(rf_classifier, X_transform, y, cv=5, scoring='f1')

gb_accuracy_scores = cross_val_score(gb_classifier, X_transform, y, cv=5, scoring='accuracy')
gb_f1_scores = cross_val_score(gb_classifier, X_transform, y, cv=5, scoring='f1')

print("RandomForestClassifier:")
print(f"Средняя accuracy: {rf_accuracy_scores.mean():.4f}")
print(f"Средняя F1-мера: {rf_f1_scores.mean():.4f}")
print("----------")
print("GradientBoostingClassifier:")
print(f"Средняя accuracy: {gb_accuracy_scores.mean():.4f}")
print(f"Средняя F1-мера: {gb_f1_scores.mean():.4f}")


### Ваш код ###

f1_RF = 0
acc_RF = 0
f1_GB = 0
acc_GB = 0
results_classification.loc[10] = ['RandomForestClassifier', 'task16', f1_RF, acc_RF]
results_classification.loc[11] = ['GradientBoostingClassifier', 'task16', f1_GB, acc_GB]

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "d:\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\anaconda\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\anaconda\Lib\site-packages\sklearn\ensemble\_forest.py", line 348, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "d:\anaconda\Lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\anaconda\Lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\anaconda\Lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\anaconda\Lib\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\anaconda\Lib\site-packages\pandas\core\generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'State-gov'


17. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [None]:


### Ваш код ###

best_params = {}
print('Параметры лучшей модели:\n', best_params)
f1_best = 0
acc_best = 0
results_classification.loc[12] = ['Best_Model', 'task17', f1_best, acc_best]

In [None]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.905012,0.851173
2,KNeighborsClassifier,task13,0.886981,0.824761
3,LinearSVC,task13,0.906322,0.852914
6,LinearSVC_impute,task14,0.906322,0.852914
