In [18]:
import yaml
import pandas as pd
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

In [5]:
with open('config.yaml', 'r') as f:
    cfg = yaml.safe_load(f)
    
train_df = pd.read_csv(cfg['selary']['train_refactor'])
test_df = pd.read_csv(cfg['selary']['test_refactor'])

In [9]:
#train_df = train_df.drop(['Unnamed: 0'], axis =1)
#test_df = test_df.drop(['Unnamed: 0'], axis =1)
train_df

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0,25.0,0,112882.680,9406.890000,5,1,10,4.0,981,...,1,722.90,40.090839,347.0,1,353.616124,166.446410,1,688.426466,0
1,1,28.0,10,20975.400,1747.950000,2,3,9,1.0,0,...,1,731.40,26.205042,291.0,0,16.196792,67.158201,2,361.040007,0
2,5,36.0,10,31751.830,2645.985833,5,4,18,3.0,5750,...,2,110.73,24.301922,289.0,2,58.006937,120.397883,0,369.093763,2
3,4,30.0,5,16568.425,1380.702083,5,3,18,1.0,4144,...,2,478.11,40.913914,352.0,1,8.474086,23.761873,2,372.834250,2
4,4,48.0,14,32228.670,2685.722500,7,6,8,4.0,13,...,2,940.36,28.589412,392.0,0,176.204872,145.824991,3,327.861512,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37556,7,29.0,8,62827.470,5235.622500,4,7,16,1.0,684,...,2,1031.09,33.472922,210.0,0,39.065197,616.958135,6,178.138918,2
37557,6,46.0,5,33647.340,2803.945000,8,5,9,4.0,3850,...,2,1152.93,29.109339,288.0,1,102.557556,366.987038,6,100.949906,2
37558,1,39.0,9,40908.160,3409.013333,0,7,10,0.0,3463,...,1,1210.40,27.931534,209.0,1,0.000000,295.075706,4,302.225628,2
37559,6,43.0,0,20799.590,1733.299167,2,5,7,0.0,3463,...,1,33.25,27.192416,244.0,1,0.000000,127.512473,6,306.017444,0


In [6]:
X_train = train_df.drop(['Credit_Score'], axis=1)
y_train = train_df['Credit_Score']
X_test = test_df.drop(['Credit_Score'], axis=1)
y_test = test_df['Credit_Score']


In [8]:
print(f"Размеры X_train и y_train: {X_train.shape}, {y_train.shape}")
print(f"Размеры X_test и y_test: {X_test.shape}, {y_test.shape}")

Размеры X_train и y_train: (37561, 23), (37561,)
Размеры X_test и y_test: (9381, 23), (9381,)


In [62]:
modelBaggingClass = BaggingClassifier(estimator=LogisticRegression(max_iter=1000), n_estimators=50, random_state=12)
modelBaggingClass.fit(X_train, y_train)

In [81]:
modelBoostingClass = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=5, min_samples_leaf=110), n_estimators=25)
modelBoostingClass.fit(X_train, y_train)

In [21]:
modelGradBoost = GradientBoostingClassifier(max_depth=13, n_estimators=250, learning_rate=1, min_samples_leaf=1, max_features=13)
modelGradBoost.fit(X_train, y_train)

In [8]:
params = { 'max_depth': range(10,20), 'max_features': range(10,20), 'min_samples_leaf': range(1,5)}

In [9]:
from sklearn.model_selection import GridSearchCV
Grad_grid = GridSearchCV(GradientBoostingClassifier(), params, cv=10, verbose=True, n_jobs=4)

In [10]:
Grad_grid.fit(X_train, y_train)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


In [11]:
Grad_grid.best_params_, Grad_grid.best_score_

({'max_depth': 13, 'max_features': 13, 'min_samples_leaf': 1},
 0.8007509872235653)

In [15]:
print(modelGradBoost.score(X_test, y_test))

0.7916000426393774


In [100]:
pipe = make_pipeline(StandardScaler(), modelGradBoost)
pipe.fit(X_train, y_train)


In [101]:
pipe.score(X_test, y_test)

0.7966101694915254

In [104]:
y_pred = modelGradBoost.predict(X_test)

#средняя абсолютная ошибка
mae = mean_absolute_error(y_test, y_pred)

#среднеквадратичная ошибка
mse = mean_squared_error(y_test, y_pred)

#R^2 score (коэффициент детерминации)
r2 = r2_score(y_test, y_pred)
print(modelGradBoost.score(X_test, y_test))
print("MAE:", mae)
print("MSE:", mse)
print("R^2 score:", r2)

0.794478200618271
MAE: 0.28515083679778275
MSE: 0.4444089116298902
R^2 score: 0.23081864595533685
