In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import warnings
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

In [2]:
# Read the semicolon-separated credit-scoring sample into a DataFrame.
data = pd.read_csv(
    '../data/credit_scoring_sample.csv',
    sep=";",
)

In [3]:
def impute_nan_with_median(table):
    """Return a copy of ``table`` with missing values replaced by column medians.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame to impute. It is left unmodified, so re-running the calling
        cell always starts from the same raw data.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in which every NaN has been
        replaced by the median of its own column.
    """
    # Work on a copy: assigning columns on the argument itself would silently
    # rewrite the caller's frame as a side effect.
    imputed = table.copy()
    for col in imputed.columns:
        imputed[col] = imputed[col].fillna(imputed[col].median())
    return imputed
# Every column except the target 'SeriousDlqin2yrs' is used as a feature.
independent_columns_names = [x for x in data if x != 'SeriousDlqin2yrs']
# Fill missing values with column medians before splitting into X and y.
table = impute_nan_with_median(data)
X = table[independent_columns_names]
y = table['SeriousDlqin2yrs']

In [4]:
class RandomForestClassifierCustom(BaseEstimator):
    """Hand-rolled random forest: bagged decision trees on random feature subsets.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of decision trees to fit.
    max_depth : int, default=7
        ``max_depth`` passed to every ``DecisionTreeClassifier``.
    max_features : int, default=6
        Number of columns drawn (without replacement) for each tree; the same
        value is also passed as ``max_features`` to the tree itself.
    random_state : int, default=17
        Base seed. Tree ``i`` draws its columns and bootstrap rows with seed
        ``random_state + i``; every tree is built with ``random_state``.

    Attributes
    ----------
    trees : list
        Fitted ``DecisionTreeClassifier`` objects, one per estimator.
    feat_ids_by_tree : list
        For each tree, the array of column names it was trained on.
    ID : list
        For each tree, the positional row indices of its bootstrap sample.
    """

    def __init__(self, n_estimators=10, max_depth=7, max_features=6, random_state=17):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state

        # Fitted state; rebuilt from scratch by every call to fit().
        self.trees = []
        self.feat_ids_by_tree = []
        self.ID = []

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on a bootstrap sample and column subset.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; ``.columns`` and ``.iloc`` are used.
        y : pandas.Series
            Training target; rows are selected positionally with ``.iloc``.

        Returns
        -------
        self
        """
        # Start from empty state: without this, refitting the same instance
        # appends new trees after the old ones and predict_proba keeps using
        # the stale trees from the first fit.
        self.trees = []
        self.feat_ids_by_tree = []
        self.ID = []

        for i in range(self.n_estimators):
            # A private generator seeded per tree produces the same draws as
            # reseeding the global NumPy RNG, but leaves global state alone.
            rng = np.random.RandomState(self.random_state + i)
            feature_names = rng.choice(X.columns, self.max_features, replace=False)
            row_ids = rng.choice(len(X), len(X), replace=True)

            # Bootstrap the rows, then keep only this tree's columns.
            sample = X.iloc[row_ids][feature_names]
            y_sample = y.iloc[row_ids]

            tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                          max_features=self.max_features,
                                          random_state=self.random_state)
            tree.fit(sample, y_sample)

            self.trees.append(tree)
            self.feat_ids_by_tree.append(feature_names)
            self.ID.append(row_ids)
        return self

    def predict_proba(self, X):
        """Return class probabilities averaged over all fitted trees.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column the trees were trained on.

        Returns
        -------
        numpy.ndarray
            Element-wise mean of the per-tree ``predict_proba`` outputs.
        """
        # Each tree only sees the columns it was trained on.
        per_tree = [
            self.trees[i].predict_proba(X[self.feat_ids_by_tree[i]])
            for i in range(self.n_estimators)
        ]
        return np.mean(per_tree, axis=0)

In [5]:
# MY OWN CLASS: fit the custom forest on the full data and show its
# predicted class probabilities.
rfccece = RandomForestClassifierCustom(
    n_estimators=10,
    max_depth=7,
    max_features=6,
    random_state=17,
).fit(X, y)
rfccece.predict_proba(X)

array([[0.94940458, 0.05059542],
       [0.92451586, 0.07548414],
       [0.8618126 , 0.1381874 ],
       ...,
       [0.80312241, 0.19687759],
       [0.6866675 , 0.3133325 ],
       [0.45023884, 0.54976116]])

In [6]:
# Mean ROC AUC of the custom forest over 5 shuffled stratified folds.
skf2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
np.mean(cross_val_score(rfccece, X, y, scoring='roc_auc', cv=skf2))

0.832075411181175

In [7]:
# Average predicted probability per class for the custom forest.
np.mean(rfccece.predict_proba(X), axis=0)

array([0.77773373, 0.22226627])

In [8]:
# FOR THE DECISION TREE: a single tree with the same depth / feature settings,
# fitted on the full data, and its predicted class probabilities.
dtc = DecisionTreeClassifier(
    max_depth=7,
    max_features=6,
    random_state=17,
).fit(X, y)
dtc.predict_proba(X)

array([[0.95078097, 0.04921903],
       [0.93234672, 0.06765328],
       [0.81204013, 0.18795987],
       ...,
       [0.87534349, 0.12465651],
       [0.75403226, 0.24596774],
       [0.53545232, 0.46454768]])

In [9]:
# Average predicted probability per class for the single decision tree.
np.mean(dtc.predict_proba(X), axis=0)

array([0.77751148, 0.22248852])

In [10]:
# Mean ROC AUC of the single decision tree over 5 shuffled stratified folds.
skf2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
np.mean(cross_val_score(dtc, X, y, scoring='roc_auc', cv=skf2))

0.8199583779477086

In [11]:
# FOR THE RANDOM FOREST: scikit-learn's implementation with the same
# hyperparameters, fitted on the full data, and its class probabilities.
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=7,
    max_features=6,
    random_state=17,
).fit(X, y)
rf.predict_proba(X)

array([[0.9550809 , 0.0449191 ],
       [0.94848111, 0.05151889],
       [0.86879372, 0.13120628],
       ...,
       [0.8211826 , 0.1788174 ],
       [0.70466953, 0.29533047],
       [0.44293808, 0.55706192]])

In [12]:
# Average predicted probability per class for scikit-learn's random forest.
np.mean(rf.predict_proba(X), axis=0)

array([0.77785283, 0.22214717])

In [13]:
# Mean ROC AUC of scikit-learn's random forest over 5 shuffled stratified folds.
skf3 = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
np.mean(cross_val_score(rf, X, y, scoring='roc_auc', cv=skf3))

0.8313944291645278