In [1]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score


#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

Загрузим данные

In [2]:
# Load the raw training table and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it points at a local copy of data_train.csv.
TRAIN_DATA_PATH = "C:/Users/я/Desktop/ml.bis/data_train.csv"
df = pd.read_csv(TRAIN_DATA_PATH)
df.head(5)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [3]:
# Print column dtypes and non-null counts to see which features have gaps.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [4]:
# Keep only fully populated rows. Rebinding (instead of mutating in place)
# keeps this cell idempotent when it is re-run.
# NOTE(review): according to the df.info() output, 'Months since last
# delinquent' has 3419 non-null values out of 7500, so this discards more than
# half of the rows — confirm that is intended, given that the pipeline also
# imputes missing values.
df = df.dropna()

Разделим данные на train/test и сохраним тестовую выборку

In [5]:
# Separate the target from the features, then hold out 33% of rows for testing.
X_all = df.drop(columns=['Credit Default'])
y_all = df['Credit Default']
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.33, random_state=42
)

# Persist all four parts (test first, then train) without the index column so
# later cells can reload them from disk.
for csv_name, part in (
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
):
    part.to_csv(csv_name, index=None)

Pipeline

In [6]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls one column out of the input by its label.

    The selection is performed as ``X[column]``; nothing is learned in ``fit``.
    """

    def __init__(self, column):
        # Label of the column to extract in transform().
        self.column = column

    def fit(self, X, y=None):
        """Stateless step: return the selector itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]`` without modifying it."""
        selected = X[self.column]
        return selected
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column via a single-element label list.

    Indexing with ``X[[key]]`` (a list) rather than ``X[key]`` is deliberate:
    the next step in this notebook is a SimpleImputer.
    NOTE(review): presumably X is a DataFrame, so the result stays
    two-dimensional — confirm against callers.
    """

    def __init__(self, key):
        # Label of the column to extract in transform().
        self.key = key

    def fit(self, X, y=None):
        """Stateless step: return the selector itself."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]``."""
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a single categorical column with a fixed output layout.

    ``fit`` records the dummy column names produced on the training data, with
    the first category dropped as the baseline. ``transform`` always returns
    exactly those columns, in that order, regardless of which categories are
    present in the batch being transformed.
    """

    def __init__(self, key):
        # Column name; used as the prefix of every generated dummy column.
        self.key = key
        # Dummy column names learned in fit(); empty until the encoder is fitted.
        self.columns = []

    def fit(self, X, y=None):
        """Remember the training dummy columns (baseline category dropped)."""
        training_dummies = pd.get_dummies(X, prefix=self.key, drop_first=True)
        self.columns = list(training_dummies.columns)
        return self

    def transform(self, X):
        """Encode ``X`` into the columns learned during ``fit``.

        Categories absent from the batch become all-zero columns; categories
        not seen during ``fit`` (and the training baseline) are discarded.
        """
        # Do NOT pass drop_first here: it would drop whichever category sorts
        # first in *this* batch, not the training baseline. For a single-row
        # batch that removes the row's only category and encodes it as zeros.
        batch_dummies = pd.get_dummies(X, prefix=self.key)
        return batch_dummies.reindex(columns=self.columns, fill_value=0)
    
class CatNaNInputer(BaseEstimator, TransformerMixin):
    """Fill missing categorical values with the most frequent training value."""

    def __init__(self):
        # Placeholder fill value; replaced by the learned mode in fit().
        self.frequent_constant = 'None'

    def fit(self, X, y=None):
        """Learn the fill value as the first mode of ``X``.

        When ``X`` has no non-missing values, ``mode()`` is empty; in that case
        the 'None' placeholder is kept instead of raising on the lookup.
        """
        modes = X.mode()
        self.frequent_constant = modes[0] if len(modes) > 0 else 'None'
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing entries replaced by the fill value."""
        return X.fillna(self.frequent_constant)

In [7]:
# Partition the training columns by dtype: numeric ones are treated as
# continuous features, everything else as categorical.
continuous_columns = list(X_train.select_dtypes(include='number').columns)
categorical_columns = list(X_train.select_dtypes(exclude='number').columns)

Соберем кусок, ответственный за feature engineering

In [8]:
# Build one small pipeline per column; FeatureUnion then runs all of them and
# combines their outputs into a single feature matrix.

# Categorical columns: select -> fill missing with the mode -> one-hot encode.
categorical_transformers = [
    (
        name,
        Pipeline([
            ('selector', FeatureSelector(column=name)),
            ('nan_inputer', CatNaNInputer()),
            ('ohe', OHEEncoder(key=name)),
        ]),
    )
    for name in categorical_columns
]

# Continuous columns: select -> fill missing with the median.
continuous_transformers = [
    (
        name,
        Pipeline([
            ('selector', NumberSelector(key=name)),
            ('NAN', SimpleImputer(strategy='median')),
        ]),
    )
    for name in continuous_columns
]

# Categorical transformers come first, then continuous ones.
final_transformers = categorical_transformers + continuous_transformers

feats = FeatureUnion(final_transformers)

In [30]:
# Display the assembled (name, pipeline) pairs for inspection.
final_transformers

[('Home Ownership',
  Pipeline(steps=[('selector', FeatureSelector(column='Home Ownership')),
                  ('nan_inputer', CatNaNInputer()),
                  ('ohe', OHEEncoder(key='Home Ownership'))])),
 ('Years in current job',
  Pipeline(steps=[('selector', FeatureSelector(column='Years in current job')),
                  ('nan_inputer', CatNaNInputer()),
                  ('ohe', OHEEncoder(key='Years in current job'))])),
 ('Purpose',
  Pipeline(steps=[('selector', FeatureSelector(column='Purpose')),
                  ('nan_inputer', CatNaNInputer()),
                  ('ohe', OHEEncoder(key='Purpose'))])),
 ('Term',
  Pipeline(steps=[('selector', FeatureSelector(column='Term')),
                  ('nan_inputer', CatNaNInputer()),
                  ('ohe', OHEEncoder(key='Term'))])),
 ('Annual Income',
  Pipeline(steps=[('selector', NumberSelector(key='Annual Income')),
                  ('NAN', SimpleImputer(strategy='median'))])),
 ('Tax Liens',
  Pipeline(steps=[('select

Добавим классификатор

In [9]:
%%time
model = RandomForestClassifier(random_state=42, class_weight = 'balanced_subsample',
                               max_depth = 4, min_samples_leaf = 10,
                               min_samples_split = 2, n_estimators = 700  )
pipeline = Pipeline([
    ('features', feats),
    ('classifier', model),
])

pipeline.fit(X_train, y_train)

CPU times: total: 4.62 s
Wall time: 4.71 s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Home Ownership',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Home '
                                                                                         'Ownership')),
                                                                 ('nan_inputer',
                                                                  CatNaNInputer()),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Home '
                                                                                 'Ownership'))])),
                                                ('Years in current job',
                                                 Pipeline(steps=[('selector',
                                                

Сохраним модель

In [10]:
# Serialise the fitted pipeline (feature steps + classifier) to disk with dill.
with open("rf_pipeline.dill", mode="wb") as out_strm:
    dill.dump(pipeline, out_strm)

Проверка работоспособности и качества пайплайна

In [11]:
# Read the held-out split back from the CSV files written after the train/test
# split, so the check below uses exactly what was saved to disk.
X_test, y_test = (pd.read_csv(csv_name) for csv_name in ("X_test.csv", "y_test.csv"))

In [13]:
# Preview the first rows of the reloaded test features.
X_test.head(3)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
0,Rent,658217.0,10+ years,0.0,9.0,10.9,150260.0,0.0,3.0,0.0,debt consolidation,Short Term,139370.0,53200.0,10970.0,733.0
1,Home Mortgage,1028242.0,7 years,0.0,6.0,13.9,123596.0,0.0,43.0,0.0,debt consolidation,Short Term,99999999.0,56791.0,9254.0,734.0
2,Rent,1045323.0,10+ years,0.0,4.0,16.3,299090.0,0.0,33.0,0.0,debt consolidation,Long Term,440132.0,181070.0,11847.0,732.0


In [14]:
# Restore the fitted pipeline from the dill file written earlier.
# NOTE(review): dill.load, like pickle, can execute arbitrary code while
# deserialising — only load files from a trusted source.
with open('rf_pipeline.dill', 'rb') as in_strm:
    pipeline = dill.load(in_strm)

In [15]:
# Display the restored pipeline to confirm that its steps survived the round trip.
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Home Ownership',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Home '
                                                                                         'Ownership')),
                                                                 ('nan_inputer',
                                                                  CatNaNInputer()),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Home '
                                                                                 'Ownership'))])),
                                                ('Years in current job',
                                                 Pipeline(steps=[('selector',
                                                

In [16]:
# Score the test rows and keep the second predict_proba column (index 1).
class_probabilities = pipeline.predict_proba(X_test)
preds = class_probabilities[:, 1]

# Store the scores in a one-column CSV, without the index.
pred_df = pd.DataFrame(data={'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [17]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# precision/recall contain one more point than thresholds (the final point has
# no associated threshold), so only the points that do have one are candidates.
precision_at_thr = precision[:-1]
recall_at_thr = recall[:-1]

# F-score per threshold. Where precision + recall == 0 the plain formula gives
# 0/0 = NaN, and np.argmax would then select that NaN; define F as 0 there.
pr_sum = precision_at_thr + recall_at_thr
fscore = np.divide(
    2 * precision_at_thr * recall_at_thr,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.48399523069063677, F-Score=0.585, Precision=0.514, Recall=0.678


In [18]:
# Загружаем обученные модели
# Load the trained pipeline for use by the web service.
# NOTE: this rebinds `model`, which earlier held the bare RandomForestClassifier,
# to the full fitted pipeline; the /predict handler calls model.predict_proba
# on raw feature columns and therefore relies on this binding.
# NOTE(review): dill.load, like pickle, can execute arbitrary code while
# deserialising — only load files from a trusted source.
with open('rf_pipeline.dill', 'rb') as in_strm:
    model = dill.load(in_strm)

In [41]:
# Reload the held-out split from disk (the same files that were read earlier
# in the notebook).
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

Создаем сервис для обработки запросов к модели

In [19]:
from flask import Flask, request, jsonify

In [22]:
# Flask application object and its request handlers
app = Flask(__name__)



@app.route("/", methods=["GET"])
def general():
    """Landing route: respond to GET / with a plain-text greeting."""
    return "Welcome to prediction process"

# Feature names read from the request JSON; they are also the column names of
# the single-row frame handed to the model.
_FEATURE_COLUMNS = (
    'Home Ownership',
    'Annual Income',
    'Years in current job',
    'Tax Liens',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Number of Credit Problems',
    'Months since last delinquent',
    'Bankruptcies',
    'Purpose',
    'Term',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
)


def _build_feature_frame(payload):
    """Turn a request payload (dict) into a one-row DataFrame of model features.

    A feature is treated as missing (NaN) only when its key is absent, its
    value is null, or its value is an empty string. Numeric zeros are real
    values and are passed through unchanged.
    """
    row = {}
    for name in _FEATURE_COLUMNS:
        value = payload.get(name)
        is_missing = value is None or value == ''
        row[name] = [np.nan if is_missing else value]
    return pd.DataFrame(row)


@app.route('/predict', methods=['POST'])
def predict():
    """Score one applicant described by the JSON body of a POST request.

    Returns a JSON object with:
      * ``success``     - True when a prediction was produced;
      * ``predictions`` - value of the second predict_proba column for the row;
      * ``description`` - the request payload echoed back.
    A body that is not a JSON object yields HTTP 400 with ``success`` False
    and an ``error`` message.
    """
    data = {"success": False}

    # silent=True returns None instead of raising on a non-JSON body, so the
    # client gets a structured error rather than an unhandled exception.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        data["error"] = "Request body must be a JSON object"
        return jsonify(data), 400

    preds = model.predict_proba(_build_feature_frame(request_json))

    # Convert the numpy scalar to a plain float before JSON serialisation.
    data["predictions"] = float(preds[0, 1])
    data["description"] = request_json

    data["success"] = True
    print('OK')

    return jsonify(data)


# Start the Flask server with default settings when this code runs as the
# main module.
# NOTE(review): inside a notebook this call presumably blocks the cell until
# the server is interrupted — confirm before adding cells after it.
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [17/Jun/2022 16:25:48] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:25:58] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:12] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:12] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:13] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:13] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:13] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:13] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:14] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:14] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:14] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:15] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:15] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:15] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:16] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:16] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:16] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:17] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:17] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:17] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:18] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:18] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:19] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:19] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:19] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:19] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:20] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:20] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:20] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:21] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:21] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:21] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:21] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:22] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:22] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:22] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:23] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:23] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:23] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:24] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:24] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:24] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:24] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:25] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:25] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:25] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:26] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:26] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:26] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:27] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:27] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:26:27] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:36:55] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:30] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:30] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:30] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:31] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:31] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:32] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:32] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:32] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:32] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:33] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:33] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:33] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:34] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:34] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:34] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:35] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:35] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:35] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:36] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:36] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:36] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:37] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:37] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:37] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:38] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:38] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:39] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:39] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:39] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [17/Jun/2022 16:38:40] "POST /predict HTTP/1.1" 200 -


OK
