Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import dill
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

%matplotlib inline

Загрузка данных

In [2]:
# Read the churn dataset from the working directory and preview the first three rows
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
# split the data into train/test (the target column 'Exited' also stays inside X_*)
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=26)

# persist every part next to the notebook, without the index column
for file_name, part in (
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
):
    part.to_csv(file_name, index=None)

In [4]:
# data transformers
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks ``column`` out of the incoming data via ``X[column]``."""

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``."""
        selected = X[self.column]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data

    When ``key == 'Balance'`` the transformer additionally replaces zero balances
    with a value drawn at random from the non-zero balances seen during ``fit``.

    Parameters
    ----------
    key : str
        Name of the column to select.
    random_state : int or None, default=None
        Seed for the random replacement of zero balances. With ``None`` the
        module-level ``random`` generator is used (draws differ between calls);
        with an integer every ``transform`` call produces the same draws.
    """
    def __init__(self, key, random_state=None):
        self.key = key
        self.random_state = random_state
        self.balance = []

    def fit(self, X, y=None):
        """Remember the distinct non-zero balances (only when ``key == 'Balance'``)."""
        if self.key == 'Balance':
            # Filtering (instead of list.remove) does not raise when the data
            # happens to contain no zero balance at all.
            self.balance = [value for value in X['Balance'].unique() if value != 0.0]
        return self

    def transform(self, X):
        """Return ``X[[key]]`` as a one-column DataFrame; the caller's ``X`` is never modified."""
        selected = X[[self.key]]
        # Without any known non-zero balance there is nothing to draw from,
        # so zero balances are then left untouched.
        if self.key == 'Balance' and self.balance:
            rng = random if self.random_state is None else random.Random(self.random_state)
            # Work on a copy so the imputed values do not leak back into the input frame.
            selected = selected.copy()
            selected['Balance'] = selected['Balance'].apply(
                lambda value: value if value != 0.0 else rng.choice(self.balance)
            )
        return selected
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder built on ``pd.get_dummies`` with ``key`` as the column prefix.

    The dummy columns produced during ``fit`` are remembered; ``transform`` always
    returns exactly those columns, in that order, adding all-zero columns for
    categories that are absent from the data being transformed.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy column names generated for ``X``."""
        dummies = pd.get_dummies(X, prefix=self.key)
        self.columns = list(dummies.columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result with the columns seen in ``fit``."""
        encoded = pd.get_dummies(X, prefix=self.key)
        present = list(encoded.columns)
        absent = [name for name in self.columns if name not in present]
        for name in absent:
            encoded[name] = 0
        return encoded[self.columns]

In [5]:
# Columns that go through FeatureSelector + OHEEncoder in the transformer-building cell
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
# Columns that go through NumberSelector in the transformer-building cell
continuous_columns = ['Balance', 'NumOfProducts', 'EstimatedSalary','CreditScore', 'Age']

In [6]:
# One (column name, pipeline) pair per feature: categorical columns first
# (select + one-hot encode), then continuous columns (select only).
final_transformers = [
    (
        name,
        Pipeline([
            ('selector', FeatureSelector(column=name)),
            ('ohe', OHEEncoder(key=name)),
        ]),
    )
    for name in categorical_columns
]

final_transformers.extend(
    (name, Pipeline([('selector', NumberSelector(key=name))]))
    for name in continuous_columns
)

In [7]:
# Display the assembled list of (name, pipeline) pairs
final_transformers

[('Geography',
  Pipeline(steps=[('selector', FeatureSelector(column='Geography')),
                  ('ohe', OHEEncoder(key='Geography'))])),
 ('Gender',
  Pipeline(steps=[('selector', FeatureSelector(column='Gender')),
                  ('ohe', OHEEncoder(key='Gender'))])),
 ('Tenure',
  Pipeline(steps=[('selector', FeatureSelector(column='Tenure')),
                  ('ohe', OHEEncoder(key='Tenure'))])),
 ('HasCrCard',
  Pipeline(steps=[('selector', FeatureSelector(column='HasCrCard')),
                  ('ohe', OHEEncoder(key='HasCrCard'))])),
 ('IsActiveMember',
  Pipeline(steps=[('selector', FeatureSelector(column='IsActiveMember')),
                  ('ohe', OHEEncoder(key='IsActiveMember'))])),
 ('Balance', Pipeline(steps=[('selector', NumberSelector(key='Balance'))])),
 ('NumOfProducts',
  Pipeline(steps=[('selector', NumberSelector(key='NumOfProducts'))])),
 ('EstimatedSalary',
  Pipeline(steps=[('selector', NumberSelector(key='EstimatedSalary'))])),
 ('CreditScore',
  Pipeli

In [8]:
# Combine the per-column pipelines into one feature block
feats = FeatureUnion(transformer_list=final_transformers)

# Feature-only pipeline wrapping the same union
feature_processing = Pipeline(steps=[('feats', feats)])

In [9]:
# Full model: the feature union followed by a gradient boosting classifier
pipelineGB = Pipeline(steps=[
    ('features', feats),
    (
        'classifier',
        GradientBoostingClassifier(
            random_state=26,
            max_depth=4,
            max_features=0.3,
            min_samples_leaf=3,
            min_samples_split=2,
        ),
    ),
])
pipelineGB.fit(X_train, y_train)

# keep only the second column of predict_proba
preds = pipelineGB.predict_proba(X_test)[:, 1]

In [10]:
# Display the (name, estimator) steps of the fitted pipeline
pipelineGB.steps

[('features',
  FeatureUnion(transformer_list=[('Geography',
                                  Pipeline(steps=[('selector',
                                                   FeatureSelector(column='Geography')),
                                                  ('ohe',
                                                   OHEEncoder(key='Geography'))])),
                                 ('Gender',
                                  Pipeline(steps=[('selector',
                                                   FeatureSelector(column='Gender')),
                                                  ('ohe',
                                                   OHEEncoder(key='Gender'))])),
                                 ('Tenure',
                                  Pipeline(steps=[('selector',
                                                   FeatureSelector(column='Tenure')),
                                                  ('ohe',
                                                   OHEEncoder(

In [11]:
# save the fitted pipeline to disk
with open('gradboost_pipeline.dill', mode='wb') as pipeline_file:
    dill.dump(pipelineGB, pipeline_file)

In [12]:
# load the saved test data
X_test = pd.read_csv("X_test.csv")
# The labels file was previously read into X_train, which overwrote the training
# frame and never reloaded the labels. The file has a single column, so take it
# as a Series to match the shape y_test had after the split.
y_test = pd.read_csv("y_test.csv").iloc[:, 0]

In [13]:
# Preview the first rows of the reloaded test features
X_test.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,8885,15749583,Bellucci,686,Germany,Female,38,2,93569.86,3,0,0,10137.34,1
1,4630,15687153,Graham,850,Germany,Male,49,8,98649.55,1,1,0,119174.88,1
2,727,15673570,Olsen,580,France,Male,37,9,0.0,2,0,1,77108.66,0
3,3891,15719579,McIntosh,670,Germany,Female,33,9,84521.48,2,0,1,198017.05,0
4,712,15650288,Summers,634,Germany,Male,35,6,116269.01,1,1,0,129964.94,0


In [14]:
# Restore the pipeline saved earlier in this notebook.
# NOTE: dill.load can execute arbitrary code contained in the file, so only load trusted files.
with open('gradboost_pipeline.dill', mode='rb') as pipeline_file:
    pipeline = dill.load(pipeline_file)

In [15]:
# Display the restored pipeline
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [16]:
# test the pipeline: score the test set with the restored model
preds = pipeline.predict_proba(X_test)[:, 1]

# store the scores as a single-column table
pred_df = pd.Series(preds, name='preds').to_frame()
pred_df.to_csv("test_predictions.csv", index=None)

In [17]:
# Show the first ten predicted probabilities
preds[:10]

array([0.74108133, 0.69898457, 0.02166155, 0.02004377, 0.35596888,
       0.41686125, 0.26229759, 0.10847401, 0.36091553, 0.41793982])

In [18]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score is undefined where precision + recall == 0; score those points as 0
# so that a NaN produced by 0/0 can never be selected by argmax.
denominator = precision + recall
fscore = np.divide(
    2 * precision * recall,
    denominator,
    out=np.zeros_like(denominator, dtype=float),
    where=denominator > 0,
)
# precision/recall carry one extra trailing point that has no threshold,
# so search only the points that can be indexed in `thresholds`.
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.3732355037097012, F-Score=0.627, Precision=0.655, Recall=0.602


In [19]:
from flask import Flask, request, jsonify

In [20]:
# load the model that the server will use
# NOTE: dill.load can execute arbitrary code contained in the file, so only load trusted files.
with open('gradboost_pipeline.dill', mode='rb') as model_file:
    model = dill.load(model_file)

In [21]:
# read the saved test features and labels back as DataFrames
X_test, y_test = (pd.read_csv(path) for path in ("X_test.csv", "y_test.csv"))

In [None]:
# start the server: create the Flask application that the routes below attach to
app = Flask(__name__)

@app.route("/", methods=['GET'])
def general():
    """Landing route: answer GET / with a fixed welcome message."""
    return "Welcome to prediction process"

@app.route("/predict", methods=["POST"])
def predict():
    """
    Score a single client described by the JSON body of a POST request.

    The body is expected to be a JSON object with the model's feature names as
    keys. A feature that is missing or has a falsy value falls back to its
    default ('' for text features, 0 / 0.0 for numeric ones).

    Returns a JSON object with ``success`` and, on success, ``predictions``
    (the second column of ``model.predict_proba`` for the submitted row).
    A body that is not a JSON object yields HTTP 400 with ``success: false``.
    """
    data = {'success': False}

    # silent=True returns None instead of raising on a missing/invalid JSON body
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        data['error'] = 'Request body must be a JSON object'
        return jsonify(data), 400

    # feature name -> value used when the request omits it or sends a falsy value
    feature_defaults = {
        'Geography': '',
        'Gender': '',
        'Tenure': 0,
        'HasCrCard': 0,
        'IsActiveMember': 0,
        'CreditScore': 0,
        'Age': 0,
        'Balance': 0.0,
        'NumOfProducts': 0,
        'EstimatedSalary': 0.0,
    }
    # .get() avoids a KeyError (HTTP 500) when a feature is absent from the request
    features = {
        name: [request_json.get(name) or default]
        for name, default in feature_defaults.items()
    }

    preds = model.predict_proba(pd.DataFrame(features))
    # convert the numpy scalar to a plain float before JSON serialisation
    data['predictions'] = float(preds[:, 1][0])
    data['success'] = True
    print('OK')

    return jsonify(data)

# Launch the Flask server when this code runs as the main module (true inside the notebook kernel)
if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [20/Feb/2023 22:13:08] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [20/Feb/2023 22:13:11] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [20/Feb/2023 22:13:15] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:15] "POST /predict HTTP/1.1" 200 -


OK
OK


127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -


OK
OK
OK
OK
OK
OK
OK
OK


127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:19] "POST /predict HTTP/1.1" 200 -


OK
OK


127.0.0.1 - - [20/Feb/2023 22:13:41] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:41] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:41] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:41] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:41] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:41] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -


OK
OK
OK
OK
OK
OK
OK
OK
OK

127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -



OK
OK
OK
OK
OK
OK
OK


127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -


OK
OK
OK
OK
OK
OK
OK
OK


127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -


OK
OK
OK
OK
OK
OK
OK


127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -


OK
OK
OK
OK
OK
OK


127.0.0.1 - - [20/Feb/2023 22:13:42] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -


OK
OK
OK
OK
OK
OK
OK
OK

127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2023 22:13:43] "POST /predict HTTP/1.1" 200 -



OK
OK
OK
OK
OK
