1. Для нашего пайплайна (Case1) поэкспериментировать с разными моделями: 1 - бустинг, 2 - логистическая регрессия (не забудьте здесь добавить в cont_transformer стандартизацию - нормирование вещественных признаков)


In [1]:
import pandas as pd
import numpy as np
import dill
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=26)

X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)

X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

In [4]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key
        self.balance = []

    def fit(self, X, y=None):
        if self.key == 'Balance':
            balance_data=list(X['Balance'].unique())
            balance_data.remove(0.0)
            self.balance = balance_data
        return self

    def transform(self, X):
        if self.key == 'Balance':
            X['Balance'] = X['Balance'].apply(lambda x: x if x!=0.0 else random.choice(self.balance))
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [5]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['Balance', 'NumOfProducts', 'EstimatedSalary','CreditScore', 'Age']

In [6]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [7]:
final_transformers

[('Geography',
  Pipeline(steps=[('selector', FeatureSelector(column='Geography')),
                  ('ohe', OHEEncoder(key='Geography'))])),
 ('Gender',
  Pipeline(steps=[('selector', FeatureSelector(column='Gender')),
                  ('ohe', OHEEncoder(key='Gender'))])),
 ('Tenure',
  Pipeline(steps=[('selector', FeatureSelector(column='Tenure')),
                  ('ohe', OHEEncoder(key='Tenure'))])),
 ('HasCrCard',
  Pipeline(steps=[('selector', FeatureSelector(column='HasCrCard')),
                  ('ohe', OHEEncoder(key='HasCrCard'))])),
 ('IsActiveMember',
  Pipeline(steps=[('selector', FeatureSelector(column='IsActiveMember')),
                  ('ohe', OHEEncoder(key='IsActiveMember'))])),
 ('Balance', Pipeline(steps=[('selector', NumberSelector(key='Balance'))])),
 ('NumOfProducts',
  Pipeline(steps=[('selector', NumberSelector(key='NumOfProducts'))])),
 ('EstimatedSalary',
  Pipeline(steps=[('selector', NumberSelector(key='EstimatedSalary'))])),
 ('CreditScore',
  Pipeli

In [8]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [9]:
pipelineGB = Pipeline([
    ('features',feats),
    ('classifier', GradientBoostingClassifier(random_state = 26,
                                              max_depth=4, max_features=0.3, min_samples_leaf=3, min_samples_split=2)),
])
pipelineGB.fit(X_train, y_train)
preds = pipelineGB.predict_proba(X_test)[:, 1]

In [10]:
pipelineGB.steps

[('features',
  FeatureUnion(transformer_list=[('Geography',
                                  Pipeline(steps=[('selector',
                                                   FeatureSelector(column='Geography')),
                                                  ('ohe',
                                                   OHEEncoder(key='Geography'))])),
                                 ('Gender',
                                  Pipeline(steps=[('selector',
                                                   FeatureSelector(column='Gender')),
                                                  ('ohe',
                                                   OHEEncoder(key='Gender'))])),
                                 ('Tenure',
                                  Pipeline(steps=[('selector',
                                                   FeatureSelector(column='Tenure')),
                                                  ('ohe',
                                                   OHEEncoder(

In [11]:
with open('gradboost_pipeline.dill', 'wb') as f:
    dill.dump(pipelineGB, f)

In [12]:
X_test = pd.read_csv("X_test.csv")
X_train = pd.read_csv("y_test.csv")

In [13]:
X_test.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,8885,15749583,Bellucci,686,Germany,Female,38,2,93569.86,3,0,0,10137.34,1
1,4630,15687153,Graham,850,Germany,Male,49,8,98649.55,1,1,0,119174.88,1
2,727,15673570,Olsen,580,France,Male,37,9,0.0,2,0,1,77108.66,0
3,3891,15719579,McIntosh,670,Germany,Female,33,9,84521.48,2,0,1,198017.05,0
4,712,15650288,Summers,634,Germany,Male,35,6,116269.01,1,1,0,129964.94,0


In [14]:
with open('gradboost_pipeline.dill', 'rb') as f:
    pipeline = dill.load(f)

In [15]:
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [16]:
preds = pipeline.predict_proba(X_test)[:,1]

pred_df = pd.DataFrame({'preds':preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [17]:
preds[:10]

array([0.80796085, 0.78210165, 0.0296932 , 0.02068209, 0.30586347,
       0.44818763, 0.19291307, 0.10225029, 0.39568321, 0.53273383])

In [18]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.31440712481323124, F-Score=0.631, Precision=0.622, Recall=0.641


In [19]:
from flask import Flask, request, jsonify

In [None]:
# app = Flask(__name__)
# # run(app)

# @app.route('/a')
# def hello():
#     return('Hello world')

# if __name__ == '__main__':
#     app.run()

In [20]:
with open('gradboost_pipeline.dill', 'rb') as f:
    model = dill.load(f)

In [21]:
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv('y_test.csv')

In [22]:
x =  (600, 'Germany', 'Male', 50, 8, 100000.0, 1, 1, 1, 100000.0)

In [23]:
creditscore = x[0]
geography = x[1]
gender = x[2]
age = x[3]
tenure = x[4]
balance = x[5]
numofproducts = x[6]
hascrcard  = x[7]
isactivemember = x[8]
estimatedsalary = x[9]

In [24]:
preds = model.predict_proba(pd.DataFrame({'Geography': [geography],
                                            'Gender': [gender],
                                            'Tenure': [tenure],
                                            'HasCrCard': [hascrcard],
                                            'IsActiveMember': [isactivemember],
                                            'CreditScore': [creditscore],
                                            'Age': [age],
                                            'Balance': [balance],
                                            'NumOfProducts': [numofproducts],
                                            'EstimatedSalary': [estimatedsalary]}))

In [25]:
preds

array([[0.5451641, 0.4548359]])

In [26]:
preds = model.predict_proba(X_test)

In [27]:
X_test

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,8885,15749583,Bellucci,686,Germany,Female,38,2,93569.86,3,0,0,10137.34,1
1,4630,15687153,Graham,850,Germany,Male,49,8,98649.55,1,1,0,119174.88,1
2,727,15673570,Olsen,580,France,Male,37,9,104016.88,2,0,1,77108.66,0
3,3891,15719579,McIntosh,670,Germany,Female,33,9,84521.48,2,0,1,198017.05,0
4,712,15650288,Summers,634,Germany,Male,35,6,116269.01,1,1,0,129964.94,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,3733,15689598,Dean,722,France,Male,46,6,107508.93,1,1,1,93917.68,1
2496,2839,15646196,Yeh,850,Spain,Female,36,2,155180.56,2,0,0,169415.54,0
2497,4464,15778975,Nnonso,850,Germany,Female,70,1,96947.58,3,1,0,62282.99,1
2498,1379,15682834,Johnstone,715,Spain,Female,35,4,40169.88,2,1,1,199857.47,0


In [28]:
preds

array([[0.19203915, 0.80796085],
       [0.21789835, 0.78210165],
       [0.97436675, 0.02563325],
       ...,
       [0.07076337, 0.92923663],
       [0.95437051, 0.04562949],
       [0.90901741, 0.09098259]])

In [None]:
app = Flask(__name__)

@app.route("/", methods=['GET'])
def general():
    return "Welcome to prediction process"

@app.route("/predict", methods=["POST"])
def predict():
    data = {'success': False}
    
    request_json = request.get_json()
    geography, gender, tenure = '','',''
    balance, numofproducts, estimatedsalary, age = "",'','', ''
    creditscore, age = '',''
    hascrcard, isactivemember = "",''
    if request_json['Geography']:
        geography = request_json['Geography']
    if request_json['Gender']:
        gender = request_json['Gender']
    if request_json['Tenure']:
        tenure = request_json['Tenure']
    if request_json['HasCrCard']:
        hascrcard = request_json['HasCrCard']
    if request_json['IsActiveMember']:
        isactivemember = request_json['IsActiveMember']
    if request_json['CreditScore']:
        creditscore = request_json['CreditScore']
    if request_json['Age']:
        age = request_json['Age']
    if request_json['Balance']:
        balance = request_json['Balance']
    if request_json['NumOfProducts']:
        numofproducts = request_json['NumOfProducts']
    if request_json['EstimatedSalary']:
        estimatedsalary = request_json['EstimatedSalary']
    
    preds = model.predict_proba(pd.DataFrame({'Geography': [geography],
                                            'Gender': [gender],
                                            'Tenure': [tenure],
                                            'HasCrCard': [hascrcard],
                                            'IsActiveMember': [isactivemember],
                                            'CreditScore': [creditscore],
                                            'Age': [age],
                                            'Balance': [balance],
                                            'NumOfProducts': [numofproducts],
                                            'EstimatedSalary': [estimatedsalary]}))
    data['predictions'] = preds[:, 1][0]
    data['success'] = True
    print('OK')
    
    return jsonify(data)

if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [18/Feb/2023 01:46:47] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [18/Feb/2023 01:46:51] "POST /predict HTTP/1.1" 200 -


OK


127.0.0.1 - - [18/Feb/2023 01:46:56] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [18/Feb/2023 01:46:56] "POST /predict HTTP/1.1" 200 -


OK
OK


127.0.0.1 - - [18/Feb/2023 01:47:03] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [18/Feb/2023 01:47:03] "POST /predict HTTP/1.1" 200 -
[2023-02-18 01:47:03,778] ERROR in app: Exception on /predict [POST]
Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "C:\Users\ASUS\AppData\Local\Temp\ipykernel_5440\1717074721.py", line 37, in predict
    preds = model.predict_proba(pd.DataFrame({'Ge

OK
OK


127.0.0.1 - - [18/Feb/2023 01:49:18] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [18/Feb/2023 01:49:18] "POST /predict HTTP/1.1" 200 -
[2023-02-18 01:49:18,247] ERROR in app: Exception on /predict [POST]
Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "C:\Users\ASUS\AppData\Local\Temp\ipykernel_5440\1717074721.py", line 37, in predict
    preds = model.predict_proba(pd.DataFrame({'Ge

OK
OK


127.0.0.1 - - [18/Feb/2023 01:50:28] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [18/Feb/2023 01:50:28] "POST /predict HTTP/1.1" 200 -


OK
OK


127.0.0.1 - - [18/Feb/2023 01:50:51] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [18/Feb/2023 01:50:51] "POST /predict HTTP/1.1" 200 -
[2023-02-18 01:50:51,122] ERROR in app: Exception on /predict [POST]
Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\ASUS\anaconda3\envs\gpu2\lib\site-packages\flask\app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "C:\Users\ASUS\AppData\Local\Temp\ipykernel_5440\1717074721.py", line 37, in predict
    preds = model.predict_proba(pd.DataFrame({'Ge

OK
OK
