### Домашняя работа к Уроку 9 - Итоговый Проект
### Студент: Абрамов А.В.

In [60]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score

#working with text
from sklearn.feature_extraction.text import TfidfVectorizer

#normalizing data
from sklearn.preprocessing import StandardScaler

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

import time
import sklearn.datasets

from flask import Flask, request, jsonify

## Chapter 1. Pipeline training

In [61]:
# Load the data (we will use the diabetes dataset)
df = pd.read_csv('diabetes_prediction_dataset.csv')
df.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0


In [62]:
# Count the rows per target class to see how balanced the labels are
df['diabetes'].value_counts()

0    91500
1     8500
Name: diabetes, dtype: int64

In [63]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='diabetes'),
    df['diabetes'],
    test_size=0.3,
    random_state=42,
)

# Persist every split (test files first, then train files), leaving the index out of the CSV.
for split_frame, csv_name in (
    (X_test, 'X_test.csv'),
    (y_test, 'y_test.csv'),
    (X_train, 'X_train.csv'),
    (y_train, 'y_train.csv'),
):
    split_frame.to_csv(csv_name, index=None)

In [64]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that picks one column out of its input by key.

    The selection is plain ``X[key]`` indexing; for a pandas DataFrame and a
    single column label that yields a one-dimensional Series.
    """
    def __init__(self, key):
        # Kept under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that picks one column out of its input, keeping it two-dimensional.

    Intended for numeric columns. The selection is ``X[[key]]`` (a list with a
    single key), so for a pandas DataFrame the result is a one-column DataFrame
    rather than a Series.
    """
    def __init__(self, key):
        # Kept under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]``."""
        wanted_columns = [self.key]
        return X[wanted_columns]

In [65]:
# Columns that are selected and scaled as model inputs
features = ['bmi', 'HbA1c_level', 'blood_glucose_level']
# Name of the label column
# NOTE(review): not referenced again in the visible cells, which spell 'diabetes' out directly
target = 'diabetes'

# Holds the (feature name, per-feature pipeline) pairs that are passed to FeatureUnion
scaled_features = []

In [66]:
# One (name, sub-pipeline) pair per feature: select the column, then standardise it.
# The list is rebuilt from `features` instead of appended to, so re-running this
# cell cannot pile up duplicate entries (FeatureUnion rejects non-unique names).
scaled_features = [
    (
        feature,
        Pipeline([
            ('selector', NumberSelector(key=feature)),
            ('standard', StandardScaler()),
        ]),
    )
    for feature in features
]

In [67]:
# Display the assembled (name, pipeline) pairs
scaled_features

[('bmi',
  Pipeline(steps=[('selector', NumberSelector(key='bmi')),
                  ('standard', StandardScaler())])),
 ('HbA1c_level',
  Pipeline(steps=[('selector', NumberSelector(key='HbA1c_level')),
                  ('standard', StandardScaler())])),
 ('blood_glucose_level',
  Pipeline(steps=[('selector', NumberSelector(key='blood_glucose_level')),
                  ('standard', StandardScaler())]))]

In [68]:
# Combine the per-feature pipelines into a single transformer; their outputs become the columns of one matrix
feats = FeatureUnion(scaled_features)

In [69]:
# Wrap the feature union in a pipeline of its own
feature_processing = Pipeline([('feats', feats)])

# Sanity check: fit on the training split and display the transformed feature matrix
feature_processing.fit_transform(X_train)

array([[-6.19069481e-04, -4.92624842e-01,  4.14557732e-01],
       [-6.19069481e-04, -1.89423192e+00,  1.69109989e-01],
       [-6.19069481e-04, -1.89423192e+00, -1.99061626e-01],
       ...,
       [-1.79350769e-01,  2.54898933e-01, -1.30357647e+00],
       [-3.58082468e-01,  6.28660821e-01,  4.88192055e-01],
       [ 1.00864183e-01, -4.92624842e-01,  5.12736829e-01]])

In [70]:
# Add the Logistic Regression classifier

# Full model: feature selection and scaling first, then the classifier with default settings
pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression()),
])

In [71]:
# Fit the feature transformers and the classifier on the training split
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('bmi',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='bmi')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('HbA1c_level',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='HbA1c_level')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('blood_glucose_level',
                                                 Pipeline(steps=[('selector',
                 

In [72]:
# Our pipeline, step by step
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('bmi',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='bmi')),
                                                  ('standard',
                                                   StandardScaler())])),
                                 ('HbA1c_level',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='HbA1c_level')),
                                                  ('standard',
                                                   StandardScaler())])),
                                 ('blood_glucose_level',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='blood_glucose_level')),
                                                  ('standard',
                                                   Sta

In [73]:
# Serialise the pipeline object to disk with dill so it can be loaded back later.
with open('diabetes_logreg_pipeline.dill', mode='wb') as out_strm:
    dill.dump(pipeline, out_strm)

## Chapter 2. Prediction testing

In [74]:
# Peek at the first rows of the held-out features
X_test.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
75721,Female,13.0,0,0,No Info,20.82,5.8,126
80184,Female,3.0,0,0,No Info,21.0,5.0,145
19864,Male,63.0,0,0,former,25.32,3.5,200


In [75]:
# Load the pipeline back from the dill file written earlier in this notebook.
# NOTE: dill.load, like pickle, can execute arbitrary code while deserialising,
# so only open files that come from a trusted source.
with open('diabetes_logreg_pipeline.dill', 'rb') as in_strm:
    pipeline = dill.load(in_strm)

In [76]:
# Keep column 1 of predict_proba: the probability of the second class (label 1 for this 0/1 target)
preds = pipeline.predict_proba(X_test)[:, 1]

# One-column frame holding the scores
pred_df = pd.DataFrame({'preds': preds})

In [77]:
# Save the predicted probabilities, leaving the index out of the CSV
pred_df.to_csv("test_diabetes_predictions.csv", index=None)

In [78]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F1 at every point of the curve. Where precision + recall == 0 the ratio is 0/0;
# define F1 as 0 there instead of NaN, because np.argmax would otherwise select
# the NaN entry and report a meaningless "best" threshold.
pr_sum = precision + recall
fscore = np.divide(
    2 * precision * recall,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.5004712543631252, F-Score=0.696, Precision=0.906, Recall=0.565


## Chapter 3. Flask

In [None]:
# Flask application object on which the route handlers are registered
app = Flask(__name__)
# Disabled: optional ngrok tunnel (run_with_ngrok is not imported in the visible cells)
#run_with_ngrok(app)  # Start ngrok when app is run

@app.route("/", methods=["GET"])
def start():
    """Answer GET / with a fixed plain-text greeting."""
    greeting = "Welcome to prediction process"
    return greeting

@app.route('/predict', methods=['POST'])
def predict():
    """Score one record sent as JSON and return the predicted probability.

    The request body should be a JSON object that may carry the keys ``bmi``,
    ``HbA1c_level`` and ``blood_glucose_level``. A key that is absent, null or
    otherwise falsy falls back to 0.

    Returns:
        A JSON response with ``success`` and, when scoring ran,
        ``predictions`` (column 1 of ``pipeline.predict_proba`` for the single
        row) plus the echoed ``blood_glucose_level``. If the body is missing,
        is not valid JSON, or is not a JSON object, the response is
        ``{"success": false}`` with HTTP status 400.
    """
    data = {"success": False}

    # silent=True makes get_json return None for a missing or malformed body
    # instead of raising, so the failure can be reported in the response itself.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        return jsonify(data), 400

    # .get() rather than [] so an omitted key uses the default instead of raising
    # KeyError; `or 0` keeps the earlier rule that falsy values count as 0.
    bmi = request_json.get('bmi') or 0
    HbA1c_level = request_json.get('HbA1c_level') or 0
    blood_glucose_level = request_json.get('blood_glucose_level') or 0

    print(blood_glucose_level)
    preds = pipeline.predict_proba(pd.DataFrame({'bmi': [bmi],
                                              'HbA1c_level': [HbA1c_level],
                                              'blood_glucose_level': [blood_glucose_level]}))
    # Cast the NumPy scalar to a built-in float so it serialises as a plain JSON number.
    data['predictions'] = float(preds[0, 1])
    data['blood_glucose_level'] = blood_glucose_level
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


# Start Flask's built-in server on port 8080 when this code runs as the main module;
# the call blocks and serves requests until it is interrupted.
if __name__ == '__main__':
    app.run(port=8080)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8080/ (Press CTRL+C to quit)
127.0.0.1 - - [19/Jul/2023 00:59:14] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 00:59:20] "POST //predict HTTP/1.1" 200 -


180
OK


127.0.0.1 - - [19/Jul/2023 00:59:44] "POST //predict HTTP/1.1" 200 -


200
OK


127.0.0.1 - - [19/Jul/2023 01:01:33] "POST //predict HTTP/1.1" 200 -


200
OK


127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:47] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:07:4

126.0
OK
145.0
OK
200.0
OK
126.0
OK
200.0
OK
200.0
OK
158.0
OK
158.0
OK
145.0
OK
90.0
OK
130.0
OK
90.0
OK
80.0
OK
200.0
OK
130.0
OK
126.0
OK
159.0
OK
100.0
OK
200.0
OK
160.0
OK


127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:31] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2023 01:08:3

126.0
OK
145.0
OK
200.0
OK
126.0
OK
200.0
OK
200.0
OK
158.0
OK
158.0
OK
145.0
OK
90.0
OK
130.0
OK
90.0
OK
80.0
OK
200.0
OK
130.0
OK
126.0
OK
159.0
OK
100.0
OK
200.0
OK
160.0
OK


In [None]:
# Example data: one sample set of values and the JSON body built from them
bmi = 26.7
HbA1c_level = 6.7
blood_glucose_level = 180

body = dict(
    bmi=bmi,
    HbA1c_level=HbA1c_level,
    blood_glucose_level=blood_glucose_level,
)

In [None]:
# POST the example body to /predict through Flask's test client and keep the decoded JSON reply.
with app.test_client() as client:
    response = client.post('/predict', json=body)
    json_data = response.get_json()

# Show the reply as the cell output
json_data