In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("tendency_to_obesity.csv")
df

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [3]:
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

# Кодирование значений целевой переменной

In [4]:
le = LabelEncoder()
y = le.fit_transform(df.NObeyesdad)

In [5]:
X = df.drop(['NObeyesdad'], axis = 1)

# Избавление от категориальности переменных

In [6]:
categorical_columns = ['Gender', 'family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS']

In [7]:
for i in categorical_columns:
    print (X[i].unique())

['Female' 'Male']
['yes' 'no']
['no' 'yes']
['Sometimes' 'Frequently' 'Always' 'no']
['no' 'yes']
['no' 'yes']
['no' 'Sometimes' 'Frequently' 'Always']
['Public_Transportation' 'Walking' 'Automobile' 'Motorbike' 'Bike']


In [8]:
X = pd.get_dummies(X, columns = categorical_columns)

In [9]:
X

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21.000000,1.620000,64.000000,2.0,3.0,2.000000,0.000000,1.000000,1,0,...,0,0,0,0,1,0,0,0,1,0
1,21.000000,1.520000,56.000000,3.0,3.0,3.000000,3.000000,0.000000,1,0,...,1,0,0,1,0,0,0,0,1,0
2,23.000000,1.800000,77.000000,2.0,3.0,2.000000,2.000000,1.000000,0,1,...,0,0,1,0,0,0,0,0,1,0
3,27.000000,1.800000,87.000000,3.0,3.0,2.000000,2.000000,0.000000,0,1,...,0,0,1,0,0,0,0,0,0,1
4,22.000000,1.780000,89.800000,2.0,1.0,2.000000,0.000000,0.000000,0,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,20.976842,1.710730,131.408528,3.0,3.0,1.728139,1.676269,0.906247,1,0,...,0,0,0,1,0,0,0,0,1,0
2107,21.982942,1.748584,133.742943,3.0,3.0,2.005130,1.341390,0.599270,1,0,...,0,0,0,1,0,0,0,0,1,0
2108,22.524036,1.752206,133.689352,3.0,3.0,2.054193,1.414209,0.646288,1,0,...,0,0,0,1,0,0,0,0,1,0
2109,24.361936,1.739450,133.346641,3.0,3.0,2.852339,1.139107,0.586035,1,0,...,0,0,0,1,0,0,0,0,1,0


In [35]:
X.columns

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender_Female', 'Gender_Male', 'family_history_with_overweight_no',
       'family_history_with_overweight_yes', 'FAVC_no', 'FAVC_yes',
       'CAEC_Always', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
       'SMOKE_no', 'SMOKE_yes', 'SCC_no', 'SCC_yes', 'CALC_Always',
       'CALC_Frequently', 'CALC_Sometimes', 'CALC_no', 'MTRANS_Automobile',
       'MTRANS_Bike', 'MTRANS_Motorbike', 'MTRANS_Public_Transportation',
       'MTRANS_Walking'],
      dtype='object')

# Нормировка данных

In [10]:
scaler = StandardScaler()
scaler.fit_transform(X)

array([[-0.52212439, -0.87558934, -0.86255819, ..., -0.07237469,
         0.57972058, -0.16507758],
       [-0.52212439, -1.94759928, -1.16807699, ..., -0.07237469,
         0.57972058, -0.16507758],
       [-0.20688898,  1.05402854, -0.36609013, ..., -0.07237469,
         0.57972058, -0.16507758],
       ...,
       [-0.28190933,  0.54167211,  1.79886776, ..., -0.07237469,
         0.57972058, -0.16507758],
       [ 0.00777624,  0.40492652,  1.78577968, ..., -0.07237469,
         0.57972058, -0.16507758],
       [-0.10211908,  0.39834438,  1.7905916 , ..., -0.07237469,
         0.57972058, -0.16507758]])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
import dill as pickle
filename_norm = 'model_Obesity_Normal.pk'

In [21]:
with open('/Users/danilapadarouski/Desktop/DataScience /Ноутбуки/DZ_5/'+filename_norm, 'wb') as file:
	pickle.dump(scaler, file)

In [23]:
# считывание модели из файла
with open('/Users/danilapadarouski/Desktop/DataScience /Ноутбуки/DZ_5/'+filename_norm ,'rb') as f:
    loaded_model_Normal = pickle.load(f)

In [24]:
loaded_model_Normal.transform(X.iloc[[10]])

array([[ 0.26596413,  1.59003351,  0.70322569,  1.08834176,  0.40415272,
         1.61875854,  1.16382038,  2.20461814, -0.98822657,  0.98822657,
        -0.47229133,  0.47229133, -0.36234913,  0.36234913, -0.16047791,
         2.77905389, -2.2585739 , -0.15734447,  0.14590027, -0.14590027,
         0.21827203, -0.21827203, -0.02177002, -0.18519426,  0.71188543,
        -0.65886513, -0.52564235, -0.05768012, -0.07237469,  0.57972058,
        -0.16507758]])

In [12]:
target_class = df.NObeyesdad.unique()

In [13]:
estimators = [('log',  LogisticRegression(solver="newton-cg", random_state=42, C = 115)),\
              ("dtr",  DecisionTreeClassifier(max_depth = 13, random_state=42))]

In [14]:
%%time
stack_clf = StackingClassifier(estimators = estimators, final_estimator = DecisionTreeClassifier(max_depth = 13, random_state=42), cv = 4, n_jobs = -1, verbose = 3,  passthrough = True)

CPU times: user 55 µs, sys: 1 µs, total: 56 µs
Wall time: 62.9 µs


In [15]:
%%time
stack_clf.fit(X_train, y_train)

CPU times: user 77.7 ms, sys: 60.4 ms, total: 138 ms
Wall time: 19.1 s


StackingClassifier(cv=4,
                   estimators=[('log',
                                LogisticRegression(C=115, random_state=42,
                                                   solver='newton-cg')),
                               ('dtr',
                                DecisionTreeClassifier(max_depth=13,
                                                       random_state=42))],
                   final_estimator=DecisionTreeClassifier(max_depth=13,
                                                          random_state=42),
                   n_jobs=-1, passthrough=True, verbose=3)

In [16]:
report = classification_report(y_test, stack_clf.predict(X_test), target_names=target_class)
print(report)

                     precision    recall  f1-score   support

      Normal_Weight       0.92      1.00      0.96        56
 Overweight_Level_I       0.95      0.85      0.90        62
Overweight_Level_II       0.94      0.96      0.95        78
     Obesity_Type_I       0.96      0.95      0.96        58
Insufficient_Weight       1.00      1.00      1.00        63
    Obesity_Type_II       0.91      0.89      0.90        56
   Obesity_Type_III       0.92      0.94      0.93        50

           accuracy                           0.94       423
          macro avg       0.94      0.94      0.94       423
       weighted avg       0.94      0.94      0.94       423



In [26]:
import os 
import json
from joblib import dump, load

In [28]:
filename = 'model_Obesity.pk' # имя файла для сохранения модели
with open('/Users/danilapadarouski/Desktop/DataScience /Ноутбуки/DZ_5/'+filename, 'wb') as file:
	pickle.dump(stack_clf, file)
with open('/Users/danilapadarouski/Desktop/DataScience /Ноутбуки/DZ_5/'+filename ,'rb') as f:
    loaded_model = pickle.load(f)

In [29]:
target_asced = le.fit_transform(target_class)

In [30]:
le.inverse_transform(loaded_model.predict(X_test.iloc[[2]]))

array(['Insufficient_Weight'], dtype=object)

In [31]:
X_test.iloc[[2]]

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
420,18.0,1.85,60.0,3.0,4.0,2.0,2.0,0.0,0,1,...,1,0,0,1,0,1,0,0,0,0


In [32]:
# оценка качества модели по X_test
print("Validation set score: {:.2f}".format(stack_clf.score(X_test, y_test)))

Validation set score: 0.94


In [None]:
from flask import Flask, request, jsonify
import json
import dill as pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# creating a Flask application
app = Flask(__name__)

# Load the model Normalize
filename_norm = 'model_Obesity_Normal.pk'
with open('/Users/danilapadarouski/Desktop/DataScience /Ноутбуки/DZ_5/'+filename_norm ,'rb') as f:
    loaded_model_Normal = pickle.load(f)


# Load the model
filename = 'model_Obesity.pk'
with open('/Users/danilapadarouski/Desktop/DataScience /Ноутбуки/DZ_5/'+filename ,'rb') as f:
    loaded_model = pickle.load(f)

# creating target
target_class = np.array(['Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II',
       'Obesity_Type_I', 'Insufficient_Weight', 'Obesity_Type_II',
       'Obesity_Type_III'], dtype=object)
le = LabelEncoder()
target_asced = le.fit_transform(target_class)

# creating predict url and only allowing post requests.
@app.route('/predict', methods=['POST'])
def predict():
    # Get data from Post request
    data = request.get_json()
    # converting a json request to the model format
    df = pd.read_json(data, orient='split')
    print('\n\n\n Request  \n', df.head())
    # Make prediction
    df_norm = pd.DataFrame(data=loaded_model_Normal.transform(df),
              columns=['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender_Female', 'Gender_Male', 'family_history_with_overweight_no',
       'family_history_with_overweight_yes', 'FAVC_no', 'FAVC_yes',
       'CAEC_Always', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
       'SMOKE_no', 'SMOKE_yes', 'SCC_no', 'SCC_yes', 'CALC_Always',
       'CALC_Frequently', 'CALC_Sometimes', 'CALC_no', 'MTRANS_Automobile',
       'MTRANS_Bike', 'MTRANS_Motorbike', 'MTRANS_Public_Transportation',
       'MTRANS_Walking'])
    pred = le.inverse_transform(loaded_model.predict(df_norm))
    print('\n\n\n Prediction   ', pred, '\n\n')
    # returning a prediction as json
    responses = pd.Series(pred).to_json(orient='values')
    return (responses)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=3000, debug=True)

In [36]:
import json
import requests

In [37]:
header = {'Content-Type': 'application/json', \
                  'Accept': 'application/json'}

In [38]:
# id строчки запроса из теста
id_zap = 10

In [39]:
# запрос json для предсказания
data_zap = X.iloc[[id_zap]].to_json(orient="split")
data_zap

'{"columns":["Age","Height","Weight","FCVC","NCP","CH2O","FAF","TUE","Gender_Female","Gender_Male","family_history_with_overweight_no","family_history_with_overweight_yes","FAVC_no","FAVC_yes","CAEC_Always","CAEC_Frequently","CAEC_Sometimes","CAEC_no","SMOKE_no","SMOKE_yes","SCC_no","SCC_yes","CALC_Always","CALC_Frequently","CALC_Sometimes","CALC_no","MTRANS_Automobile","MTRANS_Bike","MTRANS_Motorbike","MTRANS_Public_Transportation","MTRANS_Walking"],"index":[10],"data":[[26.0,1.85,105.0,3.0,3.0,3.0,2.0,2.0,0,1,0,1,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0]]}'

In [40]:
# запрос в виде dataframe 
X.iloc[[id_zap]]

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
10,26.0,1.85,105.0,3.0,3.0,3.0,2.0,2.0,0,1,...,0,0,0,1,0,0,0,0,1,0


In [41]:
# для примера обратное преобразование json в dataframe
df = pd.read_json(data_zap, orient='split')
df

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
10,26,1.85,105,3,3,3,2,2,0,1,...,0,0,0,1,0,0,0,0,1,0


In [42]:
"""POST <url>/predict
"""
resp = requests.post("http://127.0.0.1:3000/predict", \
                    data = json.dumps(data_zap),\
                    headers= header)

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=3000): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ffcb8e5a950>: Failed to establish a new connection: [Errno 61] Connection refused'))