## Machine Learning models as APIs using Flask

### 1. Python Environment Setup & Flask Basics

### 2. Creating a Machine Learning Model

In [1]:
import os 
import json
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings("ignore")



- 数据集：训练集和测试集

In [2]:
!ls ../data/

test.csv     training.csv


In [3]:
data = pd.read_csv('../data/training.csv')

In [4]:
list(data.columns)

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

In [5]:
data.shape

(614, 13)

- 找到列中的缺失值

In [6]:
for _ in data.columns:
    print("The number of null values in:{} == {}".format(_, data[_].isnull().sum()))

The number of null values in:Loan_ID == 0
The number of null values in:Gender == 13
The number of null values in:Married == 3
The number of null values in:Dependents == 15
The number of null values in:Education == 0
The number of null values in:Self_Employed == 32
The number of null values in:ApplicantIncome == 0
The number of null values in:CoapplicantIncome == 0
The number of null values in:LoanAmount == 22
The number of null values in:Loan_Amount_Term == 14
The number of null values in:Credit_History == 50
The number of null values in:Property_Area == 0
The number of null values in:Loan_Status == 0


- 创建 `training` 和 `testing` 数据集:

In [7]:
pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\
            'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']

X_train, X_test, y_train, y_test = train_test_split(data[pred_var], 
                                                    data['Loan_Status'],
                                                    test_size=0.25, 
                                                    random_state=42)

- 把处理过程写到一个类中`pre-processing` 

__custom pre-processing Scikit-learn `estimator`__

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessing(BaseEstimator, TransformerMixin):
    """Custom Pre-Processing estimator for our use-case
    """
    
    def __init__(self):
        pass

    def transform(self, df):
        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome',\
                    'CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']
        
        df = df[pred_var]
        
        df['Dependents'] = df['Dependents'].fillna(0)
        df['Self_Employed'] = df['Self_Employed'].fillna('No')
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)
        df['Credit_History'] = df['Credit_History'].fillna(1)
        df['Married'] = df['Married'].fillna('No')
        df['Gender'] = df['Gender'].fillna('Male')
        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)
        
        gender_values = {'Female' : 0, 'Male' : 1} 
        married_values = {'No' : 0, 'Yes' : 1}
        education_values = {'Graduate' : 0, 'Not Graduate' : 1}
        employed_values = {'No' : 0, 'Yes' : 1}
        property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}
        dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
        
        df.replace(
            {
                'Gender': gender_values, 
                'Married': married_values, 
                'Education': education_values,
                'Self_Employed': employed_values, 
                'Property_Area': property_values,
                'Dependents': dependent_values           
            }, inplace=True
        )
        
        return df.as_matrix()

    def fit(self, df, y=None, **fit_params):        
        self.term_mean_ = df['Loan_Amount_Term'].mean()
        self.amt_mean_ = df['LoanAmount'].mean()
        return self

- 把`y_train` 和 `y_test` 转换后才 `np.array`:

In [9]:
y_train = y_train.replace({'Y':1, 'N':0}).as_matrix()
y_test = y_test.replace({'Y':1, 'N':0}).as_matrix()

使用管道去确保整个数据预处理流程做到一个`scikit-learn estimator`.

In [10]:
pipe = make_pipeline(PreProcessing(),RandomForestClassifier())

In [11]:
pipe

Pipeline(memory=None,
         steps=[('preprocessing', PreProcessing()),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

- 使用网格搜索法选择参数

使用`Grid Search`，搜索最佳`hyper-parameters` (`degree` for `PolynomialFeatures` & `alpha` for `Ridge`):

- 定义`param_grid`:

In [12]:
param_grid = {"randomforestclassifier__n_estimators" : [10, 20, 30],
             "randomforestclassifier__max_depth" : [None, 6, 8, 10],
             "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20], 
             "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3]}

- 运行`Grid Search`:

In [13]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3)

- 训练模型 `pipeline estimator`:

In [14]:
grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing', PreProcessing()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                        

- 查看Grid Search选择的最佳参数和分数:

In [15]:
print("Best parameters: {}".format(grid.best_params_))

Best parameters: {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__max_leaf_nodes': None, 'randomforestclassifier__min_impurity_split': 0.3, 'randomforestclassifier__n_estimators': 10}


In [16]:
print("Validation set score: {:.2f}".format(grid.score(X_test, y_test)))

Validation set score: 0.77


- 加载测试集:

In [17]:
test_df = pd.read_csv('../data/test.csv', encoding="utf-8")
test_df = test_df.head()

In [18]:
test_df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [19]:
grid.predict(test_df)

array([1, 1, 1, 1, 1])

### __Serialize the Machine Learning Model__

### 3. 保存机器学习模型：序列化和反序列化

python 中一般是使用pickle模块来实现序列化和反序列化：

- 序列化是指将一个对象转换为一个能够存储在一个文件中或者网络上进行传输的字节流的过程。
- 反序列化指的是相反的过程，它是从字节流中提取对象的过程。

In [20]:
list_to_pickle = [1, 'here', 123, 'walker']

#Pickling the list
import pickle

# 序列化
list_pickle = pickle.dumps(list_to_pickle)

In [21]:
list_pickle

b'\x80\x03]q\x00(K\x01X\x04\x00\x00\x00hereq\x01K{X\x06\x00\x00\x00walkerq\x02e.'

When we load the pickle back:

In [22]:
# 反序列化
loaded_pickle = pickle.loads(list_pickle)

In [23]:
loaded_pickle

[1, 'here', 123, 'walker']

NOTE：
* 在我们实际部署机器学习模型的过程中，一般是把训练好的模型序列化到一个文件夹中(一般使用pickle和h5py)
* dill将python用于序列化和反序列化python对象的pickle模块扩展到大多数内置python类型。比如嵌套函数类型的对象pickle不可以存储，但dill可以。dill提供和pickle相同的接口，使用时，“import dill as pickle”即可。

In [24]:
!pip install dill

Looking in indexes: http://mirrors.aliyun.com/pypi/simple/


In [26]:
import dill as pickle
filename = 'model_v2.pk'

In [27]:
with open('../flask_api/models/'+filename, 'wb') as file:
    pickle.dump(grid, file)

这样我们就将我们训练的最佳模型给序列化出来了`model_v2.pk`,我们在Flask使用前先测试下。。。

In [28]:
with open('../flask_api/models/'+filename ,'rb') as f:
    loaded_model = pickle.load(f)
loaded_model.predict(test_df)

array([1, 1, 1, 1, 1])

### 4. Creating an API using Flask

我们的文件目录结构如下图所示：

![Folder Struct](images/flaskapp3.png)

There are three important parts in constructing our wrapper function, **`apicall()`**:

- Getting the **`request`** data (for which predictions are to be made)

- Loading our **`pickled estimator`**

- **`jsonify`** our predictions and send the response back with **`status code: 200`**

HTTP messages are made of a header and a body. As a standard, majority of the body content sent across are in **`json`** format. We'll be sending (**`POST url-endpoint/`**) the incoming data as batch to get predictions.

(__NOTE:__ You can send plain **text, XML, csv or image** directly but for the sake of interchangeability of the format, it is advisable to use **`json`**)

```python
"""Filename: server.py
"""

import os
import pandas as pd
from sklearn.externals import joblib
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def apicall():
	"""API Call
	
	Pandas dataframe (sent as a payload) from API Call
	"""
	try:
		test_json = request.get_json()
		test = pd.read_json(test_json, orient='records')

		#To resolve the issue of TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'
		test['Dependents'] = [str(x) for x in list(test['Dependents'])]

		#Getting the Loan_IDs separated out
		loan_ids = test['Loan_ID']

	except Exception as e:
		raise e
	
	clf = 'model_v2.pk'
	
	if test.empty:
		return(bad_request())
	else:
		#Load the saved model
		print("Loading the model...")
		loaded_model = None
		with open('./models/'+clf,'rb') as f:
			loaded_model = pickle.load(f)

		print("The model has been loaded...doing predictions now...")
		predictions = loaded_model.predict(test)
		
		"""Add the predictions as Series to a new pandas dataframe
								OR
		   Depending on the use-case, the entire test data appended with the new files
		"""
		prediction_series = list(pd.Series(predictions))

		final_predictions = pd.DataFrame(list(zip(loan_ids, prediction_series)))
		
		"""We can be as creative in sending the responses.
		   But we need to send the response codes as well.
		"""
		responses = jsonify(predictions=final_predictions.to_json(orient="records"))
		responses.status_code = 200

		return (responses)

```

Once done, run: `gunicorn --bind 0.0.0.0:8000 server:app`

Let's generate some prediction data and query the API running locally at 

`https:0.0.0.0:8000/predict`

In [29]:
import json
import requests

In [30]:
"""Setting the headers to send and accept json responses
"""
header = {'Content-Type': 'application/json', 
          'Accept': 'application/json'}

"""Reading test batch
"""
df = pd.read_csv('../data/test.csv', encoding="utf-8-sig")
df = df.head()

"""Converting Pandas Dataframe to json
"""
data = df.to_json(orient='records')

'Setting the headers to send and accept json responses\n'

'Reading test batch\n'

'Converting Pandas Dataframe to json\n'

In [31]:
data

'[{"Loan_ID":"LP001015","Gender":"Male","Married":"Yes","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":5720,"CoapplicantIncome":0,"LoanAmount":110.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001022","Gender":"Male","Married":"Yes","Dependents":"1","Education":"Graduate","Self_Employed":"No","ApplicantIncome":3076,"CoapplicantIncome":1500,"LoanAmount":126.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001031","Gender":"Male","Married":"Yes","Dependents":"2","Education":"Graduate","Self_Employed":"No","ApplicantIncome":5000,"CoapplicantIncome":1800,"LoanAmount":208.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001035","Gender":"Male","Married":"Yes","Dependents":"2","Education":"Graduate","Self_Employed":"No","ApplicantIncome":2340,"CoapplicantIncome":2546,"LoanAmount":100.0,"Loan_Amount_Term":360.0,"Credit_History":null,"Property_Are

In [None]:
"""POST <url>/predict
"""
resp = requests.post("http://0.0.0.0:8000/predict", \
                    data = json.dumps(data),\
                    headers= header)

In [35]:
resp.status_code

200

In [36]:
"""The final response we get is as follows:
"""


{'predictions': '[{"0":"LP001015","1":1},{"0":"LP001022","1":1},{"0":"LP001031","1":1},{"0":"LP001035","1":1},{"0":"LP001051","1":1}]'}