In [12]:
import datetime
import json
import logging
import os
import pickle
import pprint
import socket
import time
import traceback
import uuid

import flask
import joblib
import numpy as np
import pandas as pd
import pytz
import redis
import scipy
import scipy.stats
import sklearn
import sklearn.preprocessing as pp
from flask import Flask, Blueprint, jsonify
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn_pandas import DataFrameMapper

# Configure the root logger at INFO level for the whole process.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

def now_localtz():
    """Return the current time as a timezone-aware datetime in Europe/Lisbon."""
    lisbon = pytz.timezone('Europe/Lisbon')
    return datetime.datetime.now(tz=lisbon)

# Build identifier; returned by "/" and included in every /train response.
VERSION="20181005-8"
# Moment this worker process was started (Europe/Lisbon time).
DATE_STARTED=now_localtz()
# Salted hash of the machine name: identifies the host in responses without
# exposing the real hostname.
HOSTNAME=joblib.hash("salted2662"+socket.gethostname())
# Random identifier, regenerated on every process start.
WORKER_ID=str(uuid.uuid4())

# Redis result cache settings. The connection details can be supplied through
# the REDIS_HOST / REDIS_KEY environment variables so that credentials do not
# have to be committed to source; the literals below are placeholder fallbacks.
USE_CACHE=False
REDIS_HOST=os.environ.get("REDIS_HOST","XXXXXXXXX.redis.cache.windows.net")
REDIS_KEY=os.environ.get("REDIS_KEY","XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
# Prefix of every cache key; change it to invalidate previously cached scores.
CACHE_VERSION="v3"

app = Flask(__name__)


# Logic Apps is configured to POST the full Microsoft Forms response body:
# a JSON object keyed by question id, with the submitted answer as the value.
# This table maps each question id to a friendly parameter name. Names of the
# form "<step>__<param>" are later applied as scikit-learn pipeline parameters
# for the classifier step called <step>; ids not listed here are ignored.

FORM_IDS={
    # General settings
    "rf71efaaee75f4869b3a24de441b09919":"algorithm",
    "r52e336e1f3564f47b9359debc320a7ce":"nickname",
    # Logistic Regression ("logreg")
    "r71b640ccadb844af885b17eb733c4a8b":"logreg__penalty",
    "r462aa2316f0f4819a7bf1e20bd729975":"logreg__C",

    # Random Forest ("rf")
    "rd24bc7e764d34b1b83c8d3acf2a91203":"rf__n_estimators",
    "re36df3b7827a41b38742ebab3a9d09d5":"rf__criterion",
    "r189c2542d50b491f8086f02963b7a081":"rf__max_depth",
    "rc9525cd279d14be38771f6f40e4316e4":"rf__min_samples_leaf",
    "rce0c88b0fb9248498ff46de546904f63":"rf__max_features",

    # Decision Tree ("dt")
    "r6ea09cddd77447e7b5391b31f7945537":"dt__criterion",
    "rba34bf1b855d4ec9a29a74585dcb6bae":"dt__max_depth",
    "rdf6bfb2bd41b405b9dc170e39a6d5154":"dt__min_samples_split",
    "r674ff05de69a4cacbf1e505e55c76281":"dt__min_samples_leaf",
    "r56862ced92a64fd598fbeabc0cbc8d67":"dt__max_features",

    # SVM ("svm")
    "r2153f109fe2b418795a01c53b500af04":"svm__kernel",
    "rb62cd150e455454c8098824e05a4c8b5":"svm__degree",
    "r96df4ff1491246e4a6d3ad95148f61cb":"svm__C",
    
    # Extra Trees ("xt")
    "reea67b283e9d446096c7c3ab825169bf":"xt__n_estimators",
    
    # Optional PCA step (number of components, or "Disabled")
    "r4849ae4db1c34253b08dac5b9a66de63":"pca",
    
    # Optional vectorizer for the "Name" column ("Tfidf", "Count" or "None")
    "r4ca8638f21674388b85b1ffe385a8742":"text_preproc"
}

@app.route("/")
def hello():
    """Return a small HTML status page identifying this worker.

    The page lists the app version, the salted host hash, the worker id and
    the process start time, one per line (separated by ``<br>``).
    """
    status_lines = [
        f"App Version: {VERSION}",
        f"Hostname:{HOSTNAME}",
        f"Worker Id:{WORKER_ID}",
        f"Date Started:{DATE_STARTED}",
    ]
    return "<br>".join(status_lines)



@app.route("/train",methods=["POST"])
def train_route():
    """Train a model from a posted form submission and report its scores.

    The request body is a JSON object keyed by form question id. Known ids
    are translated through FORM_IDS and passed to train() as keyword
    arguments; unknown keys are ignored.

    Returns:
        A JSON response that always contains "status", "score_mean",
        "score_std", "score_hmean", "notes", "timestamp", "nickname",
        "duration_secs", "algorithm", "app_version" and "host". On success
        it also contains "scores" (comma-separated per-fold scores); on
        failure it contains "exception" and the numeric scores are 0.

    Any exception raised while parsing or training is caught and reported
    in the payload with status "error" instead of propagating.
    """
    time_started = now_localtz()

    results={}
    # Defined before the try block so the reporting code below still works
    # when the request body cannot be parsed at all.
    form_params={}

    try:
        print(flask.request.data)
        raw_params=json.loads(flask.request.data)

        # Translate form question ids into friendly keys
        form_params={
            FORM_IDS[k]:v for k,v in raw_params.items() if FORM_IDS.get(k)
        }

        print(form_params)

        # Call inner train
        scores=train(**form_params)

        results["status"]="ok"

        results["scores"]=','.join(str(x) for x in scores)
        results["score_mean"]=np.mean(scores)
        results["score_std"]=np.std(scores)
        results["score_hmean"]=scipy.stats.hmean(scores)

    except Exception as error:
        results["exception"]=str(error)
        results["status"]="error"

        # Numbers cannot be null or "" :(
        results["score_std"]=0
        results["score_hmean"]=0
        results["score_mean"]=0

        # Log with traceback so failures can be diagnosed from the server log.
        logger.exception("Error: %s",error)

    time_ended = now_localtz()

    results["notes"]=str(form_params)
    results["timestamp"]=time_ended.isoformat()

    # Add Hour/time/Second to each submission nickname. str() guards against
    # a non-string nickname in the JSON body, which would otherwise fail on
    # slicing/concatenation outside the try block.
    nickname=str(form_params.get("nickname","-"))[0:10]+" ("+now_localtz().strftime("%H_%M_%S")+")"
    results["nickname"]=nickname
    results["duration_secs"]=(time_ended - time_started).total_seconds()

    results["algorithm"]=form_params.get("algorithm","-")
    results["app_version"]=VERSION
    results["host"]=HOSTNAME
    print(results)
    return (jsonify(results))
        

class DataFrameImputer(TransformerMixin):
    """Transformer that replaces missing values with a fixed constant.

    There is nothing to learn from the data: fit() is a no-op and
    transform() simply fills missing cells with ``default_value``.
    """

    def __init__(self, default_value="NA"):
        # Constant written into every missing cell by transform().
        self.default_value = default_value

    def fit(self, X, y=None):
        """Do nothing and return self, so the object can be used in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return X as a DataFrame with missing values replaced by the default."""
        frame = pd.DataFrame(X)
        filled = frame.fillna(self.default_value)
        return filled

# The training set is read once, when the module is loaded; the "Survived"
# column becomes the target y and the remaining columns the features X.
df_train=pd.read_csv("train.csv")
y=df_train.pop("Survived")
X=df_train

# Optional Redis cache for cross-validation scores
if not USE_CACHE:
    cache=None
else:
    cache = redis.StrictRedis(
        host=REDIS_HOST,
        port=6380,
        db=0,
        password=REDIS_KEY,
        ssl=True,
    )

    # Fail early if the cache is unreachable.
    cache_ping=cache.ping()

    print("Redis Ping returned : " + str(cache_ping))

# Main train function
def train(**kargs):
    """Cross-validate the requested classifier on the module-level dataset.

    Keyword arguments:
        algorithm: required; one of "Logistic Regression", "Random Forest",
            "Decision Tree", "SVM" or "Extra Trees".
        text_preproc: optional; "Tfidf" or "Count" adds a vectorizer on the
            "Name" column, "None"/missing adds nothing.
        pca: optional; number of PCA components, "Disabled"/missing for no PCA.
        <step>__<param>: classifier parameters. Only keys whose prefix matches
            the selected classifier step (logreg, rf, dt, svm, xt) are
            applied; values are converted with guess_type(). Other keys
            (e.g. nickname) are ignored.

    Returns:
        A numpy array with one accuracy score per fold, in percent, with a
        small gaussian noise term added.

    Raises:
        KeyError: if ``algorithm`` is not supplied.
        ValueError: if ``algorithm`` is not one of the supported names.
        Exception: if ``text_preproc`` has an unsupported value.
    """
    random_state=43

    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1,random_state=random_state)

    # Form label -> (pipeline step name, estimator class)
    classifiers={
        "Logistic Regression":("logreg",LogisticRegression),
        "Random Forest":("rf",RandomForestClassifier),
        "Decision Tree":("dt",DecisionTreeClassifier),
        "SVM":("svm",SVC),
        "Extra Trees":("xt",ExtraTreesClassifier),
    }
    algorithm=kargs["algorithm"]
    if algorithm not in classifiers:
        # Fail with an explicit message instead of an unbound-variable error
        # further down.
        raise ValueError(f"unknown algorithm:{algorithm}")
    clf_name,clf_class=classifiers[algorithm]
    clf=clf_class(random_state=random_state)

    print("train params",kargs)

    pipeline=[]

    # Basic post prep pipeline (onehot/remove any remaining NA), make the dataset scikit compliant
    nums=[ ([c],pp.Imputer()) for c in X.select_dtypes(np.number)]
    cats=[ ([c],[DataFrameImputer(default_value=""), pp.LabelBinarizer()]) for c in X.select_dtypes("object")]

    texts=[]
    text_preproc=kargs.get("text_preproc")
    if text_preproc and text_preproc!="None":
        if text_preproc=="Tfidf":
            texts=[ ("Name",TfidfVectorizer())]
        elif text_preproc=="Count":
            texts=[ ("Name",CountVectorizer())]
        else:
            raise(Exception(f"not valid:{text_preproc}"))

    print(texts)
    mapper=DataFrameMapper(nums+cats+texts,df_out=True)

    pipeline.append(('featurize', mapper))

    pca=kargs.get("pca")
    if pca and pca!="Disabled":
        print("add pca")
        pipeline.append(('pca', PCA(n_components=guess_type(pca))))

    pipeline.append((clf_name,clf))

    # Our full pipeline
    train_pipeline=Pipeline(pipeline)

    # Set classifier parameters: only keys addressed to the selected step.
    step_prefix=clf_name+"__"
    for k,v in kargs.items():
        if k.startswith(step_prefix):
            train_pipeline.set_params(**{k:guess_type(v)})
    # Dump
    for step in train_pipeline.steps:
        pprint.pprint(step)

    # Check cache
    cache_key=None
    if USE_CACHE:
        cache_key=CACHE_VERSION+"__"+str(joblib.hash(train_pipeline))
        print("Cache key:",cache_key)
        cached=cache.get(cache_key)
        # A missing key comes back as None; only then do we fall through and train.
        if cached is not None:
            print("From Cache")
            # NOTE(review): pickle.loads trusts whatever is stored in Redis;
            # only safe while the cache is writable by this service alone.
            scores=pickle.loads(cached)
            return scores+np.random.normal(0,.0005,len(scores))*100

    print("Not in cache, training...")

    # Train/Cross eval
    scores=cross_val_score(X=X,y=y,cv=rskf,estimator=train_pipeline,verbose=5,n_jobs=1,scoring="accuracy")
    scores=(scores*100).round(3)

    if USE_CACHE:
        print("Saving in cache...")
        cache.set(cache_key,pickle.dumps(scores))

    # Noise with a standard deviation of 0.05 percentage points is added to the
    # reported scores; presumably to break ties between identical
    # submissions -- TODO confirm intent.
    return scores+np.random.normal(0,.0005,len(scores))*100


def guess_type(s):
    """Convert a form answer into the most specific Python value.

    Args:
        s: the raw value. Anything that is not a string is returned as is.

    Returns:
        None for "", "None" or "none"; an int when the string parses as an
        integer; a float when it parses as a float; otherwise the string
        itself.
    """
    if not isinstance(s,str):
        return s
    if s in ("","None","none"):
        return None
    # Try the narrowest numeric type first; int() rejects "5.0" and ".05",
    # which then fall through to float().
    for convert in (int,float):
        try:
            return convert(s)
        except ValueError:
            continue
    return s

In [13]:
%%time
# Manual smoke test of train(): Extra Trees with 71 estimators, a CountVectorizer
# on "Name" and a 5-component PCA. "logreg__C" and "nickname" are passed but
# not applied, because only parameters prefixed with the selected classifier
# step ("xt__") are set on the pipeline.
np.mean(train(**{"algorithm":"Extra Trees","logreg__C":".05","pca":"5","xt__n_estimators":71,"text_preproc":"Count","nickname":"teste"}))

train params {'algorithm': 'Extra Trees', 'logreg__C': '.05', 'pca': '5', 'xt__n_estimators': 71, 'text_preproc': 'Count', 'nickname': 'teste'}
[('Name', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None))]
add pca
('featurize',
 DataFrameMapper(default=False, df_out=True,
        features=[(['PassengerId'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Pclass'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Age'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['SibSp'], ...   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabular

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ........................ , score=0.801980198019802, total=   0.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] ..................................... , score=0.83, total=   0.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s


[CV] ....................... , score=0.7676767676767676, total=   0.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.6s remaining:    0.0s


[CV] ....................... , score=0.7474747474747475, total=   0.2s
Wall time: 2.08 s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s finished


79.76327197186158

In [None]:
if __name__ == '__main__':
    # When run as a script (or notebook cell), serve the Flask app with
    # Werkzeug's built-in server on localhost:9001.
    from werkzeug.serving import run_simple
    run_simple('localhost', 9001, app)

INFO:werkzeug: * Running on http://localhost:9001/ (Press CTRL+C to quit)


b'{"responder": "anonymous", "submitDate": "10/3/2018 9:21:44 AM", "rf71efaaee75f4869b3a24de441b09919": "Random Forest", "rd24bc7e764d34b1b83c8d3acf2a91203": "5", "re36df3b7827a41b38742ebab3a9d09d5": "gini", "r189c2542d50b491f8086f02963b7a081": "1", "rc9525cd279d14be38771f6f40e4316e4": "1", "rce0c88b0fb9248498ff46de546904f63": "auto"}'
{'algorithm': 'Random Forest', 'rf__n_estimators': '5', 'rf__criterion': 'gini', 'rf__max_depth': '1', 'rf__min_samples_leaf': '1', 'rf__max_features': 'auto'}
train params {'algorithm': 'Random Forest', 'rf__n_estimators': '5', 'rf__criterion': 'gini', 'rf__max_depth': '1', 'rf__min_samples_leaf': '1', 'rf__max_features': 'auto'}
[]
('featurize',
 DataFrameMapper(default=False, df_out=True,
        features=[(['PassengerId'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Pclass'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Age'], Imputer(axis=0, copy=True, missing_values='NaN', s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] ....................... , score=0.6138613861386139, total=   0.0s
[CV]  ................................................................
[CV] ..................................... , score=0.61, total=   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.6s finished
INFO:werkzeug:127.0.0.1 - - [10/Oct/2018 15:44:48] "POST /train HTTP/1.1" 200 -


[CV] ....................... , score=0.6161616161616161, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6161616161616161, total=   0.0s
{'status': 'ok', 'scores': '61.40957088929663,61.38958632956647,61.003267809224134,61.64679931469443,61.66635600823064', 'score_mean': 61.423116070202454, 'score_std': 0.2394911852230976, 'score_hmean': 61.42217969744601, 'notes': "{'algorithm': 'Random Forest', 'rf__n_estimators': '5', 'rf__criterion': 'gini', 'rf__max_depth': '1', 'rf__min_samples_leaf': '1', 'rf__max_features': 'auto'}", 'timestamp': '2018-10-10T15:44:48.670543+01:00', 'nickname': '- (15_44_48)', 'duration_secs': 0.68398, 'algorithm': 'Random Forest', 'app_version': '20181005-8', 'host': '894b6b96e69789f578ab719a1165c613'}
b'{"responder": "anonymous", "submitDate": "10/3/2018 9:21:44 AM", "rf71efaaee75f4869b3a24de441b09919": "Random Forest", "rd24bc7e764d34b1b83c8d3acf2a91203": "5", "re36df3b7827a41b38742eb

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] ....................... , score=0.6138613861386139, total=   0.0s
[CV]  ................................................................
[CV] ..................................... , score=0.61, total=   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
INFO:werkzeug:127.0.0.1 - - [10/Oct/2018 15:45:55] "POST /train HTTP/1.1" 200 -


[CV] ....................... , score=0.6161616161616161, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6161616161616161, total=   0.0s
{'status': 'ok', 'scores': '61.34477156367795,61.430593628610175,61.0347988291965,61.58768701202146,61.6076740742347', 'score_mean': 61.401105021548155, 'score_std': 0.2077050195224182, 'score_hmean': 61.400400592381985, 'notes': "{'algorithm': 'Random Forest', 'rf__n_estimators': '5', 'rf__criterion': 'gini', 'rf__max_depth': '1', 'rf__min_samples_leaf': '1', 'rf__max_features': 'auto'}", 'timestamp': '2018-10-10T15:45:55.959629+01:00', 'nickname': '- (15_45_55)', 'duration_secs': 0.658997, 'algorithm': 'Random Forest', 'app_version': '20181005-8', 'host': '894b6b96e69789f578ab719a1165c613'}
b'{"responder": "anonymous", "submitDate": "10/3/2018 9:21:44 AM", "rf71efaaee75f4869b3a24de441b09919": "Random Forest", "rd24bc7e764d34b1b83c8d3acf2a91203": "5", "re36df3b7827a41b38742eb

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s


[CV] ....................... , score=0.6138613861386139, total=   0.0s
[CV]  ................................................................
[CV] ..................................... , score=0.61, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6161616161616161, total=   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished
INFO:werkzeug:127.0.0.1 - - [10/Oct/2018 15:46:23] "POST /train HTTP/1.1" 200 -


[CV] ....................... , score=0.6161616161616161, total=   0.0s
{'status': 'ok', 'scores': '61.37667165367228,61.39744031312173,60.98104678619406,61.660509762392074,61.68809614383142', 'score_mean': 61.42075293184231, 'score_std': 0.2548661855129008, 'score_hmean': 61.41969257141894, 'notes': "{'algorithm': 'Random Forest', 'rf__n_estimators': '5', 'rf__criterion': 'gini', 'rf__max_depth': '1', 'rf__min_samples_leaf': '1', 'rf__max_features': 'auto'}", 'timestamp': '2018-10-10T15:46:23.125369+01:00', 'nickname': '- (15_46_23)', 'duration_secs': 0.564011, 'algorithm': 'Random Forest', 'app_version': '20181005-8', 'host': '894b6b96e69789f578ab719a1165c613'}
b'{"responder": "anonymous", "submitDate": "10/3/2018 9:21:44 AM", "rf71efaaee75f4869b3a24de441b09919": "Random Forest", "rd24bc7e764d34b1b83c8d3acf2a91203": "5", "re36df3b7827a41b38742ebab3a9d09d5": "gini", "r189c2542d50b491f8086f02963b7a081": "1", "rc9525cd279d14be38771f6f40e4316e4": "1", "rce0c88b0fb9248498ff46de546904f63": 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ....................... , score=0.6138613861386139, total=   0.0s
[CV]  ................................................................
[CV] ..................................... , score=0.61, total=   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
INFO:werkzeug:127.0.0.1 - - [10/Oct/2018 15:46:26] "POST /train HTTP/1.1" 200 -


[CV] ....................... , score=0.6161616161616161, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6161616161616161, total=   0.0s
{'status': 'ok', 'scores': '61.39183024385446,61.401674027729875,61.01414144252807,61.54358835913396,61.68130365676106', 'score_mean': 61.40650754600149, 'score_std': 0.22293227539759977, 'score_hmean': 61.405696223173194, 'notes': "{'algorithm': 'Random Forest', 'rf__n_estimators': '5', 'rf__criterion': 'gini', 'rf__max_depth': '1', 'rf__min_samples_leaf': '1', 'rf__max_features': 'auto'}", 'timestamp': '2018-10-10T15:46:26.452368+01:00', 'nickname': '- (15_46_26)', 'duration_secs': 0.612002, 'algorithm': 'Random Forest', 'app_version': '20181005-8', 'host': '894b6b96e69789f578ab719a1165c613'}
