# Typeform: ML Case (1): SKlearn Pipeline prototype
#### with SciKit Learn Pipeline ready to deploy

Here it is a prototype ready to deploy on a docker image that integrates with an AWS SNS, SQS API architeture.

The current pipeline is thouhgt to be ran on a AWS EMR communicating via endponts (as a service) being able to scale as much as you need. (caviat: now it runs reading a file from the filesystem but it's final integration should be with messaging or real-time sqs queues subbscription)

There are two processes:
- Re-traing: Uses gridsearch for hyperparmeter tunning.
- Prediction: Reads the features and the model and does a prediction

All it's been integrated on a sample docker container (see docker folder)

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

## Typeform: Pipeline: Data Processing

In [18]:
df_typeform = pd.read_csv("./data/typeform.csv", header=None)
df_typeform.columns = ["form_id", "submissions", "view", "features"]
df_typeform.head()

Unnamed: 0,form_id,submissions,view,features
0,(1113027,33,27,0.0-0.0-0.0-0.0-0.0-0.0-1.0-0.0-1.0-0.0-0.0-0....
1,(1115313,147,111,0.0-2.0-0.0-0.0-0.0-0.0-0.0-0.0-1.0-0.0-0.0-0....
2,(1115907,528,136,0.0-1.0-0.0-0.0-1.0-0.0-6.0-0.0-1.0-0.0-0.0-0....
3,(1116299,55,21,0.0-2.0-0.0-0.0-0.0-1.0-2.0-0.0-2.0-0.0-0.0-0....
4,(1120373,62,54,0.0-0.0-0.0-0.0-1.0-0.0-4.0-0.0-0.0-0.0-0.0-0....


In [66]:
df_typeform_ = df_typeform.sample(n=10000)
df_typeform_.shape

(10000, 4)

In [67]:
def prepare_dataframe(df):
    # clean zero views
    df_ = df[df.view > 0.0].copy()
    # calculate ouyput
    df_['completion_rate'] = df_.submissions / df_.view
    return df_

In [76]:
df_output = prepare_dataframe(df_typeform_)

In [71]:
from sklearn.base import TransformerMixin, BaseEstimator

class DataProcessing(TransformerMixin, BaseEstimator):
    """ Perform data processing on a csv file
    Arguments:
        df - pandas.DataFrame with features and output
    Returns: transformed dataframe 
    See https://scikit-learn.org/stable/modules/generated/
        sklearn.base.TransformerMixin.html for the details
    """

    def __init__(self):
        pass


    def transform_features(self, df):
        # split features
        df['features'] = \
            df.features.apply(lambda x : x.split('-'))
        # unstack
        features = df.features.apply(pd.Series)
        features = features.rename(
            columns = lambda x : 'feature_' + str(x))
        print(features.columns)
        # cleaning (sure there's a better way)
        features['feature_46'] = \
            features['feature_46'].apply(lambda x : x[:-1])
        return features

    def fit(self):
        pass
    
    def transform(self, df):
        features = self.transform_features(df)
        return features

In [72]:
tp = DataProcessing()

In [73]:
df_clean = tp.transform(df=df_typeform_)

Index(['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14',
       'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
       'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24',
       'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
       'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34',
       'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39',
       'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44',
       'feature_45', 'feature_46'],
      dtype='object')


In [75]:
df_clean.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46
41772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
920157,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
41594,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
271286,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
900099,1.0,1.0,0.0,0.0,0.0,0.0,12.0,0.0,7.0,0.0,...,0.0,0.0,0.0,6.0,1.0,1.0,0.0,38.0,47.0,85.0


## Typeform: Model re-training pipeline

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [81]:
X = df_clean
y = df_output['completion_rate'].values

In [84]:
pipeline = Pipeline([
    #('transforms', DataProcessing()),
    ('regression', RandomForestRegressor())
])

X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline.fit(X_train, y_train)

pipeline.get_params()



{'memory': None,
 'steps': [('regression',
   RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
              oob_score=False, random_state=None, verbose=0, warm_start=False))],
 'regression': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'regression__bootstrap': True,
 'regression__criterion': 'mse',
 'regression__max_depth': None,
 'regression__max_features': 'auto',
 'regress

In [85]:
# adding grid search over to the pipeline
params = [
    {
        'regression': [RandomForestRegressor()],
        'regression__n_estimators': [100, 500, 1000],
    }
]

grid = GridSearchCV(pipeline, param_grid=params, cv=5)
grid.fit(X_train, y_train)

grid.get_params()

{'cv': 5,
 'error_score': 'raise-deprecating',
 'estimator__memory': None,
 'estimator__steps': [('regression',
   RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
              oob_score=False, random_state=None, verbose=0, warm_start=False))],
 'estimator__regression': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'estimator__regression__bootstrap': True,
 'estimator__regress

Saving the grid pipeline

In [86]:
from joblib import dump, load

In [87]:
dump(grid, './models/grid_pipeline_RandomForestReg.joblib') 

['./models/grid_pipeline_RandomForestReg.joblib']

## Typeform: Model predicting method

In [88]:
y_pred = grid.predict(X_test)

In [89]:
MAE = mean_absolute_error(y_test , y_pred)
print('Random forest validation MAE = ', MAE)

Random forest validation MAE =  3.5329050398085178


MAE is much worst because it was trained with a sample (to make the point)