# Typeform: ML Case (1): Pipeline prototype
#### with SciKit Learn Pipeline ready to deploy

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

## Typeform: Pipeline: Data Processing

In [4]:
df_typeform = pd.read_csv("./data/typeform.csv", header=None)
df_typeform.columns = ["form_id", "submissions", "view", "features"]
df = df_typeform.copy()

In [5]:
df = df[df.view > 0.0].copy()

In [6]:
# split features
df['features'] = \
    df.features.apply(lambda x : x.split('-'))
# unstack
features = df_typeform.features.apply(pd.Series)
features = features.rename(
    columns = lambda x : 'feature_' + str(x))
print(features.columns)

Index(['feature_0'], dtype='object')


In [None]:
# cleaning (sure there's a better way)
features['feature_46'] = \
    features['feature_46'].apply(lambda x : x[:-1])


In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

class DataProcessing(TransformerMixin, BaseEstimator):
    """ Perform data processing on a csv file
    Arguments:
        df - pandas.DataFrame with features and output
    Returns: transformed dataframe 
    See https://scikit-learn.org/stable/modules/generated/
        sklearn.base.TransformerMixin.html for the details
    """

    def __init__(self):
        pass

    def cleaning(self, df):
        # clean zero views
        df = df[df.view > 0.0].copy()
        return df.copy()

    def transform_features(self, df):
        # split features
        df['features'] = \
            df.features.apply(lambda x : x.split('-'))
        # unstack
        features = df_typeform.features.apply(pd.Series)
        features = features.rename(
            columns = lambda x : 'feature_' + str(x))
        print(features.columns)
        # cleaning (sure there's a better way)
        features['feature_46'] = \
            features['feature_46'].apply(lambda x : x[:-1])
        return features

    def transform_output(self, df):
        # clean zero views
        df = df[df.view > 0.0].copy()
        # calculate ouyput
        df['completion_rate'] = df.submissions / df.view
        return df[['completion_rate']].copy()
    
    def fit(self):
        pass
    
    def transform(self, df):
        df = self.cleaning(df)
        features = self.transform_features(df)
        output = self.transform_output(df)
        df_res = pd.concat([output, features[:]], axis=1)
        df_res = df_res.astype(float)
        return df_res

In [None]:
tp = DataProcessing()

In [None]:
df_clean = tp.transform(df=df_typeform)

In [None]:
df_clean.head()

## Typeform: Model re-training pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 

pipeline = Pipeline([
    ('transforms', TextProcessing()),
    ('regression', RandomForestRegressor())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
pipeline.fit(X_train, y_train)

pipeline.get_params()

In [None]:
# add grid search over the entire pipeline
params = [
    {
        'regression': [RandomForestRegressor()],
        'regression__n_estimators': [100, 500, 1000],
    }
]

grid = GridSearchCV(pipeline, param_grid=params, cv=5)
grid.fit(X_train, y_train)

grid.get_params()

Saving the grid pipeline

## Typeform: Model predicting method

In [None]:
import pandas as pd

df = pd.read_csv("./data/typeform.csv",header=None)
df.head(5).to_csv("./data/typeform5.csv",header=None)