In [159]:
import pickle
import json
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.base import BaseEstimator, TransformerMixin
import category_encoders
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, FunctionTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

  from numpy.core.umath_tests import inner1d


In [2]:
# Restore the three artifacts that the serialization cells of this notebook
# write: the fitted pipeline, the training column dtypes and the column order.
# NOTE: joblib/pickle loading can execute arbitrary code; only load trusted files.
pipeline = joblib.load('pipeline.pickle')

with open('dtypes.pickle', 'rb') as dtypes_file:
    dtypes = pickle.load(dtypes_file)

with open('columns.json', 'r') as columns_file:
    columns = json.load(columns_file)

# Build pipeline

In [3]:
!pwd

/mnt/c/Users/Alexandre/Google Drive/Documents/LDSSA/heroku-model-deploy


In [153]:
# Load the training features from the capstone data directory.
df_x = pd.read_csv('../batch2-capstone/data/X_train.csv')
# Cast age to float so that missing ages can be stored as NaN. The builtin
# `float` is used because the `np.float` alias (which was just `float`) was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
df_x["age_in_years"] = df_x["age_in_years"].astype(float)

# Training labels — presumably one per row of X_train.csv (TODO confirm).
y = np.genfromtxt('../batch2-capstone/data/y_train.csv')

In [154]:
class DataCleaner(BaseEstimator, TransformerMixin):
    """Convert the raw feature frame into an all-numeric frame.

    Steps, applied identically in ``fit`` and ``transform``:

    1. The three ``other_factor_*`` columns are replaced by one 0/1 column per
       factor value that was seen in more than ``n_min`` training rows.
    2. ``m_or_f`` is replaced by a numeric ``male`` column (1, 0 or NaN).
    3. The remaining columns go through a category_encoders ``OneHotEncoder``.
    4. Only encoded columns whose training sum exceeded ``n_min`` are kept.

    Parameters
    ----------
    n_min : int, default 30
        A factor / encoded column is kept only if its sum over the training
        data is strictly greater than this value.
    """

    # Placeholder substituted for missing values in the other_factor_* columns.
    _MISSING_FACTOR = "N/A or Unknown"

    def __init__(self, n_min=30):
        self.other_factors_cols = ["other_factor_1",
                                   "other_factor_2",
                                   "other_factor_3"]
        self.n_min = n_min

    def _one_hot_factors(self, X, factors):
        """Return a 0/1 int32 frame with one column per entry of ``factors``.

        A cell is 1 when the factor appears in any of the ``other_factor_*``
        columns of that row. Only those columns are inspected, and the result
        carries ``X``'s own index so it lines up with ``X`` whatever that
        index is.
        """
        values = X[self.other_factors_cols].fillna(self._MISSING_FACTOR).values
        flags = np.zeros((X.shape[0], len(factors)), dtype=np.int32)
        for position, factor in enumerate(factors):
            flags[:, position] = (values == factor).any(axis=1)
        return pd.DataFrame(flags, index=X.index, columns=factors)

    def _assemble(self, X, factors):
        """Swap the raw factor / ``m_or_f`` columns for their numeric versions."""
        df_X_clean = pd.concat([X, self._one_hot_factors(X, factors)], axis=1)
        df_X_clean = df_X_clean.drop(columns=self.other_factors_cols)

        # m_or_f is ambiguous as a name; expose it as a numeric "male" flag.
        df_X_clean["male"] = df_X_clean["m_or_f"].apply(self.convert_m_or_f)
        df_X_clean = df_X_clean.drop(columns=["m_or_f"])

        # The missing-value placeholder carries no information. It is only
        # present when it survived the n_min filter during fit, so its absence
        # must not raise.
        return df_X_clean.drop(columns=[self._MISSING_FACTOR], errors="ignore")

    def transform(self, X):
        """Apply the fitted cleaning steps to ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the ``other_factor_*`` and ``m_or_f``
            columns, plus whatever columns the fitted encoder expects.

        Returns
        -------
        pandas.DataFrame
            A copy holding only the columns selected during ``fit``.
        """
        df_X_clean = self._assemble(X, self.factors)
        df_X_clean = self.enc.transform(df_X_clean)

        # Keep only the features that appeared more than n_min times in the
        # training set.
        return df_X_clean[self.usable_cols].copy()

    def fit(self, df_x, y=None):
        """Learn the frequent factors, the categorical encoder and the kept columns.

        Parameters
        ----------
        df_x : pandas.DataFrame
            Training features.
        y : array-like, optional
            Training labels. They are only stored (as ``y_train``), never used
            for fitting.

        Returns
        -------
        DataCleaner
            ``self``, to allow chaining.
        """
        filled = df_x[self.other_factors_cols].fillna(self._MISSING_FACTOR)

        # Every value that occurs in any of the factor columns.
        candidates = list(np.unique(filled.values))

        # Keep only the factors present in more than n_min rows.
        row_counts = self._one_hot_factors(df_x, candidates).sum()
        self.factors = list(row_counts.index[row_counts > self.n_min].values)

        df_X_clean = self._assemble(df_x, self.factors)

        self.enc = category_encoders.one_hot.OneHotEncoder(handle_unknown='ignore')
        df_X_clean = self.enc.fit_transform(df_X_clean)

        # Drop encoded features whose column sum is n_min or less.
        self.usable_cols = df_X_clean.columns[df_X_clean.sum() > self.n_min].values

        # NOTE(review): the cleaned training data is kept on the instance and
        # therefore ends up inside any pickle of this object; retained here for
        # backward compatibility.
        self.X_train = df_X_clean[self.usable_cols].copy()
        self.y_train = None if y is None else y.copy()
        return self

    def convert_m_or_f(self, val):
        """Map ``'m'`` to 1, ``'f'`` to 0 and anything else to NaN."""
        if val == 'm':
            return 1
        elif val == 'f':
            return 0
        else:
            return np.nan


In [155]:
# Stand-alone cleaner fitted on the training data with a threshold of 50.
cleaner = DataCleaner(n_min=50).fit(df_x, y)

In [156]:
# Single-row probe input: only the seat is known, every other field is missing.
# Casting with the training dtypes gives the row the same column types as df_x.
asd = pd.DataFrame(
    {
        'm_or_f': [np.nan],
        'person_attributes': [np.nan],
        'seat': ['front_left'],
        'other_person_location': [np.nan],
        'other_factor_1': [np.nan],
        'other_factor_2': [np.nan],
        'other_factor_3': [np.nan],
        'age_in_years': [np.nan],
    }
).astype(df_x.dtypes)

## Overall pipeline

In [161]:
# Keyword arguments passed to RandomForestClassifier in the cells below.
best_forest_parameters = dict(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    max_depth=9,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=1,
    min_samples_split=10,
    min_weight_fraction_leaf=0.0,
    n_estimators=9,
    n_jobs=1,
    oob_score=False,
    random_state=42,
    verbose=0,
    warm_start=False,
)


# Stand-alone forest built from best_forest_parameters; the pipeline cell
# constructs its own RandomForestClassifier from the same dict.
best_forest = RandomForestClassifier(**best_forest_parameters)

In [162]:
# End-to-end model: clean and encode the raw frame, mean-impute the NaNs that
# remain, then classify with a random forest built from best_forest_parameters.
pipeline = make_pipeline(
    DataCleaner(n_min=30),
    Imputer(strategy="mean"),
    RandomForestClassifier(**best_forest_parameters),
)
# Left as the cell's last expression so the fitted pipeline's repr is displayed.
pipeline.fit(df_x, y)

Pipeline(memory=None,
     steps=[('datacleaner', DataCleaner(n_min=30)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_node...estimators=9, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [164]:
%timeit pipeline.predict_proba(asd)

550 ms ± 36.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [166]:
# Predicted probability for the single probe row (row 0), taken from column
# index 1 of predict_proba, i.e. the second class.
pipeline.predict_proba(asd)[0,1]

0.36411787800816947

## Serialize!!

In [168]:
# Persist the training column names, in order, as a JSON list.
with open('columns.json', 'w') as columns_file:
    json.dump(df_x.columns.tolist(), columns_file)

In [169]:
# Persist the column dtypes. age_in_years must already have been cast to float
# before this point, because an integer column cannot hold NaN.
with open('dtypes.pickle', 'wb') as dtypes_file:
    pickle.dump(df_x.dtypes, dtypes_file)

In [170]:
# Serialize the fitted pipeline with joblib; the cell's value is the list of
# file names that were written. The import repeats the one in the first cell.
# NOTE(review): DataCleaner is defined in this notebook (__main__), so whatever
# loads this file presumably needs a DataCleaner reachable under the same
# module path — confirm against the loading code.
from sklearn.externals import joblib
joblib.dump(pipeline, 'pipeline.pickle') 

['pipeline.pickle']

In [174]:
__name__

'__main__'