# Notebook used to help create:

- importing data src
- preprocessing data src

In [None]:
%pwd

In [1]:
import pandas as pd
import numpy as np
from src.data.io import import_pickled_feature_dfs, import_merge_prevday_target_column, import_merge_pickled_target_class

# Import data

In [2]:
features = import_pickled_feature_dfs('../../data/interim/D2_timeseries/', ['EDsummary.pkl', 'callender.pkl'])

In [3]:
features.shape

(2465, 7)

In [4]:
target = pd.read_pickle('../../data/interim/EDooc_class.pkl')

In [5]:
features = target.merge(features, right_index=True, left_index=True)

# Remember to OFFSET the target!

In [6]:
# Shift the label up by one row so that each date's features are paired with
# the flag recorded on the following row. The final row has no following row,
# so its flag_target becomes NaN.
features_final = features.assign(flag_target=features['flag_target'].shift(-1))

In [7]:
features_final.flag_target.value_counts()

0.0    2094
1.0     370
Name: flag_target, dtype: int64

In [8]:
features_final.tail()

Unnamed: 0_level_0,EDocc,flag_target,attendances,admissions,age_mean,age_65plus,year,month,dayofweek
dt_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-09-26,59.791667,1.0,487,188,44.7064,134,2018,9,Wednesday
2018-09-27,63.25,0.0,482,168,44.1992,126,2018,9,Thursday
2018-09-28,56.541667,0.0,463,191,46.4989,138,2018,9,Friday
2018-09-29,44.125,0.0,367,157,40.3597,79,2018,9,Saturday
2018-09-30,55.541667,,454,174,41.5308,110,2018,9,Sunday


# Processing

#### split test and train sets

In [9]:
from src.data.preprocess import make_timeseries_test_train_splits

In [10]:
X_train, X_test, y_train, y_test = make_timeseries_test_train_splits(features_final, 'flag_target', 365)

DATA POINTS:
orig size 2465
training:  2100
testing:  365


In [19]:
from src.data.preprocess import get_variable_types_lists, check_for_catagorical_type_difference_between_train_test, change_feature_types_to_numeric
    

# num_features, cat_features, bin_features = get_variable_types_lists(X_train)

In [20]:
X_train.columns

Index(['EDocc', 'attendances', 'admissions', 'age_mean', 'age_65plus', 'year',
       'month', 'dayofweek'],
      dtype='object')

In [26]:
# Columns that are one-hot encoded by the categorical pipeline.
cat_features = ['month','dayofweek']
# Every remaining column is treated as numeric. Build the list in DataFrame
# column order: a set difference yields an arbitrary order that can change
# between kernel sessions, which would reorder the saved output columns.
num_features = [col for col in X_train.columns if col not in cat_features]
# No binary features in this dataset.
bin_features = []

#### find problem columns that will crash the preprocessing pipeline when converting test data

In [27]:
problem_col_list = check_for_catagorical_type_difference_between_train_test(X_train, X_test, cat_features)

Treat problem features as numerical so that process can continue.


- use sklearn pipelines to clean


In [28]:
# version 19.2
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from src.data.preprocess import DataFrameSelector, MakeBooleanAnInteger

# Preview the "<column>_<value>" labels implied by the distinct training
# values of each categorical feature (printed in order of first appearance).
for col_name in cat_features:
    for value in X_train[col_name].unique():
        print('_'.join((col_name, str(value))))

In [29]:
#### Failed to get this to work, as it kept doubling the size of the output array. It didn't matter whether the get_col_list_for_after_pipeline function was inside or outside of the DataFrameReform class.


from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameReform(BaseEstimator, TransformerMixin):
    """Transformer step that wraps array data in a DataFrame with named columns.

    Parameters
    ----------
    new_features_list : list
        Column labels applied to the data passed to ``transform``.
    """

    def __init__(self, new_features_list):
        self.new_features_list = new_features_list

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame whose columns are ``new_features_list``."""
        labelled = pd.DataFrame(X, columns=self.new_features_list)
        return labelled

In [31]:
# Numeric branch: select num_features, run SelectKBest with k='all'
# (no features are dropped), median-impute, then standardise.
# NOTE(review): the feature filter runs before the imputer -- confirm the
# numeric columns contain no missing values at that point.
num_pipeline = Pipeline([
    ('selector',DataFrameSelector(num_features)),
    ('feature_filter',SelectKBest(f_classif,k='all')),
    ('imputer',SimpleImputer(strategy='median')),
    ('std_scaler',StandardScaler()),
])

# Categorical branch: select cat_features, fill gaps with the most frequent
# value, then one-hot encode into a dense array.
cat_pipeline = Pipeline([
    ('selector',DataFrameSelector(cat_features)),
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('OneHot_encoder',OneHotEncoder(sparse=False))
])

# Binary branch: defined here but not included in pipes_list below.
bin_pipeline = Pipeline([
    ('selector',DataFrameSelector(bin_features)),
    ('boolean_conversion',MakeBooleanAnInteger()),
    ('imputer',SimpleImputer(strategy='most_frequent')),
    #('encoding',LabelEncoder()),
])

# df_reform_pipeline = Pipeline([
#     ('reform_df', DataFrameReform(new_features_list))
# ])

#### create list of pipelines to include

# The order of this list is the order in which FeatureUnion concatenates the
# branch outputs: numeric columns first, then the one-hot columns.
pipes_list = [
    ('num_pipeline',num_pipeline),
  ('cat_pipeline',cat_pipeline),
#   ('bin_pipeline',bin_pipeline),
#     ('df_reform_pipeline', df_reform_pipeline)
]

full_pipeline = FeatureUnion(transformer_list=pipes_list
)


# Fit on the training split only, then apply the fitted transformers to the
# test split.
X_trainT = full_pipeline.fit_transform(X_train,y_train)
X_testT = full_pipeline.transform(X_test)

In [32]:
X_train.shape

(2100, 8)

In [33]:
X_trainT.shape

(2100, 25)

In [34]:
def get_col_list_for_after_pipeline(X, num_features, cat_features, bin_features):
    """
    Build the column labels for the array produced by the preprocessing pipeline.

    Input
    =====
    X, dataframe, data the pipeline was fitted on (must contain every column in cat_features).

    num_features, list of str, numeric column names, in the order the pipeline outputs them.

    cat_features, list of str, categorical column names that were one-hot encoded.

    bin_features, list of str, binary column names.

    Output
    ======
    list of str: num_features, then one '<column>_<value>' label per category
    of each categorical column, then bin_features.
    """
    cat_features_new = []
    for col_name in cat_features:
        # OneHotEncoder emits one column per category in sorted order, and the
        # imputer ahead of it in cat_pipeline has already replaced missing
        # values. Drop NaN and sort so each label lines up with its encoded
        # column; labelling in order of first appearance mislabels the columns.
        categories = sorted(X[col_name].dropna().unique())
        cat_features_new.extend(col_name + '_' + str(value) for value in categories)

    return list(num_features) + cat_features_new + list(bin_features)

new_features_list = get_col_list_for_after_pipeline(X_train, num_features, cat_features, bin_features)

In [35]:
def reform_df(X, new_features_list):
    """Return X as a DataFrame whose columns are labelled with new_features_list."""
    frame = pd.DataFrame(data=X, columns=new_features_list)
    return frame

X_trainT = reform_df(X_trainT, new_features_list)
X_testT = reform_df(X_testT, new_features_list)

In [36]:
X_testT.shape

(365, 25)

In [37]:
X_trainT.shape

(2100, 25)

In [38]:
X_testT.shape

(365, 25)

# Save preprocessed out as pkl

In [41]:
def save_pickle_preprocessed_data(path, X_train, X_test, y_train, y_test):
    """
    Take prepared data which has been split into Train/Test and has been scaled/blanks filled/...., and save to pickle files at specified location.

    Input
    =====
    path, str, folder where the data should be saved. A trailing / is optional. The folder is not created here.

    X_train/X_test/y_train/y_test, dataframes, containing data.

    Output
    ======
    None. Writes X_train.pkl, X_test.pkl, y_train.pkl and y_test.pkl into path.
    """
    import os  # local import: this cell does not rely on an import made elsewhere

    #### create folder with versioned name etc.(future dev)

    outputs = (
        ('X_train.pkl', X_train),
        ('X_test.pkl', X_test),
        ('y_train.pkl', y_train),
        ('y_test.pkl', y_test),
    )
    for file_name, data in outputs:
        # Join instead of concatenating: without a trailing separator, plain
        # concatenation would write e.g. '<folder>X_train.pkl' next to the
        # folder rather than inside it.
        pd.to_pickle(data, os.path.join(path, file_name))

    return

save_pickle_preprocessed_data('../../data/processed/D2/', X_trainT, X_testT, y_train, y_test)

NOTE: consider creating log for preprocessing information in order for repeatability in future.