# Notebook used to help create:

- importing data src
- preprocessing data src

In [1]:
%pwd

'C:\\Users\\bjk1y13\\OneDrive - University of Southampton\\MH028_UHS_Weather\\4_Analysis\\EDclassification\\notebooks'

In [2]:
import pandas as pd
import numpy as np
from src.data.io import import_pickled_feature_dfs, import_merge_prevday_target_column, import_merge_pickled_target_class

# Import data

In [3]:
# Load the five interim feature pickles listed below from ../data/interim/.
# NOTE(review): presumably the helper joins them into one dataframe (the preview
# further down shows a single table) — confirm in src.data.io.
features = import_pickled_feature_dfs('../data/interim/', ['callender.pkl', 'EDmorn.pkl', 'EDprevday.pkl', 'IPocc_perc.pkl', 'IPmorn.pkl'])

In [4]:
# Pass the target pickle (EDooc_class.pkl) and the current feature table to the
# previous-day merge helper; the result replaces `features`.
# NOTE(review): presumably this adds the `EDoccMAX_prevday` column seen in the preview — confirm.
features = import_merge_prevday_target_column('../data/interim/', 'EDooc_class.pkl', features)

In [5]:
# Merge the target class from the same EDooc_class.pkl file into the feature table.
# NOTE(review): presumably this is where the `flag_target` label column comes from — confirm.
features = import_merge_pickled_target_class('../data/interim/', 'EDooc_class.pkl', features)

In [6]:
features.head()

Unnamed: 0,flag_target,EDoccMAX_prevday,callender_DAY(first_EDatt_time),callender_MONTH(first_EDatt_time),callender_YEAR(first_EDatt_time),callender_WEEKEND(first_EDatt_time),EDmorn_COUNT(EDatt),EDmorn_MEAN(EDatt.wait_time_total),EDmorn_MEAN(EDatt.flag_specreq),EDmorn_MEAN(EDatt.age),...,IPocc_perc,IP_admissions_elec_nonelec,IP_discharges_elec_nonelec_PRE12,IPmorn_count,IPmorn_count_elec,IPmorn_count_nonelec,IPmorn_age_mean,IPmorn_age_std,IPmorn_age_skew,IPmorn_prop_nonelec
2013-01-02,0,43.583333,2,1,2013,False,24,104.916667,0.833333,45.25,...,0.745533,122.0,39.0,1069.0,97.0,955.0,65.192703,24.748781,-1.113181,0.893358
2013-01-03,0,45.25,3,1,2013,False,32,157.272727,0.727273,50.78125,...,0.857756,210.0,39.0,1137.0,151.0,970.0,64.325418,24.896858,-1.067481,0.853122
2013-01-04,0,50.708333,4,1,2013,False,31,176.916667,0.666667,36.645161,...,0.909221,181.0,70.0,1141.0,156.0,973.0,62.812445,25.652021,-0.942528,0.852761
2013-01-05,0,38.666667,5,1,2013,True,30,160.217391,0.695652,39.866667,...,0.897069,149.0,50.0,1104.0,150.0,936.0,62.668478,25.448019,-0.917823,0.847826
2013-01-06,0,42.25,6,1,2013,True,41,171.045455,0.590909,46.682927,...,0.819871,136.0,36.0,1121.0,136.0,975.0,62.371097,25.65843,-0.900726,0.869759


In [7]:
features.flag_target.value_counts()

0    1713
1     306
Name: flag_target, dtype: int64

# Processing

#### split test and train sets

In [8]:
from src.data.preprocess import make_timeseries_test_train_splits

In [9]:
# Split features/target on the `flag_target` column. The last argument (365) matches the
# number of test rows reported in the output below.
# NOTE(review): presumably the most recent 365 rows are held out — confirm in src.data.preprocess.
X_train, X_test, y_train, y_test = make_timeseries_test_train_splits(features, 'flag_target', 365)

DATA POINTS:
orig size 2019
training:  1654
testing:  365


In [10]:
from src.data.preprocess import get_variable_types_lists, check_for_catagorical_type_difference_between_train_test, change_feature_types_to_numeric
    

# Derive three lists of column names from the training data (numeric, categorical, binary);
# each list feeds its own sub-pipeline further down.
num_features, cat_features, bin_features = get_variable_types_lists(X_train)

#### find problem columns that will crash the preprocessing pipeline when converting test data

In [11]:
# Compare the categories of each categorical feature between train and test; the output
# below lists every feature whose category sets differ. The returned list is used in the
# next step to re-type those features.
problem_col_list = check_for_catagorical_type_difference_between_train_test(X_train, X_test, cat_features)

Feature name:  callender_YEAR(first_EDatt_time)
Categories in training:  {2016, 2017, 2013, 2014, 2015}
Categories in testing:  {2017, 2018}



Treat problem features as numerical so that process can continue.


In [12]:
# Rebuild both lists with the problem columns treated as numeric.
# NOTE(review): presumably the columns are removed from cat_features and appended to num_features — confirm.
cat_features, num_features = change_feature_types_to_numeric(problem_col_list, cat_features, num_features)

- use sklearn pipelines to clean


In [13]:
# version 19.2
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from src.data.preprocess import DataFrameSelector, MakeBooleanAnInteger

# Preview the "<feature>_<category>" labels: one line per distinct value of
# every categorical feature in the training set.
for col_name in cat_features:
    values_list = X_train[col_name].unique()
    for value in values_list:
        print('_'.join((col_name, str(value))))

In [30]:
#### Failed to get this to work: it kept doubling the size of the output array. It didn't matter whether the get_col_list_for_after_pipeline function was inside or outside the DataFrameReform class.


from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameReform(BaseEstimator, TransformerMixin):
    """Transformer that wraps a numpy array in a DataFrame with named columns.

    Parameters
    ----------
    new_features_list : list
        Column names to give the resulting DataFrame.
    """

    def __init__(self, new_features_list):
        self.new_features_list = new_features_list

    def fit(self, X, y=None):
        # Nothing is learned from the data; fitting just returns the transformer.
        return self

    def transform(self, X):
        labelled_frame = pd.DataFrame(X, columns=self.new_features_list)
        return labelled_frame

In [31]:
# Numeric features: select columns -> univariate filter -> median impute -> standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_features)),
    ('feature_filter', SelectKBest(f_classif, k='all')),  # k='all' keeps every feature
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical features: select columns -> fill gaps with the mode -> dense one-hot encoding.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_features)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('OneHot_encoder', OneHotEncoder(sparse=False)),
])

# Binary features: select columns -> booleans to integers -> fill gaps with the mode.
bin_pipeline = Pipeline([
    ('selector', DataFrameSelector(bin_features)),
    ('boolean_conversion', MakeBooleanAnInteger()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# NOTE: the DataFrameReform step is deliberately not built here. It needed
# `new_features_list`, which is only created in a later cell, so constructing it at
# this point raised NameError on a fresh kernel, and it was never part of the union.
# Column names are attached after the transform instead.

#### create list of pipelines to include
# The order (numeric, categorical, binary) fixes the column order of the output arrays.
pipes_list = [
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
    ('bin_pipeline', bin_pipeline),
]

full_pipeline = FeatureUnion(transformer_list=pipes_list)

# Fit on the training split only, then apply the fitted transformers to the test split.
X_trainT = full_pipeline.fit_transform(X_train, y_train)
X_testT = full_pipeline.transform(X_test)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [32]:
X_train.shape

(1654, 41)

In [33]:
X_trainT.shape

(1654, 52)

In [34]:
def get_col_list_for_after_pipeline(X, num_features, cat_features, bin_features):
    """
    Build the column names for the array produced by the preprocessing pipeline.

    The names are returned in the order numeric, one-hot categorical, binary.
    Each categorical feature is expanded to one "<feature>_<category>" name per
    distinct non-null category, in sorted category order. Sorted order is what
    scikit-learn's OneHotEncoder uses for its output columns, and missing values
    are skipped because the pipeline imputes them before encoding, so they never
    get a column of their own.

    Input
    =====
    X, dataframe, the data the pipeline was fitted on (used to look up categories).

    num_features/cat_features/bin_features, lists of str, column names of each type.

    Output
    ======
    list of str, column names for the transformed array.
    """
    #### the list is rebuilt locally on every call; when it was kept on a
    #### transformer instead, calling fit_transform appended to it twice.
    cat_features_new = []
    for col_name in cat_features:
        # Series.unique() returns values in order of first appearance and includes
        # NaN, so drop nulls and sort to line up with the encoder's columns.
        categories = sorted(X[col_name].dropna().unique())
        cat_features_new.extend(col_name + '_' + str(value) for value in categories)

    return list(num_features) + cat_features_new + list(bin_features)

# Column labels for the transformed arrays, built from the training data in the same
# numeric -> categorical -> binary order in which the sub-pipelines are listed.
new_features_list = get_col_list_for_after_pipeline(X_train, num_features, cat_features, bin_features)

In [35]:
def reform_df(X, new_features_list):
    """Return X as a DataFrame whose columns are labelled with new_features_list."""
    labelled_frame = pd.DataFrame(data=X, columns=new_features_list)
    return labelled_frame

# Replace the pipeline outputs with DataFrames labelled by new_features_list
# (the same names are applied to both the train and the test split).
X_trainT = reform_df(X_trainT, new_features_list)
X_testT = reform_df(X_testT, new_features_list)

In [36]:
X_testT.shape

(365, 52)

In [39]:
X_trainT.shape

(1654, 52)

In [40]:
X_testT.shape

(365, 52)

# Save preprocessed output as pkl

In [43]:
def save_pickle_preprocessed_data(path, X_train, X_test, y_train, y_test):
    """
    Save the prepared train/test splits as pickle files in one folder.

    The data is expected to be preprocessed already (split into train/test,
    scaled, blanks filled, ...); this function only writes it to disk.

    Input
    =====
    path, str, folder where the data should be saved. A trailing / is optional,
        and the folder is created if it does not exist yet.

    X_train/X_test/y_train/y_test, dataframes, containing the data.

    Output
    ======
    None. Writes X_train.pkl, X_test.pkl, y_train.pkl and y_test.pkl into `path`.
    """
    import os  # local import so the function does not rely on another cell

    #### create folder with versioned name etc.(future dev)
    if path:
        # An empty path means "current directory", which needs no creation.
        os.makedirs(path, exist_ok=True)

    named_outputs = (
        ('X_train.pkl', X_train),
        ('X_test.pkl', X_test),
        ('y_train.pkl', y_train),
        ('y_test.pkl', y_test),
    )
    for file_name, data in named_outputs:
        # os.path.join gives the same result with or without a trailing separator,
        # where plain string concatenation silently wrote next to the folder.
        pd.to_pickle(data, os.path.join(path, file_name))

    return

# Write the transformed feature frames and the target splits to ../data/processed/v1/.
save_pickle_preprocessed_data('../data/processed/v1/', X_trainT, X_testT, y_train, y_test)

In [None]:
pd.read_pickle

NOTE: consider creating a log of the preprocessing steps and settings so that the run can be reproduced in future.