In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.data import WellProcessor, load_data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [16]:
class WellProcessor:
    """
    Takes data from the Tanzanian well dataset and processes it for modeling.  This includes: 
    2. Imputing missing data for 'permit' and 'construction_date' 
        as well as transforming 'date_recorded' into a float.
    2. Scaling numeric data
    3. One-hot-encoding categorical data
    
    Methods:
    fit(X): Fit all transformers on a dataset.  Returns None.
    transform(X): Use fitted transformers to transform data.  Returns a dataframe.
    fit_transform(X): Fit and transform data using transformers.  Returns a dataframe.
    """
    
    def __init__(self):
        pass
    
    def fit(self,X,y=None):
        """
        fit tranformer on data.  Does not transform data.
        """
        self.cat_imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
        self.num_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
        self.ohe = OneHotEncoder(categories = 'auto', sparse = False, dtype = int, handle_unknown = 'ignore')
        self.scaler = StandardScaler()
        
        #create a dummy dataframe to fit correctly (date_recorded will be a float instead of a string in the
        #transform, so it needs to be that way in the fit.  Does NOT change the original dataframe passed.
        cleanX = X.copy()
        if ('date_recorded' in cleanX.columns) and (cleanX['date_recorded'].dtype == 'object'):
            cleanX['date_recorded'] = cleanX[['date_recorded']].applymap(lambda year: round(int(year.split(sep='-')[0]) + int(year.split(sep='-')[1])/12,2))
            
        clean_X_cat = self.cat_imputer.fit_transform(cleanX.select_dtypes(include = 'object'))
            
        clean_X_num = self.num_imputer.fit_transform(cleanX.select_dtypes(include = 'number'))
            
        self.scaler.fit(clean_X_num)
        self.ohe.fit(clean_X_cat)
        
    def transform(self,X,y=None):
        """
        transforms data by imputing missing values, scaling numeric features, and one-hot encoding categorical
        feature.  Returns a transformed dataframe using the previously fitted transformers.
        """
        
            #process dates into floats
        if ('date_recorded' in X.columns) and (X['date_recorded'].dtype == 'object'):
            X['date_recorded'] = X[['date_recorded']].applymap(lambda year: round(int(year.split(sep='-')[0]) + \
                                                            int(year.split(sep='-')[1])/12,2))
    
        #impute missing data
        
        X_cat = X.select_dtypes(include = 'object')
        X_cat_imp = pd.DataFrame(self.cat_imputer.transform(X_cat),
                             index = X_cat.index,
                             columns = X_cat.columns)
        
        X_num = X.select_dtypes(include = 'number')
        X_num_imp = pd.DataFrame(self.num_imputer.transform(X_num),
                     index = X_num.index,
                     columns = X_num.columns)
                                                            
        # One-hot encode categorical variables
        X_hot = pd.DataFrame(self.ohe.transform(X_cat_imp), 
                            index = X_cat_imp.index, 
                            columns = self.ohe.get_feature_names(X_cat_imp.columns)
                            )
        
        # Scale numeric variables
        X_num_ss = pd.DataFrame(self.scaler.transform(X_num_imp), 
                                index = X_num_imp.index, 
                                columns = X_num_imp.columns)
        
        # Return tranformed dataframe
        return pd.concat([X_num_ss, X_hot], axis = 1)
    
    def fit_transform(self,X,y=None):
        """
        Fits tranformer to data AND returns transformed dataframe.  DO NOT USE ON TESTING OR VALIDATION DATA! 
        """
        #create transformer objects
        self.cat_imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
        self.num_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
        self.ohe = OneHotEncoder(categories = 'auto', sparse = False, dtype = int, handle_unknown = 'ignore')
        self.scaler = StandardScaler()
    
            #process dates into floats
        if ('date_recorded' in X.columns) and (X['date_recorded'].dtype == 'object'):
            X['date_recorded'] = X[['date_recorded']].applymap(lambda year: round(int(year.split(sep='-')[0]) + \
                                                            int(year.split(sep='-')[1])/12,2))
        print(X['date_recorded'].dtype)
    
        #impute missing data
        
        X_cat = X.select_dtypes(include = 'object')
        X_cat_imp = pd.DataFrame(self.cat_imputer.fit_transform(X_cat),
                             index = X_cat.index,
                             columns = X_cat.columns)
        
        X_num = X.select_dtypes(include = 'number')
        X_num_imp = pd.DataFrame(self.num_imputer.fit_transform(X_num),
                     index = X_num.index,
                     columns = X_num.columns)
                                                            
        # One-hot encode categorical variables
        X_hot = pd.DataFrame(self.ohe.fit_transform(X_cat_imp), 
                            index = X_cat_imp.index, 
                            columns = self.ohe.get_feature_names(X_cat_imp.columns)
                            )
        
        # Scale numeric variables
        X_num_ss = pd.DataFrame(self.scaler.fit_transform(X_num_imp), 
                                index = X_num_imp.index, 
                                columns = X_num_imp.columns)
        
        # Return tranformed dataframe
        return pd.concat([X_num_ss, X_hot], axis = 1)

In [17]:
X, y = load_data()
wp = WellProcessor()
df = wp.fit_transform(X)
df['target'] = y['status_group']
df.head()

float64


Unnamed: 0_level_0,amount_tsh,date_recorded,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,1.847327,-1.133756,0.638965,-0.0807,-1.408791,-0.024044,-0.244325,-0.065494,-0.273835,0.105584,...,0,0,0,0,1,0,0,0,0,2
8776,-0.081479,1.054946,0.656927,-0.173915,1.207934,-0.024044,0.267409,-0.376921,0.101064,1.186187,...,0,0,0,0,1,0,0,0,0,2
34310,-0.156954,0.967398,-0.766019,0.901822,0.639751,-0.024044,0.324269,-0.169303,0.035292,1.08795,...,0,0,0,0,1,0,0,0,0,2
67743,-0.081479,0.868906,-1.610208,1.301245,-1.84972,-0.024044,4.247564,5.955432,-0.385647,-1.171491,...,0,0,0,0,1,0,0,0,0,0
19728,-0.081479,-0.77262,0.19392,-1.563592,1.317271,-0.024044,0.153691,-0.48073,-0.183947,0.203821,...,0,0,0,0,1,0,0,0,0,2


In [8]:
df['age'] = df['date_recorded'] - df['construction_year']
df.head()

KeyError: 'date_recorded'