In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# All competition CSVs live in one directory; keep its location in a single
# place so it can be changed without touching every read below.
DATA_DIR = 'original_data'

train_features = pd.read_csv(f'{DATA_DIR}/train_features.csv')
train_labels = pd.read_csv(f'{DATA_DIR}/train_labels.csv')
test_features = pd.read_csv(f'{DATA_DIR}/test_features.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [5]:
def cleanup1(X):
    """
    Minimal viable cleaning.

    This function gets the data in minimal working order for a logistic
    regression. It fills in NANs (which are expected to appear only in the
    categorical features), changes the recording date to a number, drops
    one useless feature and standardizes the datatypes.

    Parameters
    ----------
    X : pandas.DataFrame (DF)
        Original, full-featured DF (train_features or test_features).
        Must contain every column named in this function
        (date_recorded, region_code, district_code, recorded_by and the
        columns listed in the dtype mapping below).

    Returns
    ----------
    X2 : pandas.DataFrame
        Cleaned DF.  The input DF is left untouched.
    """

    # Looking at all the features with missing values, it looks like those
    # features are all categorical variables where 'unknown' would be a
    # category we can work with.  fillna returns a new DF, so the caller's
    # DF is never modified and no separate copy is needed.
    X2 = X.fillna('unknown')

    # Regression on dates won't work.  Instead, turn the date_recorded
    # column into the (fractional) number of years since 2000-01-01
    # (the earliest date in the training data is from ~2002, and the
    # latest from 2013).  A year is taken to be 365 days.  The subtraction
    # and division are vectorized over the whole column.
    elapsed = pd.to_datetime(X2['date_recorded']) - pd.Timestamp('2000-01-01')
    X2['date_recorded'] = elapsed.dt.days / 365

    # region_code and district_code are int64, but they should really be
    # treated as categories (and there's only 20-30 classes in each),
    # so they are cast to strings instead.
    for code_col in ('region_code', 'district_code'):
        X2[code_col] = X2[code_col].astype('str')

    # recorded_by has only one value everywhere, and is therefore useless
    X2 = X2.drop(columns='recorded_by')

    # To prevent data conversion warnings, all the numerical features
    # (except id) become float64.
    #
    # public_meeting and permit held bool values and NANs.  The NANs were
    # replaced with the string 'unknown' above, which leaves mixed types
    # within a single column; casting both features to str makes them
    # homogeneous for later operations.
    type_dict = {'amount_tsh': 'float64',
                 'date_recorded': 'float64',
                 'gps_height': 'float64',
                 'longitude': 'float64',
                 'latitude': 'float64',
                 'num_private': 'float64',
                 'population': 'float64',
                 'construction_year': 'float64',
                 'public_meeting': 'str',
                 'permit': 'str'}

    return X2.astype(dtype=type_dict)

In [6]:
from sklearn.impute import MissingIndicator

def cleanup2(X, base_year=1960):
    """
    Fixes the numerical features.

    Each feature has different specific problems, but they usually have
    garbage values (usually zero) that should really be read as NANs.

    Those values get fixed, but the datapoints where they happened are
    also recorded, on the assumption that missing values tell us
    something about the well that the model might pick up later.

    The numerical features are taken out of the dataset and copied twice:
    num_fixed and num_nulls.  In num_fixed, garbage values are replaced
    with something better (usually the mean of that feature).  In
    num_nulls, garbage values are replaced with NANs.

    num_nulls is then turned into num_trashmarker, a DF holding 1.0 at
    each location where a NAN was found and 0.0 elsewhere.  It has one
    '<feature>_trash' column for each of the five features cleaned here
    (always, so that different input frames yield the same columns), plus
    one for any other numerical feature that contains NANs.

    Finally num_fixed and num_trashmarker are put back in front of the
    remaining (non-numerical) columns.  All pieces keep the index of X,
    so rows stay aligned whatever that index is.

    Note that the replacement means are computed from the frame passed
    in, so calling this on two different frames uses two different means.

    Parameters
    ----------
    X : pandas.DataFrame
        DF with raw numerical features.  Must contain numerical columns
        longitude, latitude, num_private, population and
        construction_year.
    base_year : int or float, optional
        Year that construction_year is measured from, and the year
        assumed for wells whose construction_year is garbage (zero).
        Defaults to 1960.

    Returns
    ----------
    X2 : pandas.DataFrame
         DF with cleaned numerical features and a new matrix of former
         garbage locations within those features.  X itself is not
         modified.
    """

    # Make a clean copy, to ensure we're not changing the original DF
    X2 = X.copy()

    # Make a list of numerical column names
    num_names = X2.select_dtypes('number').columns.tolist()

    # Make two copies of the numerical columns
    num_fixed = X2[num_names].copy()
    num_nulls = X2[num_names].copy()

    # ---------------------------------------------------------------
    # Longitudes of 0 and latitudes of -2.000000e-08 are trash.  Mark them
    # as NAN in num_nulls and replace them with the feature mean in
    # num_fixed.  The mean is taken from num_nulls, i.e. after the trash
    # became NAN, so that the garbage doesn't affect the mean.
    mean_filled = {'longitude': 0, 'latitude': -2.000000e-08}
    for feature, trash in mean_filled.items():
        num_nulls[feature] = num_nulls[feature].replace(trash, np.nan)
        clean_mean = num_nulls[feature].mean()
        num_fixed[feature] = num_fixed[feature].replace(trash, clean_mean)

    # num_private has an unknown meaning but a lot of zeros, and
    # population=0 could be legitimate but is suspicious.  Zeros tend to
    # be garbage in other features, so they are marked as NANs in
    # num_nulls in case that means something; num_fixed keeps the
    # original values.
    marked_only = ('num_private', 'population')
    for feature in marked_only:
        num_nulls[feature] = num_nulls[feature].replace(0, np.nan)

    # construction_year values of zero are garbage.  They are replaced
    # with base_year (1960 by default, the earliest year in the list) and
    # all values become the number of years since then.  Assuming these
    # pumps are older is probably better than assuming average age.
    feature = 'construction_year'
    num_nulls[feature] = num_nulls[feature].replace(0, np.nan)
    num_fixed[feature] = (num_fixed[feature].replace(0, base_year)
                          - float(base_year))

    # ---------------------------------------------------------------
    # Build the indicator columns straight from num_nulls so that they
    # inherit its index (building them from a bare array would give them
    # a fresh 0..n-1 index and misalign the concat below).
    is_trash = num_nulls.isna()
    always_marked = set(mean_filled) | set(marked_only) | {feature}
    marked = [name for name in num_names
              if name in always_marked or is_trash[name].any()]
    num_trashmarker = is_trash[marked].astype('float64').add_suffix('_trash')

    # Drop the numerical columns from X2, replace them with the fixed
    # ones, and add the trash markers.
    X2 = X2.drop(columns=num_names)
    X2 = pd.concat([num_fixed, num_trashmarker, X2], sort=False, axis=1)

    return X2

In [7]:
# Build the feature matrix for the logistic regression by sending the raw
# training features through both cleaning passes, in order.  X_train shows up
# later on as either a dataframe or a numpy array.
X_train = train_features.pipe(cleanup1).pipe(cleanup2)

# Target: the status_group column of the training labels.
y_train = train_labels['status_group']

In [None]:
# Rank the categorical (non-numerical) features by cardinality, i.e. their
# number of unique values, from lowest to highest, then keep the names of all
# of them except the 6 with the highest cardinality.
cols_to_keep = (
    X_train
    .select_dtypes(exclude='number')   # categorical features only
    .nunique()                         # cardinality per feature
    .sort_values()                     # ascending
    .index[:-6]                        # leave out the 6 largest
    .tolist()
)
cols_to_keep

In [None]:
# cols_to_keep only lists categorical features, so selecting it on its own
# would throw away every numerical feature and '_trash' marker produced by
# cleanup2.  Keep the numerical columns alongside the low-cardinality
# categorical ones.
# NOTE(review): `id` is left out on the assumption that it is a row
# identifier rather than a predictor -- confirm.
numerical_cols_to_keep = (
    X_train
    .select_dtypes(include='number')
    .columns
    .drop('id', errors='ignore')
    .tolist()
)
X_train = X_train[numerical_cols_to_keep + cols_to_keep]

In [None]:
# In this cell I define a pipeline that will scale and one-hot encode X_train, then
# feed it to the logistic regression.

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper

# Get lists of numerical and categorical features
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()
categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()

# Use a mapper to apply different transformations to the numerical and
# categorical features.  handle_unknown='ignore' makes the encoder emit an
# all-zero row for a category it did not see while fitting, instead of
# raising; without it the fitted pipeline cannot be applied to data (such as
# test_features) that contains a category absent from the training set.
mapper = DataFrameMapper(
  [([col], RobustScaler()) for col in numerical_cols] +
  [([col], OneHotEncoder(categories='auto', handle_unknown='ignore'))
   for col in categorical_cols]
)

# Wrap it all in a pipeline.  The parameters of the regression were chosen by
# trial and error with GridSearchCV in a separate notebook.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
pipe = make_pipeline(
    mapper,
    LogisticRegression(solver='lbfgs', multi_class='ovr',
                      max_iter=500))

In [None]:
%%time
pipe.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the very rows the pipeline was fitted on, so the number shown
# below is the training accuracy.
y_pred = pipe.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_pred)