In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import RobustScaler
from category_encoders.target_encoder import TargetEncoder
pd.set_option('display.max_columns', None)  # Unlimited columns

In [2]:
pd.__version__

'0.23.0'

In [10]:
sample_submission = pd.read_csv('original_data/sample_submission.csv')
X_test = pd.read_csv('original_data/test_features.csv')
X_train = pd.read_csv('original_data/train_features.csv')
y_train = pd.read_csv('original_data/train_labels.csv')

# Data cleanup
This time around, I'll make a single cleanup function that does all the work.  I also plan to use XGBoost and other classifiers that can handle null values, so I'll only get rid of them when I can replace them with more meaningful stuff.

In [4]:
def wrangle(X):
    """
    All-inclusive data cleanup.
    """
 
    # Make a clean copy, to ensure we're not changing the original DF
    X = X.copy()
    
    # Looking at all the features with missing values, it looks like those
    # features are all categorical variables where 'unknown' would be a
    # category we can work with.  I'll replace the NANs accordingly.
    X = X.fillna('unknown')
    
    # Regression on dates won't work.  Instead, I'll turn the 
    # date_recorded column into the number of years since 2000
    # (the earliest date in the training date is from ~2002, and the
    # latest from 2013.)
    dates = pd.to_datetime(X.date_recorded)
    year2000 = pd.to_datetime('2000-01-01')
    years = [i.days/365 for i in (dates - year2000)]
    X.date_recorded = years
    
    # region_code and district_code are int64, but they should really be
    # treated as categories (and there's only 20-30 classes in each).
    # I'll cast them as strings instead.
    X.region_code = X.region_code.astype('str')
    X.district_code = X.district_code.astype('str')
      
    # To prevent data conversion warnings, I'll turn all the numerical
    # features (except id) into float64.
    
    # Also, some columns contained bool values and NANs.  
    # (e.g., public_meeting, permit)
    # I replaced the NANs with strings, which created a problem for later
    # operations that don't like heterogeneous datatypes within a single
    # column. I'll prevent this problem by casting those two features as str.
    
    type_dict = {'amount_tsh':'float64',
                 'date_recorded':'float64',
                 'gps_height':'float64',
                 'longitude':'float64',
                 'latitude':'float64',
                 'num_private':'float64',
                 'population':'float64',
                 'construction_year':'float64',
                 'public_meeting':'str',
                 'permit':'str'}
    
    X = X.astype(dtype = type_dict)
    
    # Fixing the numerical columns.
    # ---------------------------------------------------------------    
    
    # Numerical columns have several kinds of garbage values that 
    # Must be replaced with nulls.
    numericals = ['amount_tsh',
                    'date_recorded',
                    'gps_height',
                    'longitude',
                    'latitude',
                    'num_private',
                    'population',
                    'construction_year']

    null_values = {'amount_tsh':0,
                     'date_recorded':0,
                     'gps_height':0,
                     'longitude':0,
                     'latitude':-2.000000e-08,
                     'num_private':0,
                     'population':0,
                     'construction_year':0}

    # I replace all garbage values with NANs.
    for feature, null in null_values.items():
        X[feature] = X[feature].replace(null, np.nan)
    
    # construction_year occasionally claims years far in the future, and 
    # could presumably also contain years way in the past.  I'll turn anything
    # not between 1960 and 2019 into a NAN.
    X['construction_year'] = [i if 1960 < i < 2019 else np.nan for i in X['construction_year']]

    
    # Whenever available, a good replacement value for a NAN is the 
    # mean or median value for the geographic region around it.

    # Replaces the NANs in a ward with the mean of the other rows in that 
    # same ward. If all the rows in a ward are NANs, though, they remain.
    for feature in numericals:
        replacements = X.groupby('ward')[feature].transform('mean')
        X[feature] = X[feature].fillna(replacements)

    # Replaces the NANs in a region with the mean of the other rows in that 
    # same region (which are much larger than wards)
    for feature in numericals:
        replacements = X.groupby('region')[feature].transform('mean')
        X[feature] = X[feature].fillna(replacements)
    
    # A few rows are not helped by averaging over the ward or the region.
    # Those can stay as they are, with NANs.
    
    # Fixing the categorical columns.
    # ---------------------------------------------------------------    
    
    # Create list of categorical features
    categoricals = X.select_dtypes(exclude='number').columns.tolist()

    # Make all strings lowercase, to collapse together some of the categories
    X[categoricals] = X[categoricals].applymap(lambda x: x.lower())

    # Replace common NAN values
    nan_list = ['not known','unknown','none','-','##','not kno','unknown installer']
    X = X.replace(nan_list, np.nan)

    # Any feature values with fewer than 100 rows gets turned into a NAN
    for feature in X[categoricals]:
        # Determine which feature values to keep
        to_keep = X[feature].value_counts()[X[feature].value_counts() > 100].index.tolist()
        # Turn those into NANs (using a copy, to prevent warnings)
        feature_copy = X[feature].copy()
        feature_copy[~feature_copy.isin(to_keep)] = np.nan
        X[feature] = feature_copy
    
    
    garbage = ['extraction_type_group','extraction_type_class',
               'region_code','waterpoint_type_group','source_type',
              'payment_type','quality_group','quantity_group',
              'recorded_by']
    
    X = X.drop(columns=garbage)
    
    X['age'] = X['date_recorded'] - X['construction_year']
    
    return X

In [5]:
def target_encode_cats(X, X_train, y_train):
    """
    Target encodes the categorical features of a dataframe X,
    after training the encoder on all the data in X_train and
    creating a different encoded column for each feature label
    in y_train.
    
    Target encoding is designed to work with
    binary labels; in order to make it work with a vector that has three
    values, I target encode against a binary version of each and then
    concatenate the results.

    Parameters
    ----------
    X : pandas.DataFrame
        Dataset to be fixed
        
    X_train : Training data to train the encoder.

    y_train : pandas.DataFrame
                    The vector of training labels

    Returns
    ----------
    X : pandas.DataFrame
            Fixed vector

    """
    # Make a clean copy, to ensure we're not changing the original DF
    X = X.copy()
    
    # Create a list of categorical columns
    cats = X_train.select_dtypes(exclude='number').columns.tolist()
    
    # Make binary lists of all the available labels
    y_true = y_train['status_group']
    y_works = [1.0 if x == 'functional' else 0.0 for x in y_true]
    y_broken = [1.0 if x == 'non functional' else 0.0 for x in y_true]
    y_repair = [1.0 if x == 'functional needs repair' else 0.0 for x in y_true]

    y_vectors = [y_works, y_broken, y_repair]
    X_TE_all = []

    # We want to create encoding based on X_train and y_train,
    # then apply this encoding to any vector X
    for i in [1,2,3]:
        # Make an encoder
        TE = TargetEncoder()
        
        # Fit it to the training data
        TE.fit(X=X_train[cats], y=y_vectors[i-1])

        # Transform the cat columns in X
        X_TE = TE.transform(X[cats])
        
        # Give them custom names, so that the columns encoded against
        # each target vector have a different name
        X_TE = X_TE.rename(columns=(lambda x: x + '_TE' + str(i)))
        X_TE_all.append(X_TE)

    new_cats = pd.concat(X_TE_all, sort=False, axis=1)
    
    X = X.drop(columns=cats)
    X = pd.concat([X,new_cats], sort=False, axis=1)
    
    return X

In [11]:
%%time
X_train = wrangle(X_train)
X_train = target_encode_cats(X_train, X_train, y_train)

CPU times: user 13.1 s, sys: 65.5 ms, total: 13.2 s
Wall time: 13.2 s


## Testing cleanup steps

In [33]:
from xgboost import XGBClassifier
modelxgb = XGBClassifier(objective = 'multi:softmax', booster = 'gbtree', 
                         nrounds = 'min.error.idx', num_class = 3, 
                         maximize = False, eval_metric = 'merror', eta = .1,
                         max_depth = 14, colsample_bytree = .4, n_jobs=-1)

y_true = y_train['status_group']

In [35]:
%%time
modelxgb.fit(X_train, y_true)

CPU times: user 2min 7s, sys: 249 ms, total: 2min 7s
Wall time: 1min 43s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, eta=0.1, eval_metric='merror', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=14, maximize=False,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
       nrounds='min.error.idx', nthread=None, num_class=3,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [30]:
%%time
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(modelxgb, X_train, y_true, scoring='accuracy', cv=5)

CPU times: user 18.7 s, sys: 4.75 s, total: 23.5 s
Wall time: 1min 28s


In [31]:
cv_score

array([0.81920714, 0.81424123, 0.81262626, 0.81203704, 0.81124768])

## DART

In [36]:
from xgboost import XGBClassifier
modelxgb = XGBClassifier(objective = 'multi:softmax', booster = 'dart', 
                         nrounds = 'min.error.idx', num_class = 3, 
                         maximize = False, eval_metric = 'merror', eta = .1,
                         max_depth = 14, colsample_bytree = .4, n_jobs=-1)

y_true = y_train['status_group']

In [37]:
%%time
modelxgb.fit(X_train, y_true)

CPU times: user 7min 16s, sys: 153 ms, total: 7min 16s
Wall time: 5min 59s


XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=0.4, eta=0.1, eval_metric='merror', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=14, maximize=False,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
       nrounds='min.error.idx', nthread=None, num_class=3,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

# Export

In [40]:
# Clean test data
X_test = wrangle(X_test)
X_test = target_encode_cats(X_test, X_train, y_train)

# Predict test data from the model
y_test_pred = modelxgb.predict(X_test)

# Make a dataframe with the answers
y_submit = pd.DataFrame({'id':test_features['id'],
                         'status_group':y_test_pred} )

ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields funder, installer, wpt_name, basin, subvillage, region, district_code, lga, ward, public_meeting, scheme_management, scheme_name, permit, extraction_type, management, management_group, payment, water_quality, quantity, source, source_class, waterpoint_type

In [None]:
# make a submission CSV file
y_submit.to_csv('DMA6.csv', index=False)