In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the four competition CSV files from the local original_data/ folder:
# the training features and labels first, then the test features and the
# sample submission file.
train_features = pd.read_csv('original_data/train_features.csv')
train_labels = pd.read_csv('original_data/train_labels.csv')
test_features = pd.read_csv('original_data/test_features.csv')
sample_submission = pd.read_csv('original_data/sample_submission.csv')

In [26]:
def cleanup1(X):
    """
    cleanup1 gets the data in minimal working order for a logistic regression.
    It fills NaNs, turns the date_recorded column into a number, drops one
    useless feature and standardizes the datatypes. The input frame itself
    is left untouched; a new frame is returned.

    >> Input
    X: Original, full-featured dataset (train_features or test_features)

    >> Output
    X2: Cleaned dataset ready for logistic regression
    """

    # Looking at all the features with missing values, it looks like those
    # features are all categorical variables where 'unknown' would be a
    # category we can work with, so the NaNs are replaced accordingly.
    # NOTE: fillna is applied to every column. A NaN in one of the columns
    # cast to float64 below would become the string 'unknown' and make that
    # cast fail.
    X2 = X.fillna('unknown')

    # Regression on dates won't work. Instead, date_recorded becomes the
    # (fractional) number of 365-day years elapsed since 2000-01-01 (the
    # earliest date in the training data is from ~2002, and the latest
    # from 2013). The subtraction and division are vectorized rather than
    # looping over every Timedelta in Python.
    dates = pd.to_datetime(X2['date_recorded'])
    year2000 = pd.to_datetime('2000-01-01')
    X2['date_recorded'] = (dates - year2000).dt.days / 365

    # region_code and district_code are int64, but they should really be
    # treated as categories (and there's only 20-30 classes in each), so
    # they are cast to strings.
    X2['region_code'] = X2['region_code'].astype('str')
    X2['district_code'] = X2['district_code'].astype('str')

    # recorded_by has only one value everywhere, and is therefore useless.
    X2 = X2.drop(columns='recorded_by')

    # To prevent data conversion warnings, all the numerical features
    # (except id) are cast to float64.
    #
    # public_meeting and permit held bool values and NaNs. Replacing the
    # NaNs with a string leaves mixed bool/str columns, which later
    # operations that expect a homogeneous dtype don't like, so both
    # features are cast to str as well.
    type_dict = {'amount_tsh': 'float64',
                 'date_recorded': 'float64',
                 'gps_height': 'float64',
                 'longitude': 'float64',
                 'latitude': 'float64',
                 'num_private': 'float64',
                 'population': 'float64',
                 'construction_year': 'float64',
                 'public_meeting': 'str',
                 'permit': 'str'}

    X2 = X2.astype(dtype=type_dict)

    return X2

In [12]:
# y_train holds the target labels (the status_group column of train_labels).
# NOTE(review): nothing here checks that train_labels and train_features are
# in the same row order -- confirm the two files are aligned.
y_train = train_labels.loc[:, 'status_group']

# X_train is the feature matrix that will go into the logistic regression.
# It starts out as the cleaned version of the raw training features.
X_train = cleanup1(train_features)

In [25]:
# Pick the low-cardinality categorical features. Reading the chain top-down:
#   1. keep only the non-numeric columns,
#   2. count the unique values in each one (its cardinality),
#   3. sort the features by ascending cardinality,
#   4. take the feature names, leaving out the 6 with the highest cardinality,
#   5. return them as a plain list.
cols_to_keep = (
    X_train
    .select_dtypes(exclude='number')
    .nunique()
    .sort_values()
    .index[:-6]
    .tolist()
)
cols_to_keep

['source_class',
 'permit',
 'public_meeting',
 'management_group',
 'quantity_group',
 'quantity',
 'quality_group',
 'waterpoint_type_group',
 'source_type',
 'payment',
 'payment_type',
 'waterpoint_type']

In [22]:
# Restrict the feature matrix to the columns listed in cols_to_keep.
# NOTE(review): cols_to_keep is built from the non-numeric columns only, so
# this step also discards every numeric feature -- confirm that is intended.
X_train = X_train.loc[:, cols_to_keep]

In [17]:
# This cell builds the modelling pipeline: a DataFrameMapper that scales the
# numeric columns and one-hot encodes the categorical ones, followed by a
# logistic regression.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn_pandas import DataFrameMapper

# Split the column names of X_train by dtype.
numerical_cols = list(X_train.select_dtypes(include='number').columns)
categorical_cols = list(X_train.select_dtypes(exclude='number').columns)

# One (columns, transformer) pair per feature: every numeric column gets its
# own RobustScaler and every categorical column its own OneHotEncoder.
mapper = DataFrameMapper([
    *(([col], RobustScaler()) for col in numerical_cols),
    *(([col], OneHotEncoder(categories='auto')) for col in categorical_cols),
])

# Chain the mapper and the classifier. The regression parameters were chosen
# by trial and error with GridSearchCV in a separate notebook.
pipe = make_pipeline(
    mapper,
    LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=500),
)

In [18]:
%%time
# Fit the whole pipeline (feature mapper + logistic regression) on the
# training features and labels; %%time reports how long the fit takes.
pipe.fit(X_train,y_train)

CPU times: user 1min 20s, sys: 573 ms, total: 1min 21s
Wall time: 42.5 s


Pipeline(memory=None,
     steps=[('dataframemapper', DataFrameMapper(default=False, df_out=False,
        features=[(['source_class'], OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)), (['permit'], OneHotEncoder(categ...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [20]:
from sklearn.metrics import accuracy_score

# Predict on the same rows the pipeline was fitted on and score them.
# This is therefore an in-sample (training) accuracy, not a held-out estimate.
y_pred = pipe.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.750016835016835