In [24]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
%matplotlib inline

In [2]:
# Load the previously cleaned training and test tables from the local
# Datasets directory (train first, then test).
df_train, df_test = (
    pd.read_csv('./Datasets/clean_train_data.csv'),
    pd.read_csv('./Datasets/clean_test_data.csv'),
)

In [3]:
# Display the (rows, columns) shape of the training and test tables side by side.
tuple(frame.shape for frame in (df_train, df_test))

((59400, 27), (14850, 26))

In [4]:
# Keep only the non-numeric columns of the training table.
# Note: `cols` is a DataFrame holding those columns, not a list of names.
cols = df_train.select_dtypes(exclude='number')

In [6]:
# Display the names of the non-numeric columns selected above.
list(cols.columns)

['date_recorded',
 'wpt_name',
 'basin',
 'region',
 'lga',
 'ward',
 'construction_year',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type',
 'status_group']

In [7]:
# Keep every column whose dtype is not 'object'.
# The dtype is spelled as the string 'object' because the `np.object` alias
# was deprecated in NumPy 1.20 and removed in NumPy 1.24, where referencing it
# raises AttributeError. The selection itself is unchanged.
# Note: `cols2` is a DataFrame holding those columns, not a list of names.
cols2 = df_train.select_dtypes(exclude=['object'])

In [8]:
# Display the names of the non-object columns selected above.
list(cols2.columns)

['id',
 'amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population']

In [22]:
def train_cats(df):
    """Turn every string-typed column of ``df`` into an ordered categorical.

    Each column for which ``is_string_dtype`` is true is replaced by its
    ``category`` version with ordering switched on. All other columns are
    left as they are.

    The frame is modified in place and nothing is returned.
    """
    for name, series in df.items():
        if not is_string_dtype(series):
            continue  # not a string column: leave untouched
        df[name] = series.astype('category').cat.as_ordered()

In [25]:
# Convert the string columns of the training table to ordered categoricals.
# train_cats assigns the converted columns back onto df_train, so the frame
# itself is changed by this cell.
train_cats(df=df_train)

In [26]:
# Show the distinct dtypes now present in the training table.
column_dtypes = df_train.dtypes
column_dtypes.unique()

array([dtype('int64'), dtype('float64'),
       CategoricalDtype(categories=['2002-10-14', '2004-01-07', '2004-03-01', '2004-03-06',
                  '2004-04-01', '2004-04-05', '2004-05-01', '2004-06-01',
                  '2004-07-01', '2004-08-01',
                  ...
                  '2013-08-03', '2013-09-02', '2013-09-03', '2013-10-02',
                  '2013-10-03', '2013-11-02', '2013-11-03', '2013-12-01',
                  '2013-12-02', '2013-12-03'],
                 ordered=True),
       CategoricalDtype(categories=['24', 'A Kulwa', 'A Saidi', 'Abass', 'Abbas',
                  'Abdala Hamisi', 'Abdala Mwandute', 'Abdalaa', 'Abdalah',
                  'Abdalah Ali',
                  ...
                  'Zuieni', 'Zuleha', 'Zumba', 'Zumbawanu',
                  'Zumbawanu Shuleni', 'Zungu', 'Zunguni', 'Zunzuli A Shuleni',
                  'Zuwena Kindo', 'none'],
                 ordered=True),
       CategoricalDtype(categories=['Internal', 'Lake Nyasa', 'Lake Ru

In [19]:
# One-hot encode the listed categorical columns of the training table.
# 'date_recorded' and 'status_group' are not in the list, so get_dummies
# leaves them as they are.
# NOTE(review): 'wpt_name' appears to have a very large number of distinct
# values, so the resulting frame may be extremely wide -- confirm this is
# intended.
dummy_columns = [
    'wpt_name', 'basin', 'region', 'lga', 'ward',
    'construction_year',
    'extraction_type_group', 'extraction_type_class',
    'management', 'management_group',
    'payment', 'quality_group', 'quantity',
    'source', 'source_class',
    'waterpoint_type',
]
dummy_df = pd.get_dummies(df_train, columns=dummy_columns)

In [20]:
# Inspect the distinct labels stored in 'status_group' to make sure they look
# right. NOTE(review): this is presumably the prediction target (dependent
# variable) rather than an independent variable -- confirm.
df_train.loc[:, 'status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [15]:
# Show the distinct dtypes in the encoded frame to verify which column types
# remain after creating the dummy columns.
dummy_dtypes = dummy_df.dtypes
dummy_dtypes.unique()

array([dtype('int64'), dtype('float64'), dtype('O'), dtype('uint8')],
      dtype=object)

id                                               int64
amount_tsh                                     float64
date_recorded                                   object
gps_height                                       int64
longitude                                      float64
latitude                                       float64
num_private                                      int64
region_code                                      int64
district_code                                    int64
lga                                             object
ward                                            object
population                                       int64
status_group                                    object
wpt_name_24                                      uint8
wpt_name_A Kulwa                                 uint8
wpt_name_A Saidi                                 uint8
wpt_name_Abass                                   uint8
wpt_name_Abbas                                   uint8
wpt_name_A