In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import cos, sin

seed = 42
#!pip install category_encoders
#!pip install xgboost 
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import f_classif, chi2, SelectKBest
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from category_encoders.hashing import HashingEncoder
from category_encoders.binary import BinaryEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

# Import the Dataset & Test Set

In [2]:
# Alternate input location, kept for reference:
#df = pd.read_csv("../input/train_features.csv", header=0)

# Training features, test features and training labels; row 0 is the header in each file.
df = pd.read_csv("train_features.csv", header=0)
df_test = pd.read_csv("test_features.csv", header=0)
df_labels = pd.read_csv("train_labels.csv", header=0)

# Class counts of the target (not the last expression of the cell, so nothing is displayed).
df_labels['status_group'].value_counts()

# Attach the target to the feature rows by matching on 'id'.
df = pd.merge(df, df_labels, on='id')



In [3]:
# Class balance of the target after the merge.
df['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

## Alternate Dataset - Drop Weakest Category

In [4]:
# Two-class variant of the data: every row except the 'functional needs repair' ones.
df_slim = df.loc[df['status_group'] != 'functional needs repair']

In [5]:
# Class balance of the two-class variant.
df_slim['status_group'].value_counts()

functional        32259
non functional    22824
Name: status_group, dtype: int64

## Check for NaNs

In [6]:
# Based on the most voted answer we can easily define a function that gives us a dataframe to preview the missing values and the % of missing values in each column:
def missing_values_table(df):
    """Summarise the missing values of a DataFrame, one row per affected column.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns a table indexed by column name with the
    columns 'Missing Values' (count) and '% of Total Values' (share of rows,
    rounded to one decimal). Columns whose share is 0 are left out and the
    rows are sorted by share, largest first.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    summary = pd.concat([null_counts, null_share], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only the columns that actually have something missing.
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    total_columns = str(df.shape[1])
    affected_columns = str(summary.shape[0])
    print("Your selected dataframe has " + total_columns + " columns.\n"
          "There are " + affected_columns +
          " columns that have missing values.")
    return summary

In [7]:
# Show, per column of the merged training frame, how many values are missing (count and % of rows).
missing_values_table(df)

Your selected dataframe has 41 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
scheme_name,28166,47.4
scheme_management,3877,6.5
installer,3655,6.2
funder,3635,6.1
public_meeting,3334,5.6
permit,3056,5.1
subvillage,371,0.6


In [22]:
# Number of distinct values per column, smallest first.
counts_ = []
features = df.columns.tolist()

# One [column, number of unique values] pair per column of df.
fc = [[name, df[name].nunique()] for name in features]

cardinality = pd.DataFrame(data=fc, columns=['features', 'counts'])
cardinality.sort_values(by=['counts']).set_index(['features'])

Unnamed: 0_level_0,counts
features,Unnamed: 1_level_1
recorded_by,1
public_meeting,2
permit,2
status_group,3
source_class,3
quantity_group,5
quantity,5
management_group,5
quality_group,6
waterpoint_type_group,6


In [10]:
# Encode my Y label and return a list of my labels. 
def labeler(dataframe, column):
    """Integer-encode a target column and report the code-to-name mapping.

    The distinct values of ``dataframe[column]`` are sorted and numbered
    0..k-1. The number of classes is taken from the data instead of being
    fixed at three, so the same function works for the full target and for
    reduced (e.g. two-class) variants.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the target column. It is not modified.
    column : str
        Name of the column to encode.

    Returns
    -------
    (pandas.Series, list)
        The encoded target as a Series named 'labels' on the frame's index,
        and ``[labels, label_names]`` where ``labels`` is ``[0, ..., k-1]``
        and ``label_names[i]`` is the original value encoded as ``i``.
    """
    # np.unique returns the sorted distinct values plus, for every row, the
    # position of its value in that sorted array - i.e. the integer code.
    classes, codes = np.unique(dataframe[column].to_numpy(), return_inverse=True)
    encoded = pd.Series(codes, index=dataframe.index, name='labels')

    labels = list(range(len(classes)))
    label_names = list(classes)
    label_list = [labels, label_names]
    return encoded, label_list


def slimlabeler(dataframe, column):
    """Integer-encode the target column of a reduced-class frame.

    The distinct values of ``dataframe[column]`` are sorted and numbered
    0..k-1. The label list is derived from the classes actually present
    rather than being hard-coded to two entries, so no class is silently
    left out of the mapping.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the target column. It is not modified.
    column : str
        Name of the column to encode.

    Returns
    -------
    (pandas.Series, list)
        The encoded target as a Series named 'labels' on the frame's index,
        and ``[labels, label_names]`` where ``labels`` is ``[0, ..., k-1]``
        and ``label_names[i]`` is the original value encoded as ``i``.
    """
    # Sorted distinct values and the per-row position within them (the code).
    classes, codes = np.unique(dataframe[column].to_numpy(), return_inverse=True)
    encoded = pd.Series(codes, index=dataframe.index, name='labels')

    labels = list(range(len(classes)))
    label_names = list(classes)
    label_list = [labels, label_names]
    return encoded, label_list


# Turn Lat/Long into x,y,z, coord plane. 
def lat_long(dataframe, degrees=True):
    """Replace latitude/longitude with x, y, z coordinates on the unit sphere.

    The trigonometric functions work in radians, so degree-valued coordinates
    are converted first; feeding degrees straight into cos/sin (as the earlier
    version did) yields meaningless coordinates.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with numeric 'latitude' and 'longitude' columns. Not modified.
    degrees : bool, default True
        Whether the two columns are expressed in degrees. Pass False when
        they are already in radians.

    Returns
    -------
    pandas.DataFrame
        A copy with 'x_coord', 'y_coord' and 'z_coord' added and the
        'latitude' and 'longitude' columns removed.
    """
    converted = dataframe.copy()
    lat = converted['latitude'].to_numpy(dtype=float)
    lon = converted['longitude'].to_numpy(dtype=float)
    if degrees:
        lat = np.radians(lat)
        lon = np.radians(lon)

    # Standard spherical-to-Cartesian conversion for a unit sphere.
    cos_lat = np.cos(lat)
    converted['x_coord'] = cos_lat * np.cos(lon)
    converted['y_coord'] = cos_lat * np.sin(lon)
    converted['z_coord'] = np.sin(lat)
    return converted.drop(columns=['latitude', 'longitude'])

# Fix silly boolean issue. 
def no_bool(dataframe, columns):
    """Return a copy of the frame with booleans in ``columns`` spelled out.

    In each listed column True becomes 'Yes' and False becomes 'No'; other
    values and other columns are left as they are. The input frame is not
    modified.
    """
    yes_no = {True: 'Yes', False: 'No'}
    converted = dataframe.copy()
    for name in columns:
        converted[name] = converted[name].replace(yes_no)
    return converted
        

In [None]:
# Code from previous attempt. May revive it. 
#for i in names:
#    temp_df.loc[temp_df[i].value_counts()[temp_df[i]].values < 20, i] = "RARE_VALUE"

In [None]:
# Now to decide how to handle the following madness:
"""
wpt_name	37400: These have some sites that encompass more than one well. 
date_recorded	356: DROP, missing data. 

# I'll deal with these ones after getting a new baseline. 
lga	125
funder	1897 - Trim?
    print(data.funder.value_counts()[data.funder.value_counts()>777])
    print(len(list(data.funder.value_counts()[data.funder.value_counts()<777])))
ward	2092 
installer	2145
scheme_name	2696 drop
subvillage	19287
""";

In [None]:
## This portion was tedious. I first looked at the .nunique counts for all of these then compared the value counts of each pair to select the features.
## Feature	value_count()

# DROP status_group	3 : 
# DROP id	59400: because duh. 
# DROP recorded_by	1 : Only one source. 

# KEEP public_meeting	2 : Boolean to object. 
# KEEP permit	2: Boolean to object.  

# KEEP source_class	3 
# KEEP source	10: These two are sufficiently different. 
# DROP source_type	7 : This one is source but with less detail. Not ordered. 

# DROP quantity_group	5 : These two are exactly the same. Drop either. 
# KEEP  quantity	5 : enough, insufficient, dry, seasonal, unknown  - possibly cat/ordinal

# KEEP management	12 : Most detailed, not ordered. 
# DROP management_group	5 : less-detailed, summary-grouping. Not ordered.
# DROP scheme_management	12 : Missing Values, similar info to management. 

# DROP payment	7
# KEEP payment_type	7 : Shorter Column headers. Identical otherwise.

# KEEP water_quality	8
# DROP quality_group	6

# DROP waterpoint_type_group	6
# KEEP waterpoint_type	7

# DROP extraction_type_class	7
# DROP extraction_type_group	13
# KEEP extraction_type	18


### GEOGRAPHY 
# DROP longitude	57516: Replace with X,Y,Z coordinates
# DROP latitude	57517

# KEEP basin	9 - Non-regional border; may overlap with some other indicators of location.

# DROP region	21 
# KEEP region_code	27: Region Code contains more detail than Region. 

# DROP district_code	20
# KEEP lga	125 : Lga is district with urban/rural flag added in on some values. Lga is most detailed.

# ward	2092
# gps_height	2428
# subvillage	19287
# wpt_name	37400


### OTHER
# construction_year	55
# num_private	65
# amount_tsh	98
# date_recorded	356
# population	1049


### High Cardinality
# funder	1897
# installer	2145
# PDROP  scheme_name	2696 - High cardinality, low 

#['id', 'status_group', 'recorded_by', 'region', 'funder', 'district_code', 'extraction_type_group', 'management_group', 'quality_group', 'waterpoint_type_group', 'extraction_type_class', 'wpt_name']
#['region_code', 'installer', 'scheme_name', 'date_recorded', 'lga', 'subvillage', 'basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type']

#['id', 'status_group', 'recorded_by', 'region', 'funder', 'district_code', 'wpt_name', 'extraction_type_group', 'management_group', 'quality_group', 'waterpoint_type_group', 'extraction_type_class']
#['region_code', 'installer', 'scheme_name', 'date_recorded', 'lga', 'subvillage', 'basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type']

In [None]:
['id', 'status_group', 'recorded_by', 'region', 'funder', 'district_code', 'extraction_type_group', 'management_group', 'quality_group', 'waterpoint_type_group', 'extraction_type_class', 'wpt_name']
['region_code', 'installer', 'scheme_name', 'date_recorded', 'lga', 'subvillage', 'basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type']
['basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'amount_tsh', 'gps_height', 'num_private', 'population', 'construction_year', 'x_coord', 'y_coord', 'z_coord', 'region_code', 'installer', 'scheme_name', 'date_recorded', 'lga', 'subvillage', 'basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type']

In [24]:
# How often each waterpoint type occurs in the training data.
df['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [11]:
'''# "BASELINE FEATURES"
### Fix or Enhance Features
data = df
data = lat_long(data)
data = no_bool(data, ['permit', 'public_meeting'])

### Define those datasets
X = data
y, label_list = labeler(df, 'status_group')
y.head()

### feature thoughts
nope = ['recorded_by','construction_year', 'payment_type', 'quantity_group','source_type']
good = ['permit']
first = ['extraction_type','management', 'payment','water_quality', 'quantity','source', 'source_class','waterpoint_type']
second = ['extraction_type_group','management_group','quality_group','waterpoint_type_group']
third = ['extraction_type_class']

### Passthrough features. These are any I don't want to mess with. 
passthrough_features = []

###Testing
to_be_tested = ['subvillage']
to_be_tested_remove = ['construction_year', 'payment_type', 'quantity_group']
test_remove = second + third
test_add_oh = []
test_add_binary = []
test_add_numeric = []

### Ones that aren't actually numeric.
binary_features = ['region_code','installer','scheme_name','date_recorded','lga','subvillage','wpt_name']
hashing_features = []

### Dropping features
drop_features_tune = ['id', 'status_group','recorded_by',"region",'funder','district_code',]
drop_features_testing = ['wpt_name',]
drop_features = drop_features_tune + test_remove + drop_features_testing

### Defining my one-hot variables. 
one_hot_features = list(X.select_dtypes(include=['object']))
numeric_features = list(X.select_dtypes(include=['float64', 'int64']))

### Remove features from duplicates if present. 
for x in drop_features:
    if x in one_hot_features: one_hot_features.remove(x)
    if x in numeric_features: numeric_features.remove(x)
    if x in binary_features: binary_features.remove(x)
    if x in hashing_features: hashing_features.remove(x)

for x in binary_features:
    if x in one_hot_features: one_hot_features.remove(x)
    if x in numeric_features: numeric_features.remove(x)
        
for x in hashing_features: 
    if x in one_hot_features: one_hot_features.remove(x)
    if x in numeric_features: numeric_features.remove(x)

binary_features = binary_features + one_hot_features

the_selected_features = passthrough_features + one_hot_features + numeric_features + binary_features + hashing_features

print(drop_features)
print(binary_features)
print(the_selected_features)'''

['id', 'status_group', 'recorded_by', 'region', 'funder', 'district_code', 'extraction_type_group', 'management_group', 'quality_group', 'waterpoint_type_group', 'extraction_type_class', 'wpt_name']
['region_code', 'installer', 'scheme_name', 'date_recorded', 'lga', 'subvillage', 'basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type']
['basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'amount_tsh', 'gps_height', 'num_private', 'population', 'construction_year', 'x_coord', 'y_coord', 'z_coord', 'region_code', 'installer', 'scheme_name', 'date_recorded', 'lga', 'subvillage', 'basin', 'ward', 'public_meeting', 'scheme_management', 'per

In [21]:
# Hand-written feature shortlists. NOTE(review): none of these five names is
# referenced by the later cells visible in this notebook - confirm before relying on them.

# Core candidate features.
base = ['amount_tsh', 'gps_height', 'basin', 'scheme_management', 
        'construction_year', 'extraction_type', 'management_group', 
        'water_quality', 'payment_type', 'source', 'waterpoint_type']
# Candidates kept in a separate list (presumably because of their many distinct values - confirm).
base_large = [ 'installer',  'funder']
# Candidate flagged as needing a fix before use.
base_fix = ['population']

# need = rural/non-rural
my_sug =[ 'longitude', 'latitude']

# Columns earmarked for removal; the trailing ';' suppresses the cell's output.
drop = [ 'status_group', 'id', 'date_recorded','longitude','latitude','wpt_name',
        'num_private','subvillage','region','region_code','district_code','lga','ward',
        'public_meeting', 'recorded_by', 'scheme_name', 'permit', 'extraction_type_group',
        'extraction_type_class', 'management', 'payment', 'quality_group', 'quantity',
        'quantity_group', 'source_type', 'source_class', 'waterpoint_type_group'];

In [133]:
# "TESTING FEATURES"
### Fix or Enhance Features
# no_bool works on a copy, so df itself stays untouched.
#data = lat_long(data)
data = no_bool(df, ['permit', 'public_meeting'])

### Define those datasets
# X still carries 'status_group' and 'id'; both are listed in drop_features below.
X = data
y, label_list = labeler(df, 'status_group')

### Passthrough features. These are any I don't want to mess with.
# Defined here so the cell runs on a fresh kernel; previously the name only
# existed inside the disabled "BASELINE FEATURES" string cell (NameError below).
passthrough_features = []

### Ones that aren't actually numeric.
pca_1_features = []
pca_2_features = ['installer']
numeric_features = ['amount_tsh', 'gps_height','construction_year']
hashing_features = []
binary_features = ['lga', 'basin',  'waterpoint_type','extraction_type_class',
                    'water_quality', 'payment_type', 'source', 'public_meeting',  'permit']

### Dropping features
drop_features = ['status_group', 'id', 'date_recorded','wpt_name','region','district_code',
        'num_private','recorded_by', 'scheme_name','extraction_type_group','extraction_type', 
         'management', 'payment', 'quality_group', 'quantity',
        'quantity_group', 'source_type', 'source_class', 'waterpoint_type_group','latitude', 'longitude'];

dpca_features = ['ward','subvillage','region_code']

### Remove features from duplicates if present.
# A dropped column must not appear in any encoder list.
dropped = set(drop_features)
binary_features = [c for c in binary_features if c not in dropped]
hashing_features = [c for c in hashing_features if c not in dropped]
numeric_features = [c for c in numeric_features if c not in dropped]

# Columns already claimed by the binary / hashing encoders are not one-hot or numeric.
claimed = dropped | set(binary_features) | set(hashing_features)

### Defining my one-hot variables.
# Every remaining object-typed column, in the frame's column order.
one_hot_features = [c for c in X.select_dtypes(include=['object']) if c not in claimed]
numeric_features = [c for c in numeric_features if c not in claimed]

# All leftover categorical columns are binary-encoded as well.
# NOTE(review): this puts 'installer', 'ward' and 'subvillage' into binary_features
# although they are also listed in pca_2_features / dpca_features, so a preprocessor
# using all three lists encodes them twice - confirm that this is intended.
binary_features = binary_features + one_hot_features
the_selected_features = passthrough_features + one_hot_features + numeric_features + binary_features + hashing_features

print(drop_features)
print(binary_features)
print(the_selected_features)

['status_group', 'id', 'date_recorded', 'wpt_name', 'region', 'district_code', 'num_private', 'recorded_by', 'scheme_name', 'extraction_type_group', 'extraction_type', 'management', 'payment', 'quality_group', 'quantity', 'quantity_group', 'source_type', 'source_class', 'waterpoint_type_group', 'latitude', 'longitude']
['lga', 'basin', 'waterpoint_type', 'extraction_type_class', 'water_quality', 'payment_type', 'source', 'public_meeting', 'permit', 'funder', 'installer', 'subvillage', 'ward', 'scheme_management', 'management_group']
['funder', 'installer', 'subvillage', 'ward', 'scheme_management', 'management_group', 'amount_tsh', 'gps_height', 'construction_year', 'lga', 'basin', 'waterpoint_type', 'extraction_type_class', 'water_quality', 'payment_type', 'source', 'public_meeting', 'permit', 'funder', 'installer', 'subvillage', 'ward', 'scheme_management', 'management_group']


In [137]:
# Preprocessing pipelines for both numeric and categorical data.
# Using column_transformer https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
# Each pipeline below handles one kind of column. Columns that are not assigned
# to any transformer are dropped (ColumnTransformer's default remainder).

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns, one-hot: gaps become their own 'missing' category;
# categories unseen during fit are ignored at transform time.
one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Categorical columns, binary-encoded.
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('binary', BinaryEncoder(drop_invariant=True,return_df=True))])

# Categorical columns, hash-encoded.
hashing_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('hashing', HashingEncoder())])

# Binary encoding compressed to 5 principal components. The PCA is seeded with
# the notebook-wide `seed` so repeated runs give the same components whenever a
# randomised solver is selected.
funder_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('binary', BinaryEncoder(drop_invariant=True,return_df=True)),
    ('pca1', PCA(n_components = 5, random_state = seed))])

installer_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('binary', BinaryEncoder(drop_invariant=True,return_df=True)),
    ('pca2', PCA(n_components = 5, random_state = seed))])

# Same steps as binary_transformer; kept as its own object for the dpca columns.
dPca_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('binary', BinaryEncoder(drop_invariant=True,return_df=True))])

########### TESTING PREPROCESSOR ########
# Create preprocessor pipeline
# n_jobs=-2 runs the transformers in parallel on all but one CPU.
PreProcessorTest = ColumnTransformer(
    transformers=[
#        ('pass', 'passthrough', passthrough_features),
        ('drop', 'drop', drop_features),
        #('hash', hashing_transformer, hashing_features),
        ('biy', binary_transformer, binary_features),
        ('num', numeric_transformer, numeric_features),
        ('fndr',funder_transformer, pca_1_features),
        ('instr', installer_transformer, pca_2_features),
        #('o-h', one_hot_transformer, one_hot_features),
        ('dpca', dPca_transformer, dpca_features)
    ],
    n_jobs = -2)

########### BASELINE PREPROCESSOR ########
# Create preprocessor pipeline
PreProcessor = ColumnTransformer(
    transformers=[
#        ('pass', 'passthrough', passthrough_features),
        ('drop', 'drop', drop_features),
        #('hash', hashing_transformer, hashing_features),
        ('biy', binary_transformer, binary_features),
        ('num', numeric_transformer, numeric_features),
        #('fndr',funder_transformer, funder_features)
        #('o-h', one_hot_transformer, one_hot_features)
    ],
    n_jobs = -2)

In [138]:
# Hold out 10% of the rows for validation; the split is seeded with the notebook-wide seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    random_state=seed,
    test_size=0.1)
print(X_train.shape, y_train.shape)


(53460, 41) (53460,)


In [139]:
# Fit the experimental preprocessor on the training split and show the shape of the encoded matrix.
encoded_train = PreProcessorTest.fit_transform(X_train, y_train)
encoded_train.shape
# PreProcessor.fit_transform(X_train, y_train).shape

(53460, 124)

## Test Cell

In [140]:
# BASIC XGBC: one fit on the experimental preprocessor, scored on train and validation.
# NOTE(review): the objective is 'binary:logistic' although labeler() encodes three
# classes; the xgboost sklearn wrapper presumably switches to a multiclass objective - confirm.
clf4_settings = {
    'silent': True,
    'scale_pos_weight': 1,
    'learning_rate': 0.02,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 100,
    'reg_alpha': 0.2,
    'max_depth': 25,
    'gamma': 5,
    'nthread': 10,
    'random_state': seed,
}
clf4 = make_pipeline(PreProcessorTest, XGBClassifier(**clf4_settings))

clf4.fit(X_train, y_train)
# First line is the training accuracy, second the validation accuracy.
print("model score: %.6f" % clf4.score(X_train, y_train))
print("model score: %.6f" % clf4.score(X_val, y_val))

model score: 0.772596
model score: 0.754714


In [None]:
# BASIC XGBC, longer run: 1000 trees at a smaller learning rate on the experimental preprocessor.
clf4_settings = {
    'silent': True,
    'scale_pos_weight': 1,
    'learning_rate': 0.008,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'reg_alpha': 0.0,
    'max_depth': 20,
    'gamma': 1,
    'nthread': 10,
    'random_state': seed,
}
clf4 = make_pipeline(PreProcessorTest, XGBClassifier(**clf4_settings))

clf4.fit(X_train, y_train)
# First line is the training accuracy, second the validation accuracy.
print("model score: %.6f" % clf4.score(X_train, y_train))
print("model score: %.6f" % clf4.score(X_val, y_val))

In [93]:
# CV XGBC: 5-fold cross-validated accuracy of the experimental preprocessor + XGBoost.
clf3_settings = {
    'silent': True,
    'scale_pos_weight': 1,
    'learning_rate': 0.02,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'reg_alpha': 0.4,
    'max_depth': 22,
    'gamma': 1,
    'nthread': 10,
    'random_state': seed,
}
clf3 = make_pipeline(PreProcessorTest, XGBClassifier(**clf3_settings))

# return_estimator=True keeps the pipeline fitted on each fold in scores['estimator'].
scores = cross_validate(
    clf3, X, y,
    cv=5,
    scoring='accuracy',
    return_estimator=True)

# One row per fold; 'test_score' is shown as 'validation_score'.
pd.DataFrame(scores).rename(columns={"test_score": 'validation_score'})

KeyboardInterrupt: 

## Feature Baseline - Run First, Don't Touch.

In [16]:
# BASIC XGBC: one fit on the baseline preprocessor, scored on train and validation.
clf1_settings = {
    'silent': True,
    'scale_pos_weight': 1,
    'learning_rate': 0.02,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 100,
    'reg_alpha': 0.0,
    'max_depth': 17,
    'gamma': 1,
    'nthread': 10,
    'random_state': seed,
}
clf1 = make_pipeline(PreProcessor, XGBClassifier(**clf1_settings))

clf1.fit(X_train, y_train)
# First line is the training accuracy, second the validation accuracy.
print("model score: %.6f" % clf1.score(X_train, y_train))
print("model score: %.6f" % clf1.score(X_val, y_val))

model score: 0.920651
model score: 0.814983


In [17]:
# CV XGBC: 5-fold cross-validated accuracy of the baseline preprocessor + XGBoost.
clf2_settings = {
    'silent': True,
    'scale_pos_weight': 1,
    'learning_rate': 0.02,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 100,
    'reg_alpha': 0.4,
    'max_depth': 17,
    'gamma': 1,
    'nthread': 10,
    'random_state': seed,
}
clf2 = make_pipeline(PreProcessor, XGBClassifier(**clf2_settings))

# return_estimator=True keeps the pipeline fitted on each fold in scores['estimator'].
scores = cross_validate(
    clf2, X, y,
    cv=5,
    scoring='accuracy',
    return_estimator=True)

# One row per fold; 'test_score' is shown as 'validation_score'.
pd.DataFrame(scores).rename(columns={"test_score": 'validation_score'})



Unnamed: 0,fit_time,score_time,estimator,validation_score,train_score
0,23.21708,0.909517,"Pipeline(memory=None,\n steps=[('columntra...",0.810454,0.896252
1,23.203826,0.913671,"Pipeline(memory=None,\n steps=[('columntra...",0.807255,0.896757
2,23.194623,0.906995,"Pipeline(memory=None,\n steps=[('columntra...",0.809175,0.898822
3,22.772361,0.819793,"Pipeline(memory=None,\n steps=[('columntra...",0.80564,0.898169
4,22.919886,0.916996,"Pipeline(memory=None,\n steps=[('columntra...",0.806196,0.899268


In [None]:
# To retrieve a CV model: the pipeline fitted on the first fold.
# Looked up by key rather than by column position, so it does not depend on the
# order of the entries in `scores` or on positional Series indexing.
maiest = scores['estimator'][0]

In [None]:
# Baseline preprocessor + XGBoost with reg_alpha=0.4, single fit on the training split.
clf5_settings = {
    'silent': True,
    'scale_pos_weight': 1,
    'learning_rate': 0.02,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 100,
    'reg_alpha': 0.4,
    'max_depth': 17,
    'gamma': 1,
    'nthread': 10,
    'random_state': seed,
}
clf5 = make_pipeline(PreProcessor, XGBClassifier(**clf5_settings))

clf5.fit(X_train, y_train)
# First line is the training accuracy, second the validation accuracy.
print("model score: %.6f" % clf5.score(X_train, y_train))
print("model score: %.6f" % clf5.score(X_val, y_val))

In [None]:
# ANN

## XGBoost Tuning

In [None]:
## XGBoost Tuning: Tree Max Depth
# Settings held fixed during the search; only max_depth varies.
xgb_fixed = dict(
    silent=False,
    scale_pos_weight=1,
    learning_rate=0.01,
    colsample_bytree=0.8,
    subsample=0.8,
    objective='binary:logistic',
    n_estimators=100,
    reg_alpha=0.4,
    max_depth=17,
    gamma=1,
)
XGPipe = make_pipeline(PreProcessor, XGBClassifier(**xgb_fixed))

# Other candidates tried at different times: n_estimators, colsample_bytree,
# learning_rate, reg_lambda (see the following tuning cells).
param_grid = {'xgbclassifier__max_depth': [5, 10, 15, 17, 19, 20]}
grid = {}

# 3-fold grid search on the training split, scored by accuracy.
XGsearch = GridSearchCV(
    XGPipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=10,
    n_jobs=-2)
XGsearch.fit(X_train, y_train)

print("train model score: %.3f" % XGsearch.score(X_train, y_train))
print("validate model score: %.3f" % XGsearch.score(X_val, y_val))
best_params, best_score = XGsearch.best_params_, XGsearch.best_score_
print(best_params)

In [None]:
# XGBoost Tuning: Number of Trees(cycles)
# Settings held fixed during the search; only n_estimators varies.
xgb_fixed = dict(
    silent=False,
    scale_pos_weight=1,
    learning_rate=0.01,
    colsample_bytree=0.8,
    subsample=0.8,
    objective='binary:logistic',
    n_estimators=100,
    reg_alpha=0.4,
    max_depth=17,
    gamma=1,
)
XGPipe = make_pipeline(PreProcessor, XGBClassifier(**xgb_fixed))

param_grid = {'xgbclassifier__n_estimators': [450, 550, 1000]}
grid = {}

# 3-fold grid search on the training split, scored by accuracy.
XGsearch = GridSearchCV(
    XGPipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=10,
    n_jobs=-2)
XGsearch.fit(X_train, y_train)

print("train model score: %.3f" % XGsearch.score(X_train, y_train))
print("validate model score: %.3f" % XGsearch.score(X_val, y_val))
best_params, best_score = XGsearch.best_params_, XGsearch.best_score_
print(best_params)

In [None]:
## XGBoost Tuning: (colsample)
# Settings held fixed during the search; only colsample_bytree varies.
xgb_fixed = dict(
    silent=False,
    scale_pos_weight=1,
    learning_rate=0.01,
    colsample_bytree=0.8,
    subsample=0.8,
    objective='binary:logistic',
    n_estimators=450,
    reg_alpha=0.4,
    max_depth=17,
    gamma=1,
)
XGPipe = make_pipeline(PreProcessor, XGBClassifier(**xgb_fixed))

param_grid = {'xgbclassifier__colsample_bytree': [.3, .8, .9, 1.0]}
grid = {}

# 3-fold grid search on the training split, scored by accuracy.
XGsearch = GridSearchCV(
    XGPipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=10,
    n_jobs=-2)
XGsearch.fit(X_train, y_train)

print("train model score: %.3f" % XGsearch.score(X_train, y_train))
print("validate model score: %.3f" % XGsearch.score(X_val, y_val))
best_params, best_score = XGsearch.best_params_, XGsearch.best_score_
print(best_params)

In [None]:
## XGBoost Tuning: learning rate and L2 regularisation (reg_lambda)
# Settings held fixed during the search; learning_rate and reg_lambda vary jointly.
xgb_fixed = dict(
    silent=False,
    scale_pos_weight=1,
    learning_rate=0.01,
    colsample_bytree=0.3,
    subsample=0.8,
    objective='binary:logistic',
    n_estimators=450,
    reg_alpha=0.1,
    max_depth=17,
    gamma=1,
)
XGPipe = make_pipeline(PreProcessor, XGBClassifier(**xgb_fixed))

param_grid = {
    'xgbclassifier__learning_rate': [0.001, 0.0045, 0.0065, 0.010],
    'xgbclassifier__reg_lambda': [0, 0.01, 0.10, 0.50, 1],
}
grid = {}

# 3-fold grid search on the training split, scored by accuracy.
XGsearch = GridSearchCV(
    XGPipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=10,
    n_jobs=-2)
XGsearch.fit(X_train, y_train)

print("train model score: %.3f" % XGsearch.score(X_train, y_train))
print("validate model score: %.3f" % XGsearch.score(X_val, y_val))
best_params, best_score = XGsearch.best_params_, XGsearch.best_score_
print(best_params)

In [None]:
# Feature importances of the XGBoost step from the first CV fold.
# clf2 itself was only handed to cross_validate, which fits copies of it, so the
# fitted pipelines live in scores['estimator'] (return_estimator=True above).
scores['estimator'][0].named_steps['xgbclassifier'].feature_importances_

In [None]:
# Baseline preprocessor followed by a logistic regression.
pipe2 = make_pipeline(
    PreProcessor,  
    LogisticRegression())

# Inverse regularisation strengths to try (the comma between 3 and 10 was
# missing, which made the whole cell a syntax error).
param_grid2 = {
    'logisticregression__C': [0.01, 0.1, 1, 3, 10]
    }

# Fit on the train set, with grid search cross-validation
gs2 = GridSearchCV(pipe2, param_grid=param_grid2, cv=3, 
                      scoring='accuracy', 
                      verbose=10, n_jobs=-2)
gs2.fit(X_train, y_train)


# First line is the training accuracy, second the validation accuracy.
print("model score: %.3f" % gs2.score(X_train, y_train))
print("model score: %.3f" % gs2.score(X_val, y_val))

### Trying with Ridge Classifier

In [None]:
# Baseline preprocessor -> univariate feature selection (ANOVA F-test) -> ridge classifier.
pipe = make_pipeline(PreProcessor, SelectKBest(f_classif), RidgeClassifier())

# NOTE(review): k is bounded by the number of raw columns in X_train, not by the
# number of features the preprocessor emits - confirm that this is the intended range.
param_grid = {
    'ridgeclassifier__alpha': [0.001, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    'selectkbest__k': range(1, len(X_train.columns)+1),
}

# 5-fold grid search on the training split, scored by accuracy.
gs = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=10,
    n_jobs=-2)
gs.fit(X_train, y_train)

# First line is the training accuracy, second the validation accuracy.
print("model score: %.3f" % gs.score(X_train, y_train))
print("model score: %.3f" % gs.score(X_val, y_val))

### Deep diving on some of these features.


In [None]:
# Helpful Encoding :https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159
tabs_pivot = ['construction_year']
columns_pivot = ['region_code']

# Share of each construction year within every (lga, ward) pair; each row sums to 1.
pd.crosstab(
    index=[df['lga'], df['ward']],
    columns=df['construction_year'],
    margins=False,
    normalize='index')

In [None]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 3-fold CV around a logistic regression,
# removing one feature per step, fed by the baseline preprocessor.
rfecv_selector = RFECV(
    estimator=LogisticRegression(),
    step=1,
    min_features_to_select=1,
    cv=3,
    scoring='accuracy',
    verbose=10,
    n_jobs=-2)
clf8 = make_pipeline(PreProcessor, rfecv_selector)

clf8.fit(X_train, y_train)
# First line is the training accuracy, second the validation accuracy.
print("model score: %.3f" % clf8.score(X_train, y_train))
print("model score: %.3f" % clf8.score(X_val, y_val))

In [None]:
# Final model used for the submission: baseline preprocessor + XGBoost, fitted
# on ALL labelled rows (X, y) rather than on the training split only.
# random_state=seed is passed like in every other classifier of this notebook,
# so the submitted model is tied to the same seed as the validated ones.
# Note that this rebinds clf1, replacing the earlier train-split baseline model.
clf1 = make_pipeline(PreProcessor, XGBClassifier(silent=True, 
                      scale_pos_weight=1,
                      learning_rate=0.02,  
                      colsample_bytree = 0.3,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=100, 
                      reg_alpha = 0.4,
                      max_depth=17, 
                      gamma=1,
                      nthread = 10,
                      random_state = seed))

clf1.fit(X, y)

## Pre-process the test data, make predictions, and format submission

In [None]:
print(df_test.shape)
# Prepare the test frame exactly like the training frame: the training cell only
# applies no_bool (its lat_long call is commented out), so lat_long is not applied
# here either - otherwise the test columns would differ from the ones the
# preprocessor was fitted on.
test_data = no_bool(df_test, ['permit', 'public_meeting'])
# Placeholder so the test frame has the same columns as the training frame;
# 'status_group' is listed in drop_features, so its values are never used.
test_data['status_group'] = df_test.id
#test_data = PreProcessor.fit_transform(test_data)
print(test_data.shape)

In [None]:
# Predict an encoded class for every row of the prepared test frame.
predictions = clf1.predict(test_data)

# label_list is [codes, names]; pair them up to translate codes back into names.
keys, values = label_list[0], label_list[1]
status_labels = dict(zip(keys, values))
decode = np.vectorize(status_labels.get)
predictions_mapped = decode(predictions)

In [None]:
# Build the submission table: one row per test-set id with its predicted status_group.
submission = pd.DataFrame({
    'id': df_test['id'],
    'status_group': predictions_mapped,
})

# The row counts of the two shapes should match.
print (df_test.shape)
print (submission.shape)

# Preview the first 5 rows
submission.head()

In [None]:
# Write the submission frame to a CSV file that can be uploaded.
# The path is relative, so the file is created in the current working directory.
filename = 'submission.csv'
submission.to_csv(path_or_buf=filename, index=False)
print('Saved file: ' + filename)

In [None]:
# Export the number of rows recorded for each ward to a CSV file.
# The path is relative, so the file is created in the current working directory.
# reset_index must be *called*: without the parentheses `wards` is a bound
# method, and `wards.to_csv` raises AttributeError.
wards = pd.DataFrame(df.ward.value_counts()).reset_index()
filename = 'wards.csv'
wards.to_csv(filename, index=False)
print('Saved file: ' + filename)

In [None]:
# Ward frequency table; reset_index() is called so `wards` is a DataFrame
# rather than a reference to the bound method.
wards = pd.DataFrame(df.ward.value_counts()).reset_index()


# MISC Ref Material

In [None]:
def get_column_names_from_ColumnTransformer(column_transformer):
    """Collect the output column names of a fitted ColumnTransformer-like object.

    Parameters
    ----------
    column_transformer : object
        Anything exposing ``transformers_`` as an iterable of
        ``(name, transformer, columns)`` triples.

    Returns
    -------
    list
        Feature names in transformer order. For each transformer the names come
        from ``get_feature_names_out()`` or, failing that, ``get_feature_names()``
        (of the last step when the transformer exposes pipeline-style ``steps``).
        If neither is available, the raw column specification is used instead.

    Notes
    -----
    * The entry named ``'remainder'`` is skipped by name. Slicing off the last
      entry would discard a real transformer whenever no remainder entry is
      present.
    * Transformers set to the string ``'drop'`` contribute no names.
    * Column specifications that are neither a string nor a list/tuple/array-like
      (for example a slice or a callable) cannot be resolved here and are skipped.
    """
    col_name = []
    for step_name, transformer, raw_col_name in column_transformer.transformers_:
        if step_name == 'remainder':
            continue
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        # Pipeline-like objects: the names are produced by the final step.
        steps = getattr(transformer, 'steps', None)
        if steps:
            transformer = steps[-1][1]

        names = None
        for getter in ('get_feature_names_out', 'get_feature_names'):
            try:
                names = getattr(transformer, getter)()
                break
            except AttributeError:
                # getter missing (or it raised an AttributeError subclass): try the next one
                continue
        if names is None:
            names = raw_col_name

        if isinstance(names, str):
            col_name.append(names)
        elif hasattr(names, 'tolist'):  # numpy arrays, pandas Index, ...
            col_name += list(names.tolist())
        elif isinstance(names, (list, tuple)):
            col_name += list(names)
    return col_name

# Display the column names reported for the notebook's PreProcessor.
_preprocessor_columns = get_column_names_from_ColumnTransformer(PreProcessor)
_preprocessor_columns

In [None]:
# Rewriting my Dummy Regression Baseline one as a function
def baseline(data):
    """Fit a mean-predicting dummy regressor and report its MAE.

    Parameters
    ----------
    data : object
        Passed straight to ``split``, which must return
        ``X_train, X_test, y_train, y_test`` (``X_train`` needs ``.columns``).

    Returns
    -------
    list
        ``[name, scorer, train_score, test_score, score_variance, cv_score,
        selected_names, unselected_names, best_params, best_estimator]``.
        This is the same layout ``compare`` returns, so the rows can be tabulated together.
        ``cv_score`` is fixed at 0.0 and ``best_estimator`` is an empty string
        because no search is run; ``best_params`` is the pipeline's parameter dict.
    """
    # Imported locally: neither name appears in the notebook's import cell.
    from sklearn.base import clone
    from sklearn.dummy import DummyRegressor
    from sklearn.metrics import mean_absolute_error

    name = "Dummy Regression Baseline"
    # Split data into train and test
    X_train, X_test, y_train, y_test = split(data)

    # make_pipeline names each step after its lowercased class name.
    # A clone of the shared PreProcessor is fitted so that the instance used by
    # the other pipelines in this notebook is not refitted as a side effect.
    pipe = make_pipeline(
        clone(PreProcessor),
        DummyRegressor(strategy='mean'))
    pipe.fit(X_train, y_train)

    scorer = 'MAE'

    # Mean absolute error on both splits
    train_score = mean_absolute_error(y_train, pipe.predict(X_train))
    test_score = mean_absolute_error(y_test, pipe.predict(X_test))
    score_variance = test_score - train_score
    cv_score = 0.0
    best_params = pipe.get_params()  # call it: the bound method itself is not a parameter dict
    best_estimator = ""
    selected_names = list(X_train.columns)
    unselected_names = []

    return [name, scorer, train_score, test_score, score_variance, cv_score, selected_names, unselected_names, best_params, best_estimator]


# Rewriting my GridSearch CV as a function 
def compare(data, name):
    """Grid-search a PreProcessor -> SelectKBest(f_regression) -> Ridge pipeline.

    Parameters
    ----------
    data : object
        Passed straight to ``split``, which must return
        ``X_train, X_test, y_train, y_test`` (``X_train`` needs ``.columns``).
    name : str
        Label stored as the first element of the returned row.

    Returns
    -------
    list
        ``[name, scorer, train_score, test_score, score_variance, cv_score,
        selected_names, unselected_names, best_params, best_estimator]``.
        Scores are mean absolute errors (the negated ``neg_mean_absolute_error``).
    """
    # Imported locally: neither name appears in the notebook's import cell.
    from sklearn.feature_selection import f_regression
    from sklearn.linear_model import Ridge

    X_train, X_test, y_train, y_test = split(data)

    pipe = make_pipeline(
        PreProcessor,
        SelectKBest(f_regression),
        Ridge())

    # NOTE(review): k is ranged over the number of *input* columns; if the
    # PreProcessor changes the column count this range may not match - confirm.
    param_grid = {
        'selectkbest__k': range(1, len(X_train.columns)+1),
        'ridge__alpha': [0.1, 1.0, 10.]
    }

    scorer = 'MAE'

    # Fit on the train set, with grid search cross-validation
    gs = GridSearchCV(pipe, param_grid=param_grid, cv=3,
                      scoring='neg_mean_absolute_error',
                      verbose=0)
    gs.fit(X_train, y_train)

    # The scorer is negated MAE, so flip the sign back.
    train_score = -gs.score(X_train, y_train)
    test_score = -gs.score(X_test, y_test)
    score_variance = test_score - train_score
    cv_score = -gs.best_score_
    best_params = gs.best_params_
    best_estimator = gs.best_estimator_

    # 'selectkbest' is the autogenerated step name of SelectKBest() in the pipeline
    selector = best_estimator.named_steps['selectkbest']
    # get_support returns a True / False mask over the columns SelectKBest received
    selected_mask = selector.get_support()

    # The mask covers the preprocessor's *output* columns. Use the raw column
    # names only when the counts agree; otherwise ask the fitted preprocessor,
    # and as a last resort fall back to positional names, so that boolean
    # indexing below cannot fail on a length mismatch.
    all_names = X_train.columns
    if len(all_names) != len(selected_mask):
        fitted_preprocessor = best_estimator.steps[0][1]
        resolved = None
        for getter in ('get_feature_names_out', 'get_feature_names'):
            try:
                resolved = pd.Index(getattr(fitted_preprocessor, getter)())
                break
            except (AttributeError, NotImplementedError, ValueError):
                continue
        if resolved is None or len(resolved) != len(selected_mask):
            resolved = pd.Index(['feature_%d' % i for i in range(len(selected_mask))])
        all_names = resolved

    selected_names = list(all_names[selected_mask])
    unselected_names = list(all_names[~selected_mask])

    return [name, scorer, train_score, test_score, score_variance, cv_score, selected_names, unselected_names, best_params, best_estimator]

In [None]:
#pd.DataFrame({'Variable':X.columns,
#              'Importance':clf2.named_steps['xgbclassifier'].feature_importances_}).sort_values('Importance', ascending=False)
#pipe.steps[0][1].get_feature_names()