In [None]:
# Helpful Encoding :https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159


In [79]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np
from math import cos, sin

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import f_classif, chi2, SelectKBest
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from category_encoders.hashing import HashingEncoder
from category_encoders.binary import BinaryEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

from keras.models import Model, Sequential
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop, SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

seed = 123456
np.random.seed(seed)
#!pip install category_encoders
#!pip install xgboost 
import xgboost as xgb

In [80]:
#df = pd.read_csv("../input/train_features.csv", header=0)

df = pd.read_csv("train_features.csv", header=0)
df_test = pd.read_csv("test_features.csv",header=0)
df_labels = pd.read_csv("train_labels.csv", header=0)
df_labels['status_group'].value_counts()
df = df.merge(df_labels, on='id')

In [109]:
df.waterpoint_type.value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [82]:
df_slim = df[df.status_group != 'functional needs repair']

In [110]:
df.waterpoint_type_group.value_counts()

communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

## REVISIT THIS DATA. DROP NAN ROWS? 5%

In [84]:
# Based on the most voted answer we can easily define a function that gives us a dataframe to preview the missing values and the % of missing values in each column:
def missing_values_table(df):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns

In [85]:
missing_values_table(df)

Your selected dataframe has 41 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
scheme_name,28166,47.4
scheme_management,3877,6.5
installer,3655,6.2
funder,3635,6.1
public_meeting,3334,5.6
permit,3056,5.1
subvillage,371,0.6


In [86]:
# Encode my Y label and return a list of my labels. 
def labeler(dataframe, column):
    from sklearn.preprocessing import LabelEncoder
    dataframe =  dataframe.copy()
    le = LabelEncoder()
    dataframe['labels'] = le.fit_transform(dataframe[column])
    labels = [0,1,2]
    label_names = list(le.inverse_transform(labels))
    label_list = [labels, label_names]
    return dataframe['labels'], label_list


def slimlabeler(dataframe, column):
    from sklearn.preprocessing import LabelEncoder
    dataframe =  dataframe.copy()
    le = LabelEncoder()
    dataframe['labels'] = le.fit_transform(dataframe[column])
    labels = [0,1]
    label_names = list(le.inverse_transform(labels))
    label_list = [labels, label_names]
    return dataframe['labels'], label_list


# Turn Lat/Long into x,y,z, coord plane. 
def lat_long(dataframe):
    from math import cos, sin 
    dataframe =  dataframe.copy()
    dataframe['x_coord'] = dataframe.latitude.apply(lambda x: cos(x)) * dataframe.longitude.apply(lambda x: cos(x))
    dataframe['y_coord'] = dataframe.latitude.apply(lambda x: cos(x)) * dataframe.longitude.apply(lambda x: sin(x))
    dataframe['z_coord'] = dataframe.latitude.apply(lambda x: sin(x))
    dataframe = dataframe.drop(columns=['latitude', 'longitude'])
    return dataframe

# Fix silly boolean issue. 
def no_bool(dataframe, columns):
    dataframe =  dataframe.copy()
    for column in columns:
        dataframe[column] = dataframe[column].replace({True: 'Yes', False: 'No'})
    return dataframe
        

In [87]:
## This portion was tedious. I first looked at the .nunique counts for all of these then compared the value counts of each pair to select the features.
## Feature	value_count()

# DROP status_group	3 : 
# DROP id	59400: because duh. 
# DROP recorded_by	1 : Only one source. 

# KEEP public_meeting	2 : Boolean to object. 
# KEEP permit	2: Boolean to object.  

# KEEP source_class	3 
# KEEP source	10: These two are sufficiently different. 
# DROP source_type	7 : This one is source but with less detail. Not ordered. 

# DROP quantity_group	5 : These two are exactly the same. Drop either. 
# KEEP  quantity	5 : enough, insufficient, dry, seasonal, unknown  - possibly cat/ordinal

# KEEP management	12 : Most detailed, not ordered. 
# DROP management_group	5 : less-detailed, summary-grouping. Not ordered.
# DROP scheme_management	12 : Missing Values, similar info to management. 

# DROP payment	7
# KEEP payment_type	7 : Shorter Column headers. Identical otherwise.

# KEEP water_quality	8
# DROP quality_group	6

# DROP waterpoint_type_group	6
# KEEP waterpoint_type	7

# DROP extraction_type_class	7
# DROP extraction_type_group	13
# KEEP extraction_type	18


### GEOGRAPHY 
# DROP longitude	57516: Replace with X,Y,Z coordinates
# DROP latitude	57517

# KEEP basin	9 - Non-Regional boarder may overlap with some other indicators of location. 

# DROP region	21 
# KEEP region_code	27: Region Code contains more detail than Region. 

# DROP district_code	20
# KEEP lga	125 : Lga is district with urban/rural flag added in on some values. Lga is most detailed.

# ward	2092
# gps_height	2428
# subvillage	19287
# wpt_name	37400


### OTHER
# construction_year	55
# num_private	65
# amount_tsh	98
# date_recorded	356
# population	1049


### High Cardinality
# funder	1897
# installer	2145
# PDROP  scheme_name	2696 - High cardinality, low 



In [88]:
ward_counts = X['ward'].str.lower().value_counts()
X['ward_wpt_count'] = [val if ward_counts[val] > k else 'other for ward in X['ward'].str.lower()]
                       
                       
for i in names:
    temp_df.loc[temp_df[i].value_counts()[temp_df[i]].values < 20, i] = "RARE_VALUE"
                       
                       
#http://www.citypopulation.de/php/tanzania-northern-admin.php

SyntaxError: invalid syntax (<ipython-input-88-63ea48ad80e3>, line 2)

In [114]:
# Fix or Enhance Features
data = df
#data = lat_long(data)
data = no_bool(data, ['permit', 'public_meeting'])

# Define those datasets
X = data
y, label_list = labeler(df, 'status_group')
y.head()

# feature thoughts
nope = ['recorded_by','construction_year', 'payment_type', 'quantity_group','source_type']
good = ['permit']
first = ['management', 'payment','water_quality', 'quantity','source', 'source_class','waterpoint_type']
second = ['extraction_type','management_group','quality_group','waterpoint_type_group']
third = ['extraction_type_class']

# Passthrough features. These are any I don't want to mess with. 
passthrough_features = []

#Testing
to_be_tested = ['subvillage']
to_be_tested_remove = ['construction_year', 'payment_type', 'quantity_group']
test_remove = second + third
test_add_oh = []
test_add_binary = []
test_add_numeric = []

# Ones that aren't actually numeric.
binary_features = ['region_code','installer','scheme_name','date_recorded','lga','subvillage','wpt_name','extraction_type_group','waterpoint_type']
hashing_features = []

# Dropping features
drop_features_tune = ['id', 'status_group','recorded_by',"region",'funder','district_code',]
drop_features_testing = ['wpt_name','date_recorded']
drop_features = drop_features_tune + test_remove + drop_features_testing

# my need to convert region_code to 

# Ones to binary encode (high cardinality)
#binary_features = ['funder','installer','scheme_name', 'subvillage','region_code', 'district_code']

# Defining my one-hot variables. 
one_hot_features = list(X.select_dtypes(include=['object']))
numeric_features = list(X.select_dtypes(include=['float64', 'int64']))

# Remove features from duplicates if present. 
for x in drop_features:
    if x in one_hot_features: one_hot_features.remove(x)
    if x in numeric_features: numeric_features.remove(x)
    if x in binary_features: binary_features.remove(x)
    if x in hashing_features: hashing_features.remove(x)

for x in binary_features:
    if x in one_hot_features: one_hot_features.remove(x)
    if x in numeric_features: numeric_features.remove(x)
        
for x in hashing_features: 
    if x in one_hot_features: one_hot_features.remove(x)
    if x in numeric_features: numeric_features.remove(x)

binary_features = binary_features + one_hot_features
# List features
#print (numeric_features)
#print (drop_features)
print (one_hot_features)
print (binary_features)
the_selected_features = passthrough_features + one_hot_features + numeric_features + binary_features + hashing_features

['basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class']
['region_code', 'installer', 'scheme_name', 'lga', 'subvillage', 'extraction_type_group', 'waterpoint_type', 'basin', 'ward', 'public_meeting', 'scheme_management', 'permit', 'management', 'payment', 'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class']


In [115]:
# Preprocess Pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])

one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('binary', BinaryEncoder(drop_invariant=True,return_df=True))])

hashing_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('hashing', HashingEncoder())])

# Create preprocessor pipeline
PreProcessor = ColumnTransformer(
    transformers=[
        ('drop', 'drop', drop_features),
        ('biy', binary_transformer, binary_features),
        ('num', numeric_transformer, numeric_features)
        #('o-h', one_hot_transformer, one_hot_features)
    ],
    n_jobs = -2)

# Lets test it.

In [116]:
# Test Train Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state = seed)
print(X_train.shape, y_train.shape)
PreProcessor.fit_transform(X_train, y_train).shape

(53460, 41) (53460,)


(53460, 113)

In [117]:
def get_column_names_(column_transformer):    
    col_name = []
    for transformer_in_columns in column_transformer.transformers_[:-1]:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)
    return col_name

get_column_names_(PreProcessor)

['id',
 'status_group',
 'recorded_by',
 'region',
 'funder',
 'district_code',
 'extraction_type',
 'management_group',
 'quality_group',
 'waterpoint_type_group',
 'extraction_type_class',
 'wpt_name',
 'date_recorded',
 'region_code',
 'installer',
 'scheme_name',
 'lga',
 'subvillage',
 'extraction_type_group',
 'waterpoint_type',
 'basin',
 'ward',
 'public_meeting',
 'scheme_management',
 'permit',
 'management',
 'payment',
 'payment_type',
 'water_quality',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class']

## Feature Baseline

In [118]:
# Standard Scaler 
clf3 = make_pipeline(PreProcessor, XGBClassifier(silent=True, 
                      scale_pos_weight=1,
                      learning_rate=0.02,  
                      colsample_bytree = 0.3,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=100, 
                      reg_alpha = 0.4,
                      max_depth=17, 
                      gamma=1,
                      nthread = 10,random_state = seed))

clf3.fit(X_train, y_train)
print("model score: %.6f" % clf3.score(X_train, y_train))
print("model score: %.6f" % clf3.score(X_val, y_val))

model score: 0.887299
model score: 0.812626


In [15]:
# MinMax Scaler
clf3 = make_pipeline(PreProcessor, XGBClassifier(silent=True, 
                      scale_pos_weight=1,
                      learning_rate=0.02,  
                      colsample_bytree = 0.3,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=100, 
                      reg_alpha = 0.4,
                      max_depth=17, 
                      gamma=1,
                      nthread = 10))

clf3.fit(X_train, y_train)
print("model score: %.6f" % clf3.score(X_train, y_train))
print("model score: %.6f" % clf3.score(X_val, y_val))

model score: 0.891339
model score: 0.814815


## Now for an attempt at a deep learning classifier for it. 

In [None]:
# DL  Preprocess Pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])

one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Create preprocessor pipeline
PreProcessor = ColumnTransformer(
    transformers=[
        ('drop', 'drop', drop_features),
        ('biy', binary_transformer, binary_features),
        ('num', numeric_transformer, numeric_features),
        #('o-h', one_hot_transformer, binary_features)
    ],
    n_jobs = -2)

In [None]:
# Need pd-dummies-friendly y value. 
y2 = df['status_group']
X2 = PreProcessor.fit_transform(X)

# Test Train Split
X_train, X_val, y_train, y_val = train_test_split(X2, y2, test_size=0.33, random_state = seed)
print(X_train.shape, y_train.shape)

In [None]:
# Some Keras magic
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Activation, Dropout
from keras import optimizers

### A seven layer deep and 128 wide net. 

Nothing fancy, not even sure this would qualify as deep learning – but throw in some dropout between them to help it to not overfit. I like 7 layer dip.

Learning rate for the optimization method Adam might be something to tune on other datasets but here 0.001 seems to work nicely.

In [75]:
m = Sequential()
m.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.25))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.5))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.5))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.5))
m.add(Dense(128, activation='relu'))
m.add(Dropout(0.5))
m.add(Dense(len(np.unique(y)), activation='softmax'))

m.compile(
    optimizer=optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [76]:
m.fit(
    # Feature matrix
    X_train,
    # Target class one-hot-encoded
    pd.get_dummies(pd.DataFrame(y_train)).as_matrix(),
    # Iterations to be run if not stopped by EarlyStopping
    epochs=200,
    callbacks=[
        # Stop iterations when validation loss has not improved
        EarlyStopping(monitor='val_loss', patience=25),
        # Nice for keeping the last model before overfitting occurs
        ModelCheckpoint(
            'best.model',
            monitor='val_loss',
            save_best_only=True,
            verbose=1
        )
    ],
    verbose=2,
    validation_split=0.1,
    batch_size=256,
)

  """


Train on 35818 samples, validate on 3980 samples
Epoch 1/200
 - 3s - loss: 0.9480 - acc: 0.5056 - val_loss: 0.8981 - val_acc: 0.5513

Epoch 00001: val_loss improved from inf to 0.89808, saving model to best.model
Epoch 2/200
 - 2s - loss: 0.9023 - acc: 0.5375 - val_loss: 0.8942 - val_acc: 0.5513

Epoch 00002: val_loss improved from 0.89808 to 0.89420, saving model to best.model
Epoch 3/200
 - 2s - loss: 0.8985 - acc: 0.5406 - val_loss: 0.8916 - val_acc: 0.5513

Epoch 00003: val_loss improved from 0.89420 to 0.89158, saving model to best.model
Epoch 4/200
 - 2s - loss: 0.8899 - acc: 0.5511 - val_loss: 0.8815 - val_acc: 0.5553

Epoch 00004: val_loss improved from 0.89158 to 0.88147, saving model to best.model
Epoch 5/200
 - 2s - loss: 0.8270 - acc: 0.6330 - val_loss: 0.7822 - val_acc: 0.6897

Epoch 00005: val_loss improved from 0.88147 to 0.78217, saving model to best.model
Epoch 6/200
 - 2s - loss: 0.7735 - acc: 0.6841 - val_loss: 0.7748 - val_acc: 0.6930

Epoch 00006: val_loss improved

 - 2s - loss: 0.5353 - acc: 0.7959 - val_loss: 0.7198 - val_acc: 0.7430

Epoch 00057: val_loss did not improve from 0.69173
Epoch 58/200
 - 2s - loss: 0.5327 - acc: 0.7951 - val_loss: 0.7186 - val_acc: 0.7432

Epoch 00058: val_loss did not improve from 0.69173
Epoch 59/200
 - 2s - loss: 0.5325 - acc: 0.7957 - val_loss: 0.7357 - val_acc: 0.7402

Epoch 00059: val_loss did not improve from 0.69173
Epoch 60/200
 - 2s - loss: 0.5329 - acc: 0.7964 - val_loss: 0.7162 - val_acc: 0.7455

Epoch 00060: val_loss did not improve from 0.69173
Epoch 61/200
 - 2s - loss: 0.5324 - acc: 0.7962 - val_loss: 0.7166 - val_acc: 0.7342

Epoch 00061: val_loss did not improve from 0.69173
Epoch 62/200
 - 2s - loss: 0.5258 - acc: 0.7974 - val_loss: 0.7228 - val_acc: 0.7450

Epoch 00062: val_loss did not improve from 0.69173
Epoch 63/200
 - 2s - loss: 0.5300 - acc: 0.7982 - val_loss: 0.6843 - val_acc: 0.7520

Epoch 00063: val_loss improved from 0.69173 to 0.68429, saving model to best.model
Epoch 64/200
 - 2s - l

<keras.callbacks.History at 0x7f6a4c1ec1d0>

In [62]:
# Load the best model
m.load_weights("best.model")

# Keep track of what class corresponds to what index
mapping = pd.get_dummies(pd.DataFrame(y_train), prefix='', prefix_sep='').columns.values
y_test_preds = [mapping[pred] for pred in m.predict(X_val).argmax(axis=1)]

In [70]:
pd.crosstab(
    pd.Series(y_val, name='Actual'),
    pd.Series(y_test_preds, name='Predicted'),
    margins=True, normalize=True
)

#prediction = model.predict(y_val)
#target_names = ['Positive','Neutral','Negative']
#print(confusion_matrix(y_val, y_test_preds))

Predicted,functional,non functional,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
functional,0.424331,0.112581,0.536912
functional needs repair,0.058905,0.017225,0.07613
non functional,0.310366,0.076592,0.386958
All,0.793602,0.206398,1.0


In [73]:
print ('Accuracy: {0:.3f}'.format(accuracy_score(y_val, y_test_preds)))

Accuracy: 0.734


In [74]:
test_data = lat_long(df_test)
test_data['status_group'] = df_test.id
test_data = no_bool(test_data, ['permit', 'public_meeting'])
X_test = PreProcessor.transform(test_data)
X_test.shape

(14358, 116)

## LSTM!

In [None]:
nb_words  = min(MAX_NB_WORDS, len(word_index))
lstm_out = MAX_SEQUENCE_LENGTH

model = Sequential()
model.add(Embedding(nb_words,EMBEDDING_DIM,input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(50))
#model.add(Attention(MAX_SEQUENCE_LENGTH))
model.add(Dense(3, activation = 'softmax'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

## Pre-process the test data, make predictions, and format submission

In [119]:
clf3.fit(X, y)

Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=-2, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('drop', 'drop', ['id', 'status_group', 'recorded_by', 'region', 'funder', 'district_code', 'extraction_type', 'management_group', 'quality_grou...56, reg_alpha=0.4, reg_lambda=1,
       scale_pos_weight=1, seed=None, silent=True, subsample=0.8))])

In [120]:
print(df_test.shape)
test_data = lat_long(df_test)
test_data['status_group'] = df_test.id
test_data = no_bool(test_data, ['permit', 'public_meeting'])
#test_data = PreProcessor.fit_transform(test_data)
print(test_data.shape)

(14358, 40)
(14358, 42)


In [122]:
#Make predictions using the features from the test data set
predictions = clf3.predict(test_data)

#Display our predictions - they are either 0 or 1 for each training instance 
#depending on whether our algorithm believes the person survived or not.
predictions

# Map our labels back onto the predictions
keys = label_list[0]
values = label_list[1]
status_labels = dict(zip(keys, values))
predictions_mapped = np.vectorize(status_labels.get)(predictions)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [123]:
#Create a  DataFrame with the passengers ids and our prediction regarding whether they survived or not
submission = pd.DataFrame({'id':df_test['id'],'status_group':predictions_mapped})

# make sure the shapes line up. 
print (df_test.shape)
print (submission.shape)

#Visualize the first 5 rows
submission.head()

(14358, 40)
(14358, 2)


Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [124]:
#Convert DataFrame to a csv file that can be uploaded
#This is saved in the same directory as your notebook
filename = 'submission.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)

Saved file: submission.csv


# MISC Ref Material

In [None]:
def get_column_names_from_ColumnTransformer(column_transformer):    
    col_name = []
    for transformer_in_columns in column_transformer.transformers_[:-1]:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)
    return col_name

get_column_names_from_ColumnTransformer(PreProcessor)

In [None]:
# Rewriting my Dummy Regression Baseline one as a function
def baseline(data):
    name = "Dummy Regression Baseline"
    # Split data into train and test
    X_train, X_test, y_train, y_test = split(data)

    # Define an estimator and param_grid
    # WHEN DEFINING YOU CAN GIVE IT A NAME OTHERWISE IT WILL USE THE PIPELINE NAME AUTOGEN NAME (name of the function but lowercase)
    pipe = make_pipeline(
        PreProcesser(), 
        DummyRegressor(strategy='mean'))
    pipe.fit(X_train, y_train)

    scorer = 'MAE'

    ### Get the scores with the MAE Function
    y_pred_train = pipe.predict(X_train)  
    y_pred_test = pipe.predict(X_test)

    train_score = mean_absolute_error(y_train, y_pred_train)
    test_score = mean_absolute_error(y_test, y_pred_test)
    score_variance = test_score - train_score
    cv_score = 0.0000000000000
    best_params = pipe.get_params
    best_estimator = ""
    selected_names = list(X_train.columns)
    unselected_names = []

    return [name, scorer, train_score, test_score, score_variance, cv_score, selected_names, unselected_names, best_params, best_estimator]


# Rewriting my GridSearch CV as a function 
def compare(data, name):
    X_train, X_test, y_train, y_test = split(data)

    pipe = make_pipeline(
        PreProcessor, 
        SelectKBest(f_regression), 
        Ridge())

    param_grid = {
        'selectkbest__k': range(1, len(X_train.columns)+1), 
        'ridge__alpha': [0.1, 1.0, 10.]
    }

    scorer = 'MAE'

    # Fit on the train set, with grid search cross-validation
    gs = GridSearchCV(pipe, param_grid=param_grid, cv=3, 
                      scoring='neg_mean_absolute_error', 
                      verbose=0)
    gs.fit(X_train, y_train)

    train_score = -gs.score(X_train, y_train)
    test_score = -gs.score(X_test, y_test)
    score_variance = test_score - train_score
    cv_score = -gs.best_score_
    best_params = gs.best_params_
    best_estimator = gs.best_estimator_

    # selected features? 
    # 'selectkbest' is the autogenerated name of the SelectKBest() function in the pipeline
    selector = gs.best_estimator_.named_steps['selectkbest']
    all_names = X_train.columns

    # get_support returns a mask of the columns in True / False
    selected_mask = selector.get_support()
    # Passing the boolean list as the column names creates a masked list.  
    selected_names = list(all_names[selected_mask])
    unselected_names = list(all_names[~selected_mask])

    return [name, scorer, train_score, test_score, score_variance, cv_score, selected_names, unselected_names, best_params, best_estimator]

In [None]:
#pd.DataFrame({'Variable':X.columns,
#              'Importance':clf2.named_steps['xgbclassifier'].feature_importances_}).sort_values('Importance', ascending=False)
#pipe.steps[0][1].get_feature_names()