# CatBoost model 

In [221]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, make_scorer
import matplotlib.pyplot as plt

In [222]:
# Load the raw CSV inputs
train = pd.read_csv("train2.0.csv")
test_x = pd.read_csv("test2.0.csv")
data = pd.read_csv("names.csv")

# Index the training frame by its id column
train = train.set_index("id")

# Split the training frame into target (y) and features (x)
train_y = train["status_group"].to_frame()
train_x = train.drop(columns=["status_group"])

# Keep the test ids aside, then remove them from the test features
test_id = test_x["id"].to_frame()
test_x = test_x.drop(columns=["id"])

In [227]:
# Impute missing values in both train and test features with a sentinel value
# (the original author notes this is needed for the input format of CatBoost).
# Reassigning instead of using inplace=True keeps each statement a plain
# "new frame from old frame" step.
train_x = train_x.fillna(-999)
test_x = test_x.fillna(-999)

# Encode the target: status_group label -> integer code 1, 2, 3
replace_map = {"non functional": 3,
               "functional needs repair": 2,
               "functional": 1}

# .map() (rather than .replace()) yields NaN for any label missing from the
# mapping, so unexpected labels are reported here instead of silently staying
# in the column as strings. It also avoids the implicit object -> int
# downcasting that .replace() relied on.
status_codes = train_y['status_group'].map(replace_map)
unmapped_labels = train_y.loc[status_codes.isna(), 'status_group'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unmapped status_group labels: {}".format(list(unmapped_labels))
    )
train_y['status_group code'] = status_codes.astype("int64")

# Cast amount_tsh to int in both feature frames
train_x = train_x.astype({"amount_tsh": int})
test_x = test_x.astype({"amount_tsh": int})

## Grid Search
This section of the code is commented out because it takes a long time to run grid search on the amount of data used for the CatBoost model.   

Once grid search had identified the best parameters, they were entered manually into the final model below.


In [232]:
# # transform the dataset
# y = train_y['status_group code']

# #split data just for testing 
# X_train, X_test, Y_train, Y_test = train_test_split(train_x, y, train_size=0.6, random_state=42)
# #Identify cat features for model
# categorical_features_indices = np.where(train_x.dtypes != float)[0]  # np.float was removed in NumPy 1.24

In [235]:
# #grid search parameters for catboost

# params = {'iterations': [1000],
#           'learning_rate': [0.01,0.05,.1],
#           'depth': [4,6,10],
#           'loss_function': ['MultiClass'],
#           'logging_level':['Silent'],
#           'random_seed': [42],
#           'eval_metric' :['AUC']}
# clf = CatBoostClassifier()

# params = {'depth':[10],
#           'iterations':[1000],
#           'learning_rate':[0.01,0.05,.1],
#             'logging_level':['Silent'],
#           'l2_leaf_reg':[10,50],
#           'eval_metric' :['AUC']}


# scorer = make_scorer(accuracy_score)
# clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring=scorer, cv=10, n_jobs = -1)

# clf_grid.fit(X_train, Y_train, cat_features=categorical_features_indices)
# best_param = clf_grid.best_params_

# # save best params to csv
# pd.DataFrame.from_dict(best_param, orient='index').to_csv('best_param_catboost.csv')
# best_param

In [None]:
#testing gridsearched parameters 
# # test model 
# model_test = CatBoostClassifier(depth = 10, 
#                                iterations = 1000,
#                                learning_rate = .1,
#                                l2_leaf_reg = 2,
#                                leaf_estimation_iterations = 10,
#                                loss_function = 'MultiClass',
#                                random_seed = 42,
#                                logging_level = 'Silent')
                   
# model_test.fit(X_train, Y_train ,cat_features=categorical_features_indices)
# preds_class_full = model_test.predict(X_test)
# accuracy_score(preds_class_full,Y_test)
# array = confusion_matrix(preds_class_full,Y_test)
# np.asmatrix(array)/(sum(sum(array)))

In [None]:
## calculating feature importance to consider for feature tuning
# importance = pd.DataFrame({'feature_importance': model_test.get_feature_importance(), 
#               'feature_names': train_x.columns}).sort_values(by=['feature_importance'], 
#                                                            ascending=False)

# Final Model

In [237]:
# Full model based on the grid search parameters.
# The model is fitted N_MODELS times on different random seeds so that the
# per-model predictions can be ensembled afterwards.

import random

N_MODELS = 9          # number of differently-seeded models in the ensemble
ENSEMBLE_SEED = 42    # seeds the generator that draws each model's random_seed

# Every column whose dtype is not float64 is passed to CatBoost as categorical.
# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24, so the builtin is used directly (same comparison result).
categorical_features_indices = np.where(train_x.dtypes != float)[0]

# Work on a copy: assigning `test_id` directly would alias it, so the loop
# below would add the "Model N" columns to `test_id` itself.
predictions = test_id.copy()

# A dedicated seeded generator gives each model a different seed while keeping
# the whole ensemble reproducible after a kernel restart.
seed_rng = random.Random(ENSEMBLE_SEED)

for i in range(N_MODELS):
    randomseedx = seed_rng.randint(1, 10000)

    model_full = CatBoostClassifier(depth=10,
                                    iterations=1000,
                                    loss_function='MultiClass',
                                    random_seed=randomseedx,
                                    logging_level='Silent',
                                    l2_leaf_reg=10,
                                    learning_rate=.05)

    model_full.fit(train_x,
                   train_y['status_group code'],
                   cat_features=categorical_features_indices)

    # Store this model's predictions as a flat array so the column is filled
    # positionally instead of depending on DataFrame index alignment.
    names = 'Model ' + str(i + 1)
    predictions[names] = np.asarray(model_full.predict(test_x)).ravel()
    print(names + " Has completed")


Model 1 Has completed
Model 2 Has completed
Model 3 Has completed
Model 4 Has completed
Model 5 Has completed
Model 6 Has completed
Model 7 Has completed
Model 8 Has completed
Model 9 Has completed


Unnamed: 0,id,Model 1,Model 2,Model 3,Model 4,Model 5,Model 6,Model 7,Model 8,Model 9
0,50785,1,1,1,1,1,1,1,1,1
1,51630,1,1,1,1,1,1,1,1,1
2,17168,1,1,1,1,1,1,1,1,1
3,45559,3,3,3,3,3,3,3,3,3
4,49871,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
14845,39307,3,3,3,3,3,3,3,3,3
14846,18990,1,1,1,1,1,1,1,1,1
14847,28749,1,1,1,1,1,1,1,1,1
14848,33492,1,1,1,1,1,1,1,1,1


In [242]:
# Ensemble and format the predictions
idval = predictions['id']

# Save the raw per-model predictions to a csv file
predictions.to_csv("catboost_all_9_models_predictions.csv", index=False)

# Ensemble by majority vote: take the row-wise mode of the model columns.
# DataFrame.mode(axis=1) returns one column per tied modal value, so its width
# depends on the data (1 column when no row has a tie, up to 3 here).
# Column 0 always exists and holds the smallest modal value of each row, so it
# is selected explicitly instead of assuming a fixed number of columns.
model_votes = predictions.drop(columns=['id'])
majority_vote = model_votes.mode(axis=1)[0].astype(int)

# Format for output
submission = pd.DataFrame({'id': idval, 'status_group': majority_vote})

# Map the integer codes back to the labels of the contest submission format
replace_map2 = {3: "non functional",
                2: "functional needs repair",
                1: "functional"}
submission['status_group'] = submission['status_group'].replace(replace_map2)

# Export the submission to csv
submission.to_csv("submision_catboost.csv", index=False)


In [243]:
# View the final submission frame (id plus the mapped status_group label).
# As the last expression in the cell, it is rendered as the cell output.
submission

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
...,...,...
14845,39307,non functional
14846,18990,functional
14847,28749,functional
14848,33492,functional
