In [7]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, make_scorer

In [8]:
# Load the raw CSV inputs: training features, training labels, test features,
# and names.csv (its `name` column is used later to select feature columns).
train_x, trian_y, test_x, data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "TrainingSetValues.csv",
        "TrainingSetLabels.csv",
        "TestSetValues.csv",
        "names.csv",
    )
)

In [9]:
# Replace every missing value in the train and test features with the
# sentinel -999 (rebinding the names rather than mutating in place).
train_x = train_x.fillna(-999)
test_x = test_x.fillna(-999)

In [10]:
# Encode the target: map each status_group label to an integer class code (1, 2, 3).
replace_map = {"non functional": 3,
               "functional needs repair": 2,
               "functional": 1}

# Series.map produces the encoded column directly. Series.replace on an
# object column relies on implicit downcasting, which pandas has deprecated.
status_codes = trian_y['status_group'].map(replace_map)

# map() turns any label missing from replace_map into NaN, so fail loudly
# here instead of training on a partially encoded target.
if status_codes.isna().any():
    unknown = sorted(
        trian_y.loc[status_codes.isna(), 'status_group'].astype(str).unique()
    )
    raise ValueError(f"Unexpected status_group labels: {unknown}")

trian_y['status_group code'] = status_codes.astype(int)

In [11]:
# Keep only the feature columns listed in names.csv and store amount_tsh
# as an integer column.
names = data["name"].tolist()
train_x = train_x[names].astype({"amount_tsh": int})

# Show the resulting column dtypes as the cell output.
train_x.dtypes

amount_tsh                int64
funder                   object
gps_height                int64
installer                object
basin                    object
region_code               int64
district_code             int64
lga                      object
ward                     object
population                int64
public_meeting           object
scheme_management        object
scheme_name              object
permit                   object
construction_year         int64
extraction_type_class    object
management_group         object
payment                  object
quality_group            object
quantity                 object
source_type              object
source_class             object
waterpoint_type          object
dtype: object

In [12]:
# Keep the test ids (needed for the submission file) before the frame is
# reduced to the feature columns.
test_id = test_x['id'].to_frame()

# Apply the same column subset and amount_tsh integer cast used for the
# training features.
test_x = test_x[names].astype({"amount_tsh": int})



In [13]:
# Hold out 30% of the labelled data for a quick local evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_x,
    trian_y['status_group code'],
    train_size=0.7,
    random_state=1234,
)

# Positional indices of the columns handed to CatBoost as categorical:
# every column whose dtype is not float. `np.float` was only an alias of the
# builtin `float` and was removed in NumPy 1.24, so compare against `float`
# directly; the selected columns are unchanged.
# NOTE(review): this also marks the integer columns as categorical; confirm
# that is intended.
categorical_features_indices = np.where(train_x.dtypes != float)[0]

In [28]:
# Base estimator and the hyper-parameter grid explored by the search.
clf = CatBoostClassifier()
params = dict(
    iterations=[100],
    learning_rate=[0.01, .05, 0.1, .5, 1],
    depth=[4, 6, 10],
    loss_function=['MultiClass'],
    l2_leaf_reg=[2, 10, 20, 30, 50],
    leaf_estimation_iterations=[10],
    random_seed=[42],
)

# Candidates are ranked by accuracy under 5-fold cross-validation.
scorer = make_scorer(accuracy_score)
clf_grid = GridSearchCV(clf, params, scoring=scorer, cv=5)

In [29]:
# Run the grid search on the training split; the categorical column indices
# are forwarded to each underlying CatBoost fit.
fit_kwargs = {"cat_features": categorical_features_indices}
clf_grid.fit(X_train, Y_train, **fit_kwargs)

# Best parameter combination found, displayed as the cell output.
best_param = clf_grid.best_params_
best_param

0:	learn: 1.0895215	total: 100ms	remaining: 9.94s
1:	learn: 1.0807114	total: 180ms	remaining: 8.81s
2:	learn: 1.0721720	total: 284ms	remaining: 9.17s
3:	learn: 1.0636786	total: 373ms	remaining: 8.95s
4:	learn: 1.0554768	total: 468ms	remaining: 8.89s
5:	learn: 1.0469789	total: 615ms	remaining: 9.63s
6:	learn: 1.0391353	total: 743ms	remaining: 9.87s
7:	learn: 1.0315464	total: 888ms	remaining: 10.2s
8:	learn: 1.0241624	total: 1.01s	remaining: 10.2s
9:	learn: 1.0166501	total: 1.09s	remaining: 9.81s
10:	learn: 1.0094205	total: 1.17s	remaining: 9.45s
11:	learn: 1.0025386	total: 1.28s	remaining: 9.39s
12:	learn: 0.9958180	total: 1.41s	remaining: 9.4s
13:	learn: 0.9891875	total: 1.56s	remaining: 9.58s
14:	learn: 0.9828290	total: 1.69s	remaining: 9.55s
15:	learn: 0.9763476	total: 1.82s	remaining: 9.57s
16:	learn: 0.9700971	total: 1.95s	remaining: 9.52s
17:	learn: 0.9638024	total: 2.08s	remaining: 9.45s
18:	learn: 0.9578516	total: 2.21s	remaining: 9.43s
19:	learn: 0.9519787	total: 2.34s	remainin

KeyboardInterrupt: 

In [27]:
# Build a small CatBoost model (5 iterations) on the training split and
# measure its accuracy on the hold-out split.
model_test = CatBoostClassifier(depth=6,
                                iterations=5,
                                l2_leaf_reg=2,
                                leaf_estimation_iterations=10,
                                loss_function='MultiClass',
                                random_seed=42)

model_test.fit(X_train, Y_train, cat_features=categorical_features_indices)
preds_class_full = model_test.predict(X_test)

# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
# The accuracy value itself is the same in either order.
accuracy_score(Y_test, preds_class_full)



0:	learn: 1.0681769	total: 115ms	remaining: 459ms
1:	learn: 1.0417722	total: 236ms	remaining: 354ms
2:	learn: 1.0176472	total: 412ms	remaining: 275ms
3:	learn: 0.9942744	total: 521ms	remaining: 130ms
4:	learn: 0.9728714	total: 625ms	remaining: 0us


0.7153198653198654

In [23]:
# #full model 
# #importing library and building model

# model_full = CatBoostClassifier(iterations=1000,
#                            depth=10,
#                            loss_function='MultiClass')
                   
# model_full.fit(train_x, trian_y['status_group code'],cat_features=categorical_features_indices)
# preds_class_full = model_full.predict(test_x)


In [None]:
# Submission: predict the competition test set and write id/status_group pairs.
# The predictions must come from test_x. The hold-out predictions stored in
# preds_class_full belong to X_test rows and do not line up with test_id.
# NOTE(review): model_test is the only fitted model in this notebook (the
# full model cell is commented out); swap in a stronger model when available.
test_preds = np.asarray(model_test.predict(test_x)).ravel().astype(int)

# Guard against a silent id/prediction misalignment.
if len(test_preds) != len(test_id):
    raise ValueError(
        f"Got {len(test_preds)} predictions for {len(test_id)} test ids"
    )

submission = pd.DataFrame({
    'id': test_id['id'].to_numpy(),
    'status_group': test_preds,
})

# Translate the integer class codes back to the original label strings.
replace_map2 = {3: "non functional",
                2: "functional needs repair",
                1: "functional"}

submission['status_group'] = submission['status_group'].map(replace_map2)
submission.to_csv("submision.csv", index=False)