# Table of Contents
- Imports
- Functions
- Data Understanding
- Data Cleaning
- Exploratory Data Analysis
- Model



## Imports

In [190]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
from sklearn.inspection import permutation_importance


## Functions

In [191]:
def grab_numeric(df):
    return df.select_dtypes(include=['float', 'int'])

## Data Understanding

In [192]:
# loading up our data
X_train = pd.read_csv('../data/well_features.csv')
y_train = pd.read_csv('../data/well_labels.csv')
X_test = pd.read_csv('../data/well_test_labels.csv')

In [193]:
# Checking what columns/features we are working with
X_train.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [194]:
# Checking our target
y_train.columns

Index(['id', 'status_group'], dtype='object')

In [195]:
# Taking a look at what type of input is in each column
X_train.head(5)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [196]:
# Looking at the data types of each feature
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [197]:
# Exploring our target
y_train.head(5)

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [198]:
y_train['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

**Notes:**

Taking a quick peek at the data, we have a good amount of features to work with. Some features look to be generalized in another feature. An example would be 'quality' and 'quality_group'. There seems a good amount of features that do not provide value. Looking at our y_train, each well is classified as 'functional', 'functional needs repair', or 'non functional'.

## Data Cleaning

In [199]:
# Creating a list of columns/features to drop
dropped_list=['id','funder','wpt_name','num_private','basin','subvillage','region_code',
                'district_code','lga','ward','recorded_by','scheme_name','extraction_type_group',
                'extraction_type_class','management_group','payment_type','quality_group','quantity_group',
                'source_type','source_class','waterpoint_type_group']

In [200]:
# Setting our index to our id feature
X_train = X_train.set_index(X_train['id'])
y_train = y_train.set_index(y_train['id'])
X_test = X_test.set_index(X_test['id'])

# Dropping features in our dropped_list
X_train = X_train.drop(dropped_list,axis=1)
X_test = X_test.drop(dropped_list,axis=1)

# Dropping the extra id feature in y_train
y_train = y_train.drop('id',axis=1)

In [201]:
# Altering our date_recorded feature into a year format (ex. 2009)
X_test['date_recorded'] = X_test['date_recorded'].map(lambda x: x[:4]).astype('int64')
X_train['date_recorded'] = X_train['date_recorded'].map(lambda x: x[:4]).astype('int64')


In [202]:
# Creating a complete dataframe using train data
df_train = X_train.copy()
df_train = df_train.join(y_train, how='outer')

## Exploratory Data Analysis

In [203]:
fig = px.histogram(y_train, x='status_group', color='status_group')
fig.show()

In [204]:
fig = px.histogram(df_train, x='scheme_management', color='status_group')
fig.show()

In [205]:
fig = px.histogram(df_train, x='water_quality', color='status_group')
fig.show()

In [206]:
fig = px.histogram(df_train, x='quantity', color='status_group')
fig.show()

## Model Building

In [207]:
# The FunctionTransformer will turn my function
# into a transformer.

GrabNumeric = FunctionTransformer(grab_numeric)

In [208]:
pipe = Pipeline(steps=[('num', GrabNumeric),
                       ('ss', StandardScaler())])

In [209]:
pipe.fit(X_train)

Pipeline(steps=[('num',
                 FunctionTransformer(func=<function grab_numeric at 0x000002AA13E659D0>)),
                ('ss', StandardScaler())])

In [210]:
pipe.transform(X_train)

array([[ 1.89566509,  0.13105173, -1.40879062],
       [-0.10597003,  0.09460999,  1.20793411],
       [-0.09762988,  0.51515818,  0.63975146],
       ...,
       [-0.10597003, -0.00918778, -1.03340379],
       [-0.10597003,  0.2716288 , -0.22828986],
       [-0.10597003,  0.61312481, -0.35350768]])

In [211]:
subpipe_num = Pipeline(steps=[('num_impute', SimpleImputer()),
                           ('ss', StandardScaler())])
subpipe_cat = Pipeline(steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
                             ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

In [212]:
CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, [0,1,2,4,5,7,11]),
                                         ('subpipe_cat', subpipe_cat, [3,6,8,9,10,12,13,14,15,16,17,18])],
                           remainder='passthrough')

In [213]:
CT2 = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, [0,1,2,3,4,6,10]),
                                         ('subpipe_cat', subpipe_cat, [5,7,8,9,11,12,13,14,15,16,17])],
                           remainder='passthrough')

### Model 1 - KNN

In [214]:
knn_model_pipe = Pipeline(steps=[('ct', CT),
                            ('knn', KNeighborsClassifier())])

In [215]:
knn_model_pipe.fit(X_train,y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [0, 1, 2, 4, 5, 7, 11]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
               

In [216]:
knn_param_grid = {'knn__n_neighbors':[3,6,9]}
knn_param_grid

{'knn__n_neighbors': [3, 6, 9]}

In [217]:
# defining parameter range
grid = GridSearchCV(knn_model_pipe, knn_param_grid, cv=3, scoring='accuracy', return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search = grid.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



KeyboardInterrupt: 

In [None]:
grid_search.best_params_
# 0.77765993

{'knn__n_neighbors': 3}

In [None]:
grid_search.cv_results_['mean_test_score']

array([0.77765993, 0.76919192, 0.76434343, 0.75961279])

In [26]:
param_grid = {'knn__n_neighbors':[1,2,4,5,6,7,8,9,11,12,13,14]}

# defining parameter range
grid = GridSearchCV(knn_model_pipe, param_grid, cv=5, scoring='accuracy', return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search = grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y,

In [27]:
grid_search.best_params_
# 0.77819865

{'knn__n_neighbors': 6}

In [28]:
grid_search.cv_results_['mean_test_score']

array([0.75436027, 0.76582492, 0.77750842, 0.77789562, 0.77819865,
       0.77740741, 0.77560606, 0.77552189, 0.77281145, 0.7726431 ,
       0.77173401, 0.77040404])

In [29]:
param_grid = {'knn__n_neighbors':[10,16,17,18]}

# defining parameter range
grid = GridSearchCV(knn_model_pipe, param_grid, cv=5, scoring='accuracy', return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search = grid.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y,

In [30]:
grid_search.best_params_
# 0.77422559

{'knn__n_neighbors': 10}

In [31]:
grid_search.cv_results_['mean_test_score']

array([0.77422559, 0.76861953, 0.76759259, 0.76691919])

# FINISH LINE FOR KNN

In [40]:
dt_model_pipe = Pipeline([
         ('ct', CT),
         ('rf_clf', DecisionTreeClassifier(random_state=2))
 ])

In [41]:
param_grid = {'rf_clf__max_depth':[5,10]}

# defining parameter range
grid = GridSearchCV(dt_model_pipe, param_grid, cv=5, scoring='accuracy', return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search = grid.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  2.8min finished


In [42]:
grid_search.best_params_

{'rf_clf__max_depth': 10}

In [43]:
grid_search.cv_results_['mean_test_score']

array([0.71340067, 0.74328283])

In [47]:
bag_model_pipe = Pipeline([
         ('ct', CT),
         ('rf_clf', BaggingClassifier(random_state=2))
 ])

In [51]:
param_grid = {'rf_clf__n_estimators':[100]}

# defining parameter range
grid = GridSearchCV(bag_model_pipe, param_grid, cv=5, scoring='accuracy', return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search = grid.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 53.6min finished
  return f(**kwargs)


In [52]:
grid_search.best_params_
# 0.80208754

{'rf_clf__n_estimators': 100}

In [53]:
grid_search.cv_results_['mean_test_score']

array([0.80208754])

### bag_model2_pipe

In [60]:
bag_model2_pipe = Pipeline([
         ('ct', CT),
         ('rf_clf', BaggingClassifier(random_state=2))
]

In [57]:
param_grid2 = {'rf_clf__n_estimators':[100]}

# defining parameter range
grid2 = GridSearchCV(bag_model2_pipe, param_grid2, cv=5, return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search2 = grid2.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 53.9min finished
  return f(**kwargs)


In [58]:
grid_search2.best_params_

{'rf_clf__n_estimators': 100}

In [59]:
grid_search2.cv_results_['mean_test_score']

array([0.80208754])

### bag_model3_pipe

In [62]:
bag_model3_pipe = Pipeline([
         ('ct', CT),
         ('rf_clf', BaggingClassifier(random_state=2))
])

In [64]:
param_grid3 = {'rf_clf__n_estimators':[200]}

# defining parameter range
grid3 = GridSearchCV(bag_model3_pipe, param_grid3, cv=5, return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search3 = grid3.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 116.5min finished
  return f(**kwargs)


In [65]:
grid_search3.best_params_

{'rf_clf__n_estimators': 200}

In [66]:
grid_search3.cv_results_['mean_test_score']

array([0.80259259])

### bag_model4_pipe

In [67]:
bag_model4_pipe = Pipeline([
         ('ct', CT),
         ('rf_clf', BaggingClassifier(random_state=2))
])

In [68]:
param_grid4 = {'rf_clf__n_estimators':[500]}

# defining parameter range
grid4 = GridSearchCV(bag_model4_pipe, param_grid4, cv=5, return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search4 = grid4.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 254.2min finished
  return f(**kwargs)


In [69]:
grid_search4.best_params_

{'rf_clf__n_estimators': 500}

In [70]:
grid_search4.cv_results_['mean_test_score']

array([0.80279461])

In [15]:
bag_model5_pipe = Pipeline([
         ('ct', CT),
         ('rf_clf', BaggingClassifier(n_estimators= 200, random_state=2))
])

In [26]:
bag_model5_pipe.fit(X_train, y_train)

  return f(**kwargs)


Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [0, 1, 2, 4, 5, 7, 11]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
               

In [27]:
y_hat = bag_model5_pipe.predict(X_test)
y_hat

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [20]:
test = pd.read_csv('../data/well_test_labels.csv')

id_ = test['id']
installer_ = test['installer']

In [29]:
results = pd.DataFrame({'id':id_,'status_group': y_hat})

In [30]:
results

Unnamed: 0,id,predictions
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
...,...,...
14845,39307,non functional
14846,18990,functional
14847,28749,functional
14848,33492,functional


In [31]:
results.to_csv('../data/results.csv')

In [28]:
X_train_tf = X_train

X_


NameError: name 'X_' is not defined

In [55]:
df_tf = df_train
df_tf['status_group'] = df_tf['status_group'].map(lambda x: 1 if (x=='functional needs repair') else 0)

X_tf = df_tf.drop('status_group', axis=1)
y_tf = df_tf['status_group']

X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(X_tf, y_tf, random_state=2)

# Run this once or it will go bust

In [56]:
pipe_tf = Pipeline(steps=[('ct', CT),
                            ('rfc', DecisionTreeClassifier(random_state=2))])



In [57]:
pipe_tf_cv_scores = cross_val_score(pipe_tf, X_train_tf, y_train_tf, cv=2, verbose=1)
pipe_tf_cv_scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   40.9s finished


array([0.904422  , 0.90630752])

In [58]:
pipe2_tf = Pipeline(steps=[('ct', CT),
                            ('rfc', DecisionTreeClassifier(max_depth= 20, random_state=2))])

In [59]:
pipe2_tf_cv_scores = cross_val_score(pipe2_tf, X_train_tf, y_train_tf, cv=2, verbose=1)
pipe2_tf_cv_scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   25.7s finished


array([0.92107744, 0.92040404])

In [60]:
y_train_tf.value_counts()

0    41323
1     3227
Name: status_group, dtype: int64

In [None]:
r = permutation_importance(bag_model5_pipe, X_train.drop('installer', axis=1), y_train, n_repeats=1)

In [None]:
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{df_train.feature_names[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

In [170]:
CT2 = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, [0,1,2,3,4,6,10]),
                                         ('subpipe_cat', subpipe_cat, [5,7,8,9,11,12,13,14,15,16,17])],
                           remainder='passthrough')

In [70]:
X_train_new = X_train.drop('installer', axis=1)
X_train_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         59400 non-null  float64
 1   date_recorded      59400 non-null  int64  
 2   gps_height         59400 non-null  int64  
 3   longitude          59400 non-null  float64
 4   latitude           59400 non-null  float64
 5   region             59400 non-null  object 
 6   population         59400 non-null  int64  
 7   public_meeting     56066 non-null  object 
 8   scheme_management  55523 non-null  object 
 9   permit             56344 non-null  object 
 10  construction_year  59400 non-null  int64  
 11  extraction_type    59400 non-null  object 
 12  management         59400 non-null  object 
 13  payment            59400 non-null  object 
 14  water_quality      59400 non-null  object 
 15  quantity           59400 non-null  object 
 16  source            

In [73]:
bag_model6_pipe = Pipeline([
         ('ct', CT2),
         ('rf_clf', BaggingClassifier(n_estimators= 200, random_state=2))
])

In [74]:
bag_model6_pipe.fit(X_train_new,y_train)

  return f(**kwargs)


Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [0, 1, 2, 3, 4, 6, 10]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
               

In [75]:
bag_model6_pipe.score(X_train_new, y_train)

0.9954545454545455

In [77]:
df_ct = CT2.fit_transform(X_train_new)

In [80]:
new_model_score = cross_val_score(bag_model6_pipe, X_train_new, y_train, cv=5)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


In [81]:
new_model_score

array([0.80799663, 0.80260943, 0.8020202 , 0.7989899 , 0.79957912])

In [82]:
r = permutation_importance(bag_model6_pipe, X_train_new, y_train, n_repeats=1)

In [83]:
df_new = X_train_new.join(y_train, how='outer')

In [91]:
feature_list = list(df_new.columns)

feature_list[0]

'amount_tsh'

In [98]:
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{df_new.columns[i]:<8}"
               f" {r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

longitude 0.186 +/- 0.000
quantity 0.176 +/- 0.000
latitude 0.145 +/- 0.000
waterpoint_type 0.141 +/- 0.000
amount_tsh 0.105 +/- 0.000
construction_year 0.086 +/- 0.000
extraction_type 0.055 +/- 0.000
population 0.048 +/- 0.000
management 0.044 +/- 0.000
gps_height 0.044 +/- 0.000
region   0.034 +/- 0.000
payment  0.033 +/- 0.000
source   0.033 +/- 0.000
water_quality 0.008 +/- 0.000
scheme_management 0.008 +/- 0.000
public_meeting 0.008 +/- 0.000
date_recorded 0.005 +/- 0.000
permit   0.005 +/- 0.000


In [96]:
df_new['waterpoint_type']

id
69572             communal standpipe
8776              communal standpipe
34310    communal standpipe multiple
67743    communal standpipe multiple
19728             communal standpipe
                    ...             
60739             communal standpipe
27263             communal standpipe
37057                      hand pump
31282                      hand pump
26348                      hand pump
Name: waterpoint_type, Length: 59400, dtype: object

In [None]:
new_model_score.mean()

In [160]:
testing_y = y_train.to_numpy()
testing_y

array([['functional'],
       ['functional'],
       ['functional'],
       ...,
       ['functional'],
       ['functional'],
       ['functional']], dtype=object)

In [161]:
bag_model7_pipe = Pipeline([
         ('ct', CT2),
         ('rf_clf', BaggingClassifier(n_estimators= 200, random_state=2))
])

In [165]:
bag_model7_pipe.fit(X_train_new, y_train.values.ravel())

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [0, 1, 2, 3, 4, 6, 10]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
               

In [166]:
testing2_y = y_train.values.ravel()
testing2_y

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'functional'], dtype=object)

In [167]:
new_model_score = cross_val_score(bag_model7_pipe, X_train_new, testing2_y, cv=5)

In [168]:
new_model_score

array([0.80799663, 0.80260943, 0.8020202 , 0.7989899 , 0.79957912])

In [169]:
testing2_y

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'functional'], dtype=object)

In [186]:
knn_model_pipe2 = Pipeline(steps=[('ct', CT),
                            ('knn', KNeighborsClassifier(n_neighbors=6))])

In [187]:
knn_model_pipe2.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [0, 1, 2, 4, 5, 7, 11]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
               

In [188]:
new_model_score = cross_val_score(knn_model_pipe2, X_train, y_train, cv=5)
new_model_score


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



array([0.78207071, 0.7736532 , 0.77617845, 0.77685185, 0.78223906])

In [183]:
new_model_score.mean()
# 0.7743939393939394

0.7781986531986532