In [264]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


In [265]:
#load the datasets
dataset_values = pd.read_csv('data/training_set_values.csv')
dataset_labels = pd.read_csv('data/training_set_labels.csv')

In [266]:
#then combine the values and labels on 'id'
dataset = dataset_values.merge(dataset_labels, on='id')

# Preprocess 'date_recorded' into more usable features
# dataset['date_recorded'] = pd.to_datetime(dataset['date_recorded'])
# dataset['year_recorded'] = dataset['date_recorded'].dt.year
# dataset['month_recorded'] = dataset['date_recorded'].dt.month
# dataset['day_recorded'] = dataset['date_recorded'].dt.day

In [267]:

pd.set_option('display.max_columns', None)

#grabbing categorical and numerical features (automated)
categorical_features = dataset.select_dtypes(include=['object', 'bool']).columns
numerical_features = dataset.select_dtypes(exclude=['object', 'bool', 'datetime64[ns]']).columns


#manual fixes
# numerical_features.drop('construction_year')
# categorical_features.append('construction_year')

datetime_features = ['data_recorded'] #datetime feature


print("Current columns in DataFrame:", dataset.columns.tolist())
print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

Current columns in DataFrame: ['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude', 'latitude', 'wpt_name', 'num_private', 'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'status_group']
Categorical features: Index(['date_recorded', 'funder', 'installer', 'wpt_name', 'basin',
       'subvillage', 'region', 'lga', 'ward', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'pay

In [268]:
#preprocessign steps for numerical features using pipeline, imputer (missing values) then scaler
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#handling datetime features within pipeline
# def extract_datetime_features(df):
#     df['date_recorded'] = pd.to_datetime(df['date_recorded'])
#     df['year_recorded'] = df['date_recorded'].dt.year
#     df['month_recorded'] = df['date_recorded'].dt.month
#     df['day_recorded'] = df['date_recorded'].dt.day
#     return df.drop('date_recorded', axis=1)


In [269]:
# dataset = extract_datetime_features(dataset)

# datetime_features.remove('date_recorded')
# datetime_features.extend(['year_recorded', 'month_recorded', 'day_recorded'])

#now finally, combine it all
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# choice of model

model = RandomForestClassifier()

# create final pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

In [270]:
X = dataset.drop(['id','status_group'], axis=1)
y = dataset['status_group']

kf = KFold(n_splits=5, shuffle=True)

scores = cross_val_score(pipeline, X, y, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {scores}')
print(f'Average accuracy: {np.mean(scores)}')

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dn57/.local/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'id'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/utils/__init__.py", line 505, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "/home/dn57/.local/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'id'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/dn57/.local/lib/python3.9/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 906, in fit_transform
    self._validate_column_callables(X)
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 496, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "/home/dn57/.local/lib/python3.9/site-packages/sklearn/utils/__init__.py", line 513, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
