### Import required libraries for model implementation

In [22]:
#Basics
import pandas as pd
import numpy as np


#Train Test Split
from sklearn.model_selection import train_test_split

# Imputer
from sklearn.impute import SimpleImputer

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')


# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder, 
    StandardScaler, 
    FunctionTransformer)

#Pipeline
from sklearn.pipeline import Pipeline

#Grid Search
from sklearn.model_selection import GridSearchCV

# Classifiers
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

# Model evaluation
from sklearn.metrics import plot_confusion_matrix

#### Import Data
Import the data files from the local machine into Google Colab, keeping the `id` index column, because the submission format for the predicted values requires it.

In [23]:
from google.colab import files
import io

In [24]:
# Upload the training features CSV through Colab's file picker. Later cells
# look the file up by name in `feature` and wrap the value in io.BytesIO.
feature = files.upload()

Saving training_features.csv to training_features (1).csv


In [None]:
# Upload the training labels CSV; read below as labels['training_labels.csv'].
labels=files.upload()

Saving training_labels.csv to training_labels.csv


In [None]:
# Parse the uploaded CSV bytes; `id` becomes the index of both frames so that
# features and labels can be aligned on it.
features_set = pd.read_csv(io.BytesIO(feature['training_features.csv']), index_col='id')
targets = pd.read_csv(io.BytesIO(labels['training_labels.csv']), index_col='id')

# Left-join labels onto features by index. The original referenced an undefined
# name (`features`); the frame read above is `features_set`.
dataset = features_set.join(targets, how='left')

# Split the joined frame into the model inputs and the target column.
X_Set = dataset.drop('status_group', axis=1)
y_Set = dataset['status_group']

### For dropping the columns or features from the training data for model improvement

In [None]:
# Names of feature columns to exclude from modelling; empty means keep all.
# Consumed by columns_classifier() and by the preprocessor cell.
dropcolumns = []

#### Make Test Train Split
Split the data set for model training, and set a default random state so that the results are reproducible.

In [None]:
# Seed passed to train_test_split so the split is repeatable.
random_state = 42

In [None]:
# Hold out 10% of the rows. `training_X` is what columns_classifier() inspects
# to decide which columns are categorical and which are numerical.
training_X, testing_X, y_train, y_test = train_test_split(
    X_Set, y_Set, random_state=random_state, test_size=0.1
)

#### Data Validation 
The `permit` and `public_meeting` columns contain mixed data types, so the categorical variables are converted to strings to eliminate the resulting errors.

In [None]:
def categorical_string_convert(data):
    """Cast every non-missing value to ``str`` while keeping missing values missing.

    A plain ``astype(str)`` turns NaN/None into the literal strings ``'nan'`` /
    ``'None'``, which hides them from the imputer that follows this step in the
    categorical pipeline. Here the stringified frame is masked so that cells
    that were missing in the input stay NaN.

    Parameters
    ----------
    data : array-like or DataFrame
        Anything accepted by ``pd.DataFrame``.

    Returns
    -------
    pd.DataFrame
        Same shape as the input; non-missing cells are strings, missing cells
        are NaN.
    """
    frame = pd.DataFrame(data)
    # where() keeps values where the mask is True and puts NaN elsewhere.
    return frame.astype(str).where(frame.notna())

# Wrap the converter so it can be used as a step inside a scikit-learn Pipeline.
Categorical_Converter = FunctionTransformer(categorical_string_convert)

#### Classify Variables




In [None]:
def columns_classifier(dataset, dropcolumns):
    """Partition the kept columns of ``dataset`` by dtype.

    Columns listed in ``dropcolumns`` are skipped. Of the rest, those whose
    dtype equals ``object`` are reported as categorical and all others as
    numerical. Column order follows ``dataset.columns``.

    Returns
    -------
    (list, list)
        ``(category_column, numerical_column)``.
    """
    retained = [name for name in dataset.columns if name not in dropcolumns]
    # Evaluate the dtype check once per column, then split on the result.
    object_typed = [dataset[name].dtype == object for name in retained]
    category_column = [name for name, flag in zip(retained, object_typed) if flag]
    numerical_column = [name for name, flag in zip(retained, object_typed) if not flag]
    return category_column, numerical_column

In [None]:
# Derive the categorical / numerical column lists from the training split,
# skipping anything named in `dropcolumns`.
category_column, numerical_column = columns_classifier(training_X, dropcolumns)

### preprocessor building.
Pipeline step for preprocessing that handles all of the data.

#### Preprocessor pipeline for all categorical data

1. To avoid data type errors, first convert all categorical columns to strings.
2. Fill in the missing values.
3. One-hot encode all categorical values. The testing data may contain values that were not encountered in the training set; the encoder is configured to ignore such unknown values.

In [None]:
# Categorical branch: string conversion -> constant fill -> one-hot encoding.
pipeline_categorical = Pipeline(steps=[
    ('Convertingtype', Categorical_Converter),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # This step is named 'standardizer' but holds the one-hot encoder.
    ('standardizer', OneHotEncoder(handle_unknown='ignore', dtype=float)),
])

#### Preprocessing pipeline for numerical value
1. Impute missing values in the numerical columns using each column's median.
2. Scale each variable to have a mean of zero and a standard deviation of one.

In [None]:
# Numerical branch: median imputation followed by standard scaling.
pipeline_numerical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('standardizer', StandardScaler()),
])

#### Pipeline for Preprocessing 
 

The preprocessor performs the following steps:
1. Apply the categorical pipeline.
2. Apply the numerical pipeline.
3. Drop the specified columns.

In [None]:
# Route each column group to its pipeline. The explicit 'drop' transformer is
# only added when `dropcolumns` is non-empty, as in the two original branches.
preprocessor = ColumnTransformer(
    transformers=[
        ('numericalPreprocessor', pipeline_numerical, numerical_column),
        ('categoricalPreprocessor', pipeline_categorical, category_column),
    ] + (
        [('dropPreprocessor', 'drop', dropcolumns)]
        if len(dropcolumns) > 0
        else []
    )
)

### building the model pipeline.

First, apply the preprocessing steps to the raw data.
Then fit an eXtreme Gradient Boosting (XGBoost) model to the preprocessed data.

In [None]:
# Full model pipeline: preprocessing followed by a one-vs-rest classifier.
# 'passthrough' is only a placeholder estimator here; the parameter grid
# supplies the actual one under 'classifier__estimator'.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(estimator='passthrough')),
])

### Building Parameter Grid
Define a grid of hyper-parameters for the pipeline which will be tested in the grid search.

In [None]:
# One grid: XGBClassifier as the wrapped estimator, searched over
# 4 tree depths x 4 ensemble sizes (16 candidates).
grid_parameter = [{
    'classifier__estimator': [XGBClassifier()],
    'classifier__estimator__max_depth': [5, 10, 15, 20],
    'classifier__estimator__n_estimators': [100, 150, 200, 250],
}]

### Grid Search Initialization

In [None]:
# 5-fold cross-validated search scored on accuracy; refit=True so that
# `best_estimator_` is available after fitting.
grid_search = GridSearchCV(
    estimator=pipeline, param_grid=grid_parameter,
    scoring='accuracy', cv=5, refit=True,
    n_jobs=-2, verbose=2
)

### Fit Grid Search
Fit the grid search on the full labelled data set (`X_Set`, `y_Set`) to select the best model hyper-parameters.

In [None]:
# Run the search on the complete X_Set / y_Set (not the training_X / y_train
# split made earlier), then keep the refitted best pipeline as `model`.
grid_search.fit(X_Set, y_Set)
model = grid_search.best_estimator_

### Display Results of Grid Search

Display the grid search results after model training.

In [None]:
# Tabulate the per-candidate cross-validation results for inspection and plotting.
grid_search_results = pd.DataFrame(grid_search.cv_results_)

In [None]:
# Display the grid-search results table (one row per parameter combination).
grid_search_results

#### Plotting the Model Accuracy 

In [None]:
# Mean cross-validated accuracy against tree depth, one line per ensemble size.
fig, ax = plt.subplots()
fig.set_figheight(6)
fig.set_figwidth(8)
hue_column = 'param_classifier__estimator__n_estimators'
sns.lineplot(
    x='param_classifier__estimator__max_depth',
    y='mean_test_score',
    hue=hue_column,
    data=grid_search_results,
    ax=ax
)
# Use dedicated names here: unpacking into `labels` would overwrite the
# uploaded labels dict that the data-loading cell reads from.
legend_handles, legend_labels = ax.get_legend_handles_labels()
# Some seaborn versions prepend a pseudo-entry carrying the hue variable name.
# Strip it only when it is actually there, so no real series is dropped.
if legend_labels and legend_labels[0] == hue_column:
    legend_handles, legend_labels = legend_handles[1:], legend_labels[1:]
ax.legend(
    handles=legend_handles,
    labels=legend_labels,
    title="Number of Estimators"
);
ax.set_xlabel(
    'Max Depth'
);
ax.set_ylabel(
    'Mean Test Score'
);
ax.set_title(
    'XGBClassifier Model Accuracy'
);


#### Plotting Fit Time

In [None]:
# Mean fit time against tree depth, one line per ensemble size.
fig, ax = plt.subplots()
fig.set_figheight(6)
fig.set_figwidth(8)
hue_column = 'param_classifier__estimator__n_estimators'
sns.lineplot(
    x='param_classifier__estimator__max_depth',
    y='mean_fit_time',
    hue=hue_column,
    data=grid_search_results,
    ax=ax
)
# Use dedicated names here: unpacking into `labels` would overwrite the
# uploaded labels dict that the data-loading cell reads from.
legend_handles, legend_labels = ax.get_legend_handles_labels()
# Some seaborn versions prepend a pseudo-entry carrying the hue variable name.
# Strip it only when it is actually there, so no real series is dropped.
if legend_labels and legend_labels[0] == hue_column:
    legend_handles, legend_labels = legend_handles[1:], legend_labels[1:]
ax.legend(
    handles=legend_handles,
    labels=legend_labels,
    title="Number of Estimators"
);
ax.set_xlabel(
    'Max Depth'
);
ax.set_ylabel(
    'Mean Fit Time (sec)'
);
ax.set_title(
    'XGBClassifier Model Fit Time'
);


### Predict on Validation Data


In [None]:
# Upload the features file to predict on; the result is stored in `testing`.
# Presumably this is 'testing_features.csv'; confirm the uploaded file name.
testing = files.upload()

In [None]:
# Read the held-out features from the `testing` upload. The original looked the
# file up in `feature`, which holds the training upload and has no such key.
validate_X = pd.read_csv(io.BytesIO(testing['testing_features.csv']), index_col='id')

# Predict with the best pipeline found by the grid search.
validate_y = model.predict(validate_X)

predictions = pd.DataFrame(
    validate_y,
    index=validate_X.index,
    columns=['status_group']
)
# Keep the `id` index in the output file: the submission format needs it, and
# writing with index=False would leave only the `status_group` column.
predictions.to_csv('submission.csv')

In [None]:
# Send the generated submission file from the Colab runtime to the local machine.
files.download('submission.csv')