In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from matplotlib import pyplot

In [2]:
# Read the feature table and the label table; both are indexed by their `id` column
# so they can later be aligned row-for-row on that index.
train_dataset = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/TrainingData.csv',
    index_col='id',
)
train_lable = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/TrainingSetLable.csv',
    index_col='id',
)

In [3]:
# Plain-text dump of the label table (a single `status_group` column indexed by `id`).
print(train_lable)

         status_group
id                   
69572      functional
8776       functional
34310      functional
67743  non functional
19728      functional
...               ...
60739      functional
27263      functional
37057      functional
31282      functional
26348      functional

[59400 rows x 1 columns]


In [4]:
# Seed passed to train_test_split so the train/test partition is reproducible.
random_seed = 40
# Columns to exclude from modelling; consumed by classify_columns and by the
# ColumnTransformer construction. Empty means every column is used.
drop_cols = []

In [5]:
# Attach the label to each feature row by matching on the shared `id` index.
# A left join keeps every row of the feature table.
df = train_dataset.join(train_lable, how='left')

# Separate the joined frame into the feature matrix and the target column.
X = df.drop(columns='status_group')
y = df.loc[:, 'status_group']

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_seed,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first training labels. `head` must be *called*: the bare attribute
# only displays the bound-method repr, not the data.
y_train.head()

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (OneHotEncoder, StandardScaler, FunctionTransformer)

def convert_categorical_to_string(data):
    """Cast every value to ``str`` while keeping missing entries missing.

    Categorical columns can hold mixed Python types (e.g. booleans next to
    strings), which encoders reject; casting to ``str`` makes them uniform.

    A plain ``astype(str)`` would also turn ``NaN``/``None`` into the literal
    strings ``'nan'``/``'None'``, so a downstream imputer would never see a
    missing value. Missing positions are therefore restored after the cast.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Anything accepted by the ``pandas.DataFrame`` constructor.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``pd.DataFrame(data)``; non-missing cells are
        strings, missing cells remain NaN.
    """
    frame = pd.DataFrame(data)
    # `where` keeps the stringified value wherever the original was present and
    # substitutes NaN (its default `other`) wherever the original was missing.
    return frame.astype(str).where(frame.notna())

# Wrap the string-casting helper as a transformer so it can be used as a Pipeline step.
CategoricalTypeConverter = FunctionTransformer(func=convert_categorical_to_string)

In [9]:
def classify_columns(df, drop_cols):
    """Split the columns of ``df`` into categorical and numerical name lists.

    A column is treated as categorical when its dtype is ``object``, a pandas
    string dtype, or ``category``; every other dtype (ints, floats, bools, ...)
    is treated as numerical. Checking only for ``object`` would send
    ``category``/string-dtype columns to the numerical branch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.
    drop_cols : collection of column labels, or None
        Columns to leave out of both lists. ``None`` or empty excludes nothing.

    Returns
    -------
    (list, list)
        ``(cat_cols, num_cols)``, each in the original column order.
    """
    excluded = set(drop_cols) if drop_cols else set()
    cat_cols = []
    num_cols = []
    for col in df.columns:
        if col in excluded:
            continue
        dtype = df[col].dtype
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_categorical:
            cat_cols.append(col)
        else:
            num_cols.append(col)
    return cat_cols, num_cols

In [10]:
# Derive the categorical / numerical column lists from the training split,
# leaving out anything listed in drop_cols.
cat_cols, num_cols = classify_columns(X_train, drop_cols)

In [11]:
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix

# Categorical branch: cast values to strings, impute with the constant 'missing',
# then one-hot encode. Categories unseen during fit are ignored at transform time.
# The encoder step keeps the name 'standardizer' because step names are part of
# the pipeline's addressable parameter paths.
categorical_pipeline = Pipeline(steps=[
    ('typeConverter', CategoricalTypeConverter),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('standardizer', OneHotEncoder(handle_unknown='ignore', dtype=float)),
])

In [12]:
# Numerical branch: fill gaps with the column median, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('standardizer', StandardScaler()),
])

In [13]:
# Route numerical and categorical columns through their own pipelines.
# An explicit 'drop' transformer is appended only when drop_cols is non-empty.
preprocessor = ColumnTransformer(
    transformers=[
        ('numericalPreprocessor', numerical_pipeline, num_cols),
        ('categoricalPreprocessor', categorical_pipeline, cat_cols),
        *(
            [('dropPreprocessor', 'drop', drop_cols)]
            if len(drop_cols) > 0
            else []
        ),
    ]
)

In [14]:
# Full model: preprocessing followed by a one-vs-rest classifier.
# 'passthrough' is only a placeholder here; the actual base estimator is injected
# through the 'classifier__estimator' entry of the search grid.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(estimator='passthrough')),
])

In [15]:
# Search space: one grid, with XGBoost as the base estimator and a single
# candidate value for each of tree depth and number of trees.
parameter_grid = [{
    'classifier__estimator': [XGBClassifier()],
    'classifier__estimator__max_depth': [20],
    'classifier__estimator__n_estimators': [200],
}]

In [16]:
# 5-fold cross-validated search scored on accuracy. n_jobs=-2 uses all CPUs but one;
# refit=True retrains the best configuration on all data passed to fit().
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-2,
    refit=True,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the full labelled data (X, y) — note this is
# the whole joined frame, not the X_train split created earlier.
grid_search.fit( X, y)
# The search was built with refit=True, so best_estimator_ is the winning
# pipeline already refit on everything passed to fit() above.
model = grid_search.best_estimator_

In [18]:
# Tabulate the per-candidate cross-validation results for inspection and plotting.
grid_search_results = pd.DataFrame(grid_search.cv_results_)

In [None]:
# Display the cross-validation results table.
grid_search_results

In [None]:
# Mean cross-validated accuracy versus tree depth, one line per n_estimators value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(
    data=grid_search_results,
    x='param_classifier__estimator__max_depth',
    y='mean_test_score',
    hue='param_classifier__estimator__n_estimators',
    ax=ax,
)

# NOTE(review): the first handle/label pair is discarded, presumably to strip a
# legend-title entry emitted by seaborn — confirm against the installed seaborn
# version, otherwise a real series is being dropped from the legend.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[1:], labels=labels[1:], title="Number of Estimators")
ax.set(
    xlabel='Max Depth',
    ylabel='Mean Test Score',
    title='XGBClassifier Model Accuracy',
);

In [None]:
# Mean fit time versus tree depth, one line per n_estimators value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(
    data=grid_search_results,
    x='param_classifier__estimator__max_depth',
    y='mean_fit_time',
    hue='param_classifier__estimator__n_estimators',
    ax=ax,
)

# NOTE(review): the first handle/label pair is discarded, presumably to strip a
# legend-title entry emitted by seaborn — confirm against the installed seaborn
# version, otherwise a real series is being dropped from the legend.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[1:], labels=labels[1:], title="Number of Estimators")
ax.set(
    xlabel='Max Depth',
    ylabel='Mean Fit Time (sec)',
    title='XGBClassifier Model Fit Time',
);


In [22]:
# Score the unlabelled set with the current `model` and export the predictions.
X_validate = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Testdata.csv',
    index_col='id',
)
y_validate = model.predict(X_validate)

# Keep the original `id` index so each prediction stays matched to its input row.
df_predictions = pd.DataFrame(
    y_validate,
    index=X_validate.index,
    columns=['status_group'],
)
df_predictions.to_csv('/content/drive/MyDrive/Colab Notebooks/final_model.csv')

In [None]:
# Refit the selected pipeline on the training split only, so that X_test is
# unseen by the model evaluated afterwards. `model` is the same object as
# grid_search.best_estimator_, so this replaces its earlier fit on all of X, y.
model.fit(X_train,y_train)

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# `ConfusionMatrixDisplay.from_estimator` is its replacement and accepts the same
# arguments (requires scikit-learn >= 1.0).
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix on the held-out split; normalize='true' scales each row
# (true class) to sum to 1.
fig, ax = plt.subplots(figsize=(8, 8))
ConfusionMatrixDisplay.from_estimator(
    model,
    X_test,
    y_test,
    ax=ax,
    normalize='true',
    include_values=True,
)
ax.set_title('XGBClassifier Confusion Matrix (row-normalized)');