In [24]:
# general libraries
import pandas as pd
import numpy as np
import warnings
import time
import re
from tqdm import tqdm
from collections import Counter
import pickle
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import graphviz
from sklearn.tree import export_graphviz
from termcolor import colored
import plotly.graph_objects as go

# Outlier detection
# import imbalanced-learn
import scipy.stats as stats
from sklearn.ensemble import IsolationForest

# modelling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Undersampling
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import (SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE, SMOTEN)
from imblearn.under_sampling import (TomekLinks, NearMiss, AllKNN,
                                     EditedNearestNeighbours, 
                                     RepeatedEditedNearestNeighbours) 
from imblearn.combine import SMOTETomek, SMOTEENN

from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     StratifiedKFold)

from sklearn.model_selection import cross_validate

# eval metrics
from sklearn.metrics import (precision_score, recall_score, f1_score)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jamie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1) Data Preparation

### 1.1) Import Data

In [None]:
df_raw = pd.read_csv(r"C:\Users\Jamie\esk\esk_sprint2\project_sprint2\data_exploration\phl_schp_deped_clean.csv")

In [None]:
print(df_raw.shape)
df_raw.head()

### 1.2) Final Columns

In [None]:
final_cols = ['school_type', 'region', 'province', 'legislative', 'division',  'total_enrollees', 'total_instructors',
'poverty_incidence_among_families', 'population_as_of_may_2020', 'unemployment_rate_per_region']

In [None]:
df = df_raw.copy(deep=True)

In [None]:
df = df[final_cols]

In [None]:
df.head()

In [None]:
to_drop = []

for col in tqdm(df.columns):
    if df[col].dtype == 'O': # if data is an object type
        dummies = pd.get_dummies(df[col], prefix=col, drop_first=False) # create binary/dummy variable, using column name as the prefix
        df = pd.concat([df, dummies], axis=1) # 
        to_drop.append(col)

df = df.drop(to_drop, axis=1)
print(df.shape)
df.head()

### 1.3) Save data set

In [None]:
# df.to_csv(r"C:\Users\Jamie\esk\esk_sprint2\project_sprint2\data_exploration\final_ohe_data.csv")

In [2]:
df = pd.read_csv(r"C:\Users\Jamie\esk\esk_sprint2\project_sprint2\data_exploration\final_ohe_data.csv")

In [3]:
print(df.shape)
df.head()

(15037, 304)


Unnamed: 0.1,Unnamed: 0,school_type,total_enrollees,total_instructors,poverty_incidence_among_families,population_as_of_may_2020,unemployment_rate_per_region,region_ARMM,region_CAR,region_CARAGA,region_NCR,region_REGION I,region_REGION II,region_REGION III,region_REGION IVA,region_REGION IVB,region_REGION IX,region_REGION V,region_REGION VI,region_REGION VII,region_REGION VIII,region_REGION X,region_REGION XI,region_REGION XII,province_ABRA,province_AGUSAN DEL NORTE,province_AGUSAN DEL SUR,province_AKLAN,province_ALBAY,province_ANTIQUE,province_APAYAO,province_AURORA,province_BASILAN,province_BATAAN,province_BATANES,province_BATANGAS,province_BENGUET,province_BILIRAN,province_BOHOL,province_BUKIDNON,province_BULACAN,province_CAGAYAN,province_CAMARINES NORTE,province_CAMARINES SUR,province_CAMIGUIN,province_CAPIZ,province_CATANDUANES,province_CAVITE,province_CEBU,province_CITY OF COTABATO,province_COMPOSTELA VALLEY,province_DAVAO DEL NORTE,province_DAVAO DEL SUR,province_DAVAO ORIENTAL,province_DINAGAT ISLANDS,province_EASTERN SAMAR,province_GUIMARAS,province_IFUGAO,province_ILOCOS NORTE,province_ILOCOS SUR,province_ILOILO,province_ISABELA,province_KALINGA,province_LA UNION,province_LAGUNA,province_LANAO DEL NORTE,province_LANAO DEL SUR,province_LEYTE,province_MAGUINDANAO,"province_MANILA, NCR, FIRST DISTRICT",province_MARINDUQUE,province_MASBATE,province_MISAMIS OCCIDENTAL,province_MISAMIS ORIENTAL,province_MOUNTAIN PROVINCE,province_NCR FOURTH DISTRICT,province_NCR SECOND DISTRICT,province_NCR THIRD DISTRICT,province_NEGROS OCCIDENTAL,province_NEGROS ORIENTAL,province_NORTH COTABATO,province_NORTHERN SAMAR,province_NUEVA ECIJA,province_OCCIDENTAL MINDORO,province_ORIENTAL MINDORO,province_PALAWAN,province_PAMPANGA,province_PANGASINAN,province_QUEZON,province_QUIRINO,province_RIZAL,province_ROMBLON,province_SARANGANI,province_SIQUIJOR,province_SORSOGON,province_SOUTH COTABATO,province_SOUTHERN LEYTE,province_SULTAN KUDARAT,province_SULU,province_SURIGAO DEL NORTE,province_SURIGAO DEL SUR,province_TARLAC,province_TAWI-TAWI,province_WESTERN SAMAR,province_ZAMBALES,province_ZAMBOANGA DEL NORTE,province_ZAMBOANGA DEL SUR,province_ZAMBOANGA SIBUGAY,legislative_1ST,legislative_2ND,legislative_3RD,legislative_4TH,legislative_5TH,legislative_6TH,legislative_7TH,legislative_LONE,division_ABRA,division_AGUSAN DEL NORTE,division_AGUSAN DEL SUR,division_AKLAN,division_ALAMINOS CITY,division_ALBAY,division_ANGELES CITY,division_ANTIPOLO CITY,division_ANTIQUE,division_APAYAO,division_AURORA,division_BACOLOD CITY,division_BAGO CITY,division_BAGUIO CITY,division_BAIS CITY,division_BALANGA CITY,division_BASILAN,division_BATAAN,division_BATANES,division_BATANGAS,division_BATANGAS CITY,division_BAYAWAN CITY,division_BENGUET,division_BILIRAN,division_BISLIG CITY,division_BOGO CITY,division_BOHOL,division_BUKIDNON,division_BULACAN,division_BUTUAN CITY,division_CABANATUAN CITY,division_CADIZ CITY,division_CAGAYAN,division_CAGAYAN DE ORO CITY,division_CALAMBA CITY,division_CALAPAN CITY,division_CALBAYOG CITY,division_CALOOCAN CITY,division_CAMARINES NORTE,division_CAMARINES SUR,division_CAMIGUIN,division_CANDON CITY,division_CAPIZ,division_CARCAR CITY,division_CATANDUANES,division_CAUAYAN CITY,division_CAVITE,division_CAVITE CITY,division_CEBU,division_CEBU CITY,division_CITY OF NAGA CEBU,division_CITY OF SAN JUAN,division_COMPOSTELA VALLEY,division_COTABATO CITY,division_DAGUPAN CITY,division_DANAO CITY,division_DASMARINAS CITY,division_DAVAO CITY,division_DAVAO DEL NORTE,division_DAVAO DEL SUR,division_DAVAO ORIENTAL,division_DIGOS CITY,division_DINAGAT ISLAND,division_DIPOLOG CITY,division_DUMAGUETE CITY,division_EASTERN SAMAR,division_ESCALANTE CITY,division_GAPAN CITY,division_GENERAL SANTOS CITY,division_GINGOOG CITY,division_GUIHULNGAN CITY,division_GUIMARAS,division_IFUGAO,division_ILIGAN CITY,division_ILOCOS NORTE,division_ILOCOS SUR,division_ILOILO,division_ILOILO CITY,division_IRIGA CITY,division_ISABELA,division_ISLAND GARDEN CITY OF SAMAL,division_KABANKALAN CITY,division_KALINGA,division_KIDAPAWAN CITY,division_KORONADAL CITY,division_LA UNION,division_LAGUNA,division_LAMITAN CITY,division_LANAO DEL NORTE,division_LANAO DEL SUR IA,division_LANAO DEL SUR IB,division_LANAO DEL SUR IIA,division_LANAO DEL SUR IIB,division_LAOAG CITY,division_LAPULAPU CITY,division_LAS PINAS CITY,division_LEGASPI CITY,division_LEYTE,division_LIGAO CITY,division_LIPA CITY,division_LUCENA CITY,division_MAGUINDANAO I,division_MAGUINDANAO II,division_MAKATI CITY,division_MALABON CITY,division_MALAYBALAY CITY,division_MANDALUYONG CITY,division_MANDAUE CITY,division_MANILA,division_MARAWI CITY,division_MARIKINA CITY,division_MARINDUQUE,division_MASBATE,division_MASBATE CITY,division_MISAMIS OCCIDENTAL,division_MISAMIS ORIENTAL,division_MT PROVINCE,division_MUNOZ SCIENCE CITY,division_MUNTINLUPA CITY,division_NAGA CITY,division_NAVOTAS,division_NEGROS OCCIDENTAL,division_NEGROS ORIENTAL,division_NORTH COTABATO,division_NORTHERN SAMAR,division_NUEVA ECIJA,division_OCCIDENTAL MINDORO,division_OLONGAPO CITY,division_ORIENTAL MINDORO,division_ORMOC CITY,division_OZAMIS CITY,division_PALAWAN,division_PAMPANGA,division_PANABO CITY,division_PANGASINAN I LINGAYEN,division_PANGASINAN II BINALONAN,division_PARANAQUE CITY,division_PASAY CITY,division_PASIG CITY,division_PASSI CITY,division_PUERTO PRINCESA CITY,division_QUEZON,division_QUEZON CITY,division_QUIRINO,division_RIZAL,division_ROMBLON,division_ROXAS CITY,division_SAGAY CITY,division_SAMAR WESTERN SAMAR,division_SAN CARLOS CITY,division_SAN FERNANDO CITY,division_SAN JOSE CITY,division_SAN JOSE DEL MONTE CITY,division_SAN PABLO CITY,division_SANTIAGO CITY,division_SARANGANI,division_SIARGAO,division_SILAY CITY,division_SIQUIJOR,division_SORSOGON,division_SORSOGON CITY,division_SOUTH COTABATO,division_SOUTHERN LEYTE,division_STA ROSA CITY,division_SULTAN KUDARAT,division_SULU I,division_SULU II,division_SURIGAO CITY,division_SURIGAO DEL NORTE,division_SURIGAO DEL SUR,division_TABACO CITY,division_TAGUIG,division_TAGUM CITY,division_TALISAY CITY,division_TANAUAN CITY,division_TANGUB CITY,division_TANJAY CITY,division_TARLAC,division_TARLAC CITY,division_TAWITAWI,division_TUGUEGARAO CITY,division_URDANETA CITY,division_VALENCIA CITY,division_VALENZUELA CITY,division_VIGAN CITY,division_ZAMBALES,division_ZAMBOANGA DEL SUR,division_ZAMBOANGA SIBUGAY
0,0,0,1587,14,0.697243,3861951,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,2681,22,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,1,1223,13,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,1,3347,30,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,1,2555,39,1.426553,4771371,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
df.drop(labels='Unnamed: 0', axis=1, inplace=True)
print(df.shape)
df.head()

(15037, 303)


Unnamed: 0,school_type,total_enrollees,total_instructors,poverty_incidence_among_families,population_as_of_may_2020,unemployment_rate_per_region,region_ARMM,region_CAR,region_CARAGA,region_NCR,region_REGION I,region_REGION II,region_REGION III,region_REGION IVA,region_REGION IVB,region_REGION IX,region_REGION V,region_REGION VI,region_REGION VII,region_REGION VIII,region_REGION X,region_REGION XI,region_REGION XII,province_ABRA,province_AGUSAN DEL NORTE,province_AGUSAN DEL SUR,province_AKLAN,province_ALBAY,province_ANTIQUE,province_APAYAO,province_AURORA,province_BASILAN,province_BATAAN,province_BATANES,province_BATANGAS,province_BENGUET,province_BILIRAN,province_BOHOL,province_BUKIDNON,province_BULACAN,province_CAGAYAN,province_CAMARINES NORTE,province_CAMARINES SUR,province_CAMIGUIN,province_CAPIZ,province_CATANDUANES,province_CAVITE,province_CEBU,province_CITY OF COTABATO,province_COMPOSTELA VALLEY,province_DAVAO DEL NORTE,province_DAVAO DEL SUR,province_DAVAO ORIENTAL,province_DINAGAT ISLANDS,province_EASTERN SAMAR,province_GUIMARAS,province_IFUGAO,province_ILOCOS NORTE,province_ILOCOS SUR,province_ILOILO,province_ISABELA,province_KALINGA,province_LA UNION,province_LAGUNA,province_LANAO DEL NORTE,province_LANAO DEL SUR,province_LEYTE,province_MAGUINDANAO,"province_MANILA, NCR, FIRST DISTRICT",province_MARINDUQUE,province_MASBATE,province_MISAMIS OCCIDENTAL,province_MISAMIS ORIENTAL,province_MOUNTAIN PROVINCE,province_NCR FOURTH DISTRICT,province_NCR SECOND DISTRICT,province_NCR THIRD DISTRICT,province_NEGROS OCCIDENTAL,province_NEGROS ORIENTAL,province_NORTH COTABATO,province_NORTHERN SAMAR,province_NUEVA ECIJA,province_OCCIDENTAL MINDORO,province_ORIENTAL MINDORO,province_PALAWAN,province_PAMPANGA,province_PANGASINAN,province_QUEZON,province_QUIRINO,province_RIZAL,province_ROMBLON,province_SARANGANI,province_SIQUIJOR,province_SORSOGON,province_SOUTH COTABATO,province_SOUTHERN LEYTE,province_SULTAN KUDARAT,province_SULU,province_SURIGAO DEL NORTE,province_SURIGAO DEL SUR,province_TARLAC,province_TAWI-TAWI,province_WESTERN SAMAR,province_ZAMBALES,province_ZAMBOANGA DEL NORTE,province_ZAMBOANGA DEL SUR,province_ZAMBOANGA SIBUGAY,legislative_1ST,legislative_2ND,legislative_3RD,legislative_4TH,legislative_5TH,legislative_6TH,legislative_7TH,legislative_LONE,division_ABRA,division_AGUSAN DEL NORTE,division_AGUSAN DEL SUR,division_AKLAN,division_ALAMINOS CITY,division_ALBAY,division_ANGELES CITY,division_ANTIPOLO CITY,division_ANTIQUE,division_APAYAO,division_AURORA,division_BACOLOD CITY,division_BAGO CITY,division_BAGUIO CITY,division_BAIS CITY,division_BALANGA CITY,division_BASILAN,division_BATAAN,division_BATANES,division_BATANGAS,division_BATANGAS CITY,division_BAYAWAN CITY,division_BENGUET,division_BILIRAN,division_BISLIG CITY,division_BOGO CITY,division_BOHOL,division_BUKIDNON,division_BULACAN,division_BUTUAN CITY,division_CABANATUAN CITY,division_CADIZ CITY,division_CAGAYAN,division_CAGAYAN DE ORO CITY,division_CALAMBA CITY,division_CALAPAN CITY,division_CALBAYOG CITY,division_CALOOCAN CITY,division_CAMARINES NORTE,division_CAMARINES SUR,division_CAMIGUIN,division_CANDON CITY,division_CAPIZ,division_CARCAR CITY,division_CATANDUANES,division_CAUAYAN CITY,division_CAVITE,division_CAVITE CITY,division_CEBU,division_CEBU CITY,division_CITY OF NAGA CEBU,division_CITY OF SAN JUAN,division_COMPOSTELA VALLEY,division_COTABATO CITY,division_DAGUPAN CITY,division_DANAO CITY,division_DASMARINAS CITY,division_DAVAO CITY,division_DAVAO DEL NORTE,division_DAVAO DEL SUR,division_DAVAO ORIENTAL,division_DIGOS CITY,division_DINAGAT ISLAND,division_DIPOLOG CITY,division_DUMAGUETE CITY,division_EASTERN SAMAR,division_ESCALANTE CITY,division_GAPAN CITY,division_GENERAL SANTOS CITY,division_GINGOOG CITY,division_GUIHULNGAN CITY,division_GUIMARAS,division_IFUGAO,division_ILIGAN CITY,division_ILOCOS NORTE,division_ILOCOS SUR,division_ILOILO,division_ILOILO CITY,division_IRIGA CITY,division_ISABELA,division_ISLAND GARDEN CITY OF SAMAL,division_KABANKALAN CITY,division_KALINGA,division_KIDAPAWAN CITY,division_KORONADAL CITY,division_LA UNION,division_LAGUNA,division_LAMITAN CITY,division_LANAO DEL NORTE,division_LANAO DEL SUR IA,division_LANAO DEL SUR IB,division_LANAO DEL SUR IIA,division_LANAO DEL SUR IIB,division_LAOAG CITY,division_LAPULAPU CITY,division_LAS PINAS CITY,division_LEGASPI CITY,division_LEYTE,division_LIGAO CITY,division_LIPA CITY,division_LUCENA CITY,division_MAGUINDANAO I,division_MAGUINDANAO II,division_MAKATI CITY,division_MALABON CITY,division_MALAYBALAY CITY,division_MANDALUYONG CITY,division_MANDAUE CITY,division_MANILA,division_MARAWI CITY,division_MARIKINA CITY,division_MARINDUQUE,division_MASBATE,division_MASBATE CITY,division_MISAMIS OCCIDENTAL,division_MISAMIS ORIENTAL,division_MT PROVINCE,division_MUNOZ SCIENCE CITY,division_MUNTINLUPA CITY,division_NAGA CITY,division_NAVOTAS,division_NEGROS OCCIDENTAL,division_NEGROS ORIENTAL,division_NORTH COTABATO,division_NORTHERN SAMAR,division_NUEVA ECIJA,division_OCCIDENTAL MINDORO,division_OLONGAPO CITY,division_ORIENTAL MINDORO,division_ORMOC CITY,division_OZAMIS CITY,division_PALAWAN,division_PAMPANGA,division_PANABO CITY,division_PANGASINAN I LINGAYEN,division_PANGASINAN II BINALONAN,division_PARANAQUE CITY,division_PASAY CITY,division_PASIG CITY,division_PASSI CITY,division_PUERTO PRINCESA CITY,division_QUEZON,division_QUEZON CITY,division_QUIRINO,division_RIZAL,division_ROMBLON,division_ROXAS CITY,division_SAGAY CITY,division_SAMAR WESTERN SAMAR,division_SAN CARLOS CITY,division_SAN FERNANDO CITY,division_SAN JOSE CITY,division_SAN JOSE DEL MONTE CITY,division_SAN PABLO CITY,division_SANTIAGO CITY,division_SARANGANI,division_SIARGAO,division_SILAY CITY,division_SIQUIJOR,division_SORSOGON,division_SORSOGON CITY,division_SOUTH COTABATO,division_SOUTHERN LEYTE,division_STA ROSA CITY,division_SULTAN KUDARAT,division_SULU I,division_SULU II,division_SURIGAO CITY,division_SURIGAO DEL NORTE,division_SURIGAO DEL SUR,division_TABACO CITY,division_TAGUIG,division_TAGUM CITY,division_TALISAY CITY,division_TANAUAN CITY,division_TANGUB CITY,division_TANJAY CITY,division_TARLAC,division_TARLAC CITY,division_TAWITAWI,division_TUGUEGARAO CITY,division_URDANETA CITY,division_VALENCIA CITY,division_VALENZUELA CITY,division_VIGAN CITY,division_ZAMBALES,division_ZAMBOANGA DEL SUR,division_ZAMBOANGA SIBUGAY
0,0,1587,14,0.697243,3861951,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2681,22,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1223,13,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,3347,30,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,2555,39,1.426553,4771371,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


-------------

# 2) Modeling
- Logistic
- Decision Tree

In [5]:
X = df.drop(['school_type'], axis=1)
y = df['school_type']

(X_trainval, X_holdout, y_trainval, y_holdout) = train_test_split(X, y, 
                                                                  random_state=890,
                                                                  test_size=0.25,
                                                                  stratify=y)

In [6]:
print(X.shape)
X.head()

(15037, 302)


Unnamed: 0,total_enrollees,total_instructors,poverty_incidence_among_families,population_as_of_may_2020,unemployment_rate_per_region,region_ARMM,region_CAR,region_CARAGA,region_NCR,region_REGION I,region_REGION II,region_REGION III,region_REGION IVA,region_REGION IVB,region_REGION IX,region_REGION V,region_REGION VI,region_REGION VII,region_REGION VIII,region_REGION X,region_REGION XI,region_REGION XII,province_ABRA,province_AGUSAN DEL NORTE,province_AGUSAN DEL SUR,province_AKLAN,province_ALBAY,province_ANTIQUE,province_APAYAO,province_AURORA,province_BASILAN,province_BATAAN,province_BATANES,province_BATANGAS,province_BENGUET,province_BILIRAN,province_BOHOL,province_BUKIDNON,province_BULACAN,province_CAGAYAN,province_CAMARINES NORTE,province_CAMARINES SUR,province_CAMIGUIN,province_CAPIZ,province_CATANDUANES,province_CAVITE,province_CEBU,province_CITY OF COTABATO,province_COMPOSTELA VALLEY,province_DAVAO DEL NORTE,province_DAVAO DEL SUR,province_DAVAO ORIENTAL,province_DINAGAT ISLANDS,province_EASTERN SAMAR,province_GUIMARAS,province_IFUGAO,province_ILOCOS NORTE,province_ILOCOS SUR,province_ILOILO,province_ISABELA,province_KALINGA,province_LA UNION,province_LAGUNA,province_LANAO DEL NORTE,province_LANAO DEL SUR,province_LEYTE,province_MAGUINDANAO,"province_MANILA, NCR, FIRST DISTRICT",province_MARINDUQUE,province_MASBATE,province_MISAMIS OCCIDENTAL,province_MISAMIS ORIENTAL,province_MOUNTAIN PROVINCE,province_NCR FOURTH DISTRICT,province_NCR SECOND DISTRICT,province_NCR THIRD DISTRICT,province_NEGROS OCCIDENTAL,province_NEGROS ORIENTAL,province_NORTH COTABATO,province_NORTHERN SAMAR,province_NUEVA ECIJA,province_OCCIDENTAL MINDORO,province_ORIENTAL MINDORO,province_PALAWAN,province_PAMPANGA,province_PANGASINAN,province_QUEZON,province_QUIRINO,province_RIZAL,province_ROMBLON,province_SARANGANI,province_SIQUIJOR,province_SORSOGON,province_SOUTH COTABATO,province_SOUTHERN LEYTE,province_SULTAN KUDARAT,province_SULU,province_SURIGAO DEL NORTE,province_SURIGAO DEL SUR,province_TARLAC,province_TAWI-TAWI,province_WESTERN SAMAR,province_ZAMBALES,province_ZAMBOANGA DEL NORTE,province_ZAMBOANGA DEL SUR,province_ZAMBOANGA SIBUGAY,legislative_1ST,legislative_2ND,legislative_3RD,legislative_4TH,legislative_5TH,legislative_6TH,legislative_7TH,legislative_LONE,division_ABRA,division_AGUSAN DEL NORTE,division_AGUSAN DEL SUR,division_AKLAN,division_ALAMINOS CITY,division_ALBAY,division_ANGELES CITY,division_ANTIPOLO CITY,division_ANTIQUE,division_APAYAO,division_AURORA,division_BACOLOD CITY,division_BAGO CITY,division_BAGUIO CITY,division_BAIS CITY,division_BALANGA CITY,division_BASILAN,division_BATAAN,division_BATANES,division_BATANGAS,division_BATANGAS CITY,division_BAYAWAN CITY,division_BENGUET,division_BILIRAN,division_BISLIG CITY,division_BOGO CITY,division_BOHOL,division_BUKIDNON,division_BULACAN,division_BUTUAN CITY,division_CABANATUAN CITY,division_CADIZ CITY,division_CAGAYAN,division_CAGAYAN DE ORO CITY,division_CALAMBA CITY,division_CALAPAN CITY,division_CALBAYOG CITY,division_CALOOCAN CITY,division_CAMARINES NORTE,division_CAMARINES SUR,division_CAMIGUIN,division_CANDON CITY,division_CAPIZ,division_CARCAR CITY,division_CATANDUANES,division_CAUAYAN CITY,division_CAVITE,division_CAVITE CITY,division_CEBU,division_CEBU CITY,division_CITY OF NAGA CEBU,division_CITY OF SAN JUAN,division_COMPOSTELA VALLEY,division_COTABATO CITY,division_DAGUPAN CITY,division_DANAO CITY,division_DASMARINAS CITY,division_DAVAO CITY,division_DAVAO DEL NORTE,division_DAVAO DEL SUR,division_DAVAO ORIENTAL,division_DIGOS CITY,division_DINAGAT ISLAND,division_DIPOLOG CITY,division_DUMAGUETE CITY,division_EASTERN SAMAR,division_ESCALANTE CITY,division_GAPAN CITY,division_GENERAL SANTOS CITY,division_GINGOOG CITY,division_GUIHULNGAN CITY,division_GUIMARAS,division_IFUGAO,division_ILIGAN CITY,division_ILOCOS NORTE,division_ILOCOS SUR,division_ILOILO,division_ILOILO CITY,division_IRIGA CITY,division_ISABELA,division_ISLAND GARDEN CITY OF SAMAL,division_KABANKALAN CITY,division_KALINGA,division_KIDAPAWAN CITY,division_KORONADAL CITY,division_LA UNION,division_LAGUNA,division_LAMITAN CITY,division_LANAO DEL NORTE,division_LANAO DEL SUR IA,division_LANAO DEL SUR IB,division_LANAO DEL SUR IIA,division_LANAO DEL SUR IIB,division_LAOAG CITY,division_LAPULAPU CITY,division_LAS PINAS CITY,division_LEGASPI CITY,division_LEYTE,division_LIGAO CITY,division_LIPA CITY,division_LUCENA CITY,division_MAGUINDANAO I,division_MAGUINDANAO II,division_MAKATI CITY,division_MALABON CITY,division_MALAYBALAY CITY,division_MANDALUYONG CITY,division_MANDAUE CITY,division_MANILA,division_MARAWI CITY,division_MARIKINA CITY,division_MARINDUQUE,division_MASBATE,division_MASBATE CITY,division_MISAMIS OCCIDENTAL,division_MISAMIS ORIENTAL,division_MT PROVINCE,division_MUNOZ SCIENCE CITY,division_MUNTINLUPA CITY,division_NAGA CITY,division_NAVOTAS,division_NEGROS OCCIDENTAL,division_NEGROS ORIENTAL,division_NORTH COTABATO,division_NORTHERN SAMAR,division_NUEVA ECIJA,division_OCCIDENTAL MINDORO,division_OLONGAPO CITY,division_ORIENTAL MINDORO,division_ORMOC CITY,division_OZAMIS CITY,division_PALAWAN,division_PAMPANGA,division_PANABO CITY,division_PANGASINAN I LINGAYEN,division_PANGASINAN II BINALONAN,division_PARANAQUE CITY,division_PASAY CITY,division_PASIG CITY,division_PASSI CITY,division_PUERTO PRINCESA CITY,division_QUEZON,division_QUEZON CITY,division_QUIRINO,division_RIZAL,division_ROMBLON,division_ROXAS CITY,division_SAGAY CITY,division_SAMAR WESTERN SAMAR,division_SAN CARLOS CITY,division_SAN FERNANDO CITY,division_SAN JOSE CITY,division_SAN JOSE DEL MONTE CITY,division_SAN PABLO CITY,division_SANTIAGO CITY,division_SARANGANI,division_SIARGAO,division_SILAY CITY,division_SIQUIJOR,division_SORSOGON,division_SORSOGON CITY,division_SOUTH COTABATO,division_SOUTHERN LEYTE,division_STA ROSA CITY,division_SULTAN KUDARAT,division_SULU I,division_SULU II,division_SURIGAO CITY,division_SURIGAO DEL NORTE,division_SURIGAO DEL SUR,division_TABACO CITY,division_TAGUIG,division_TAGUM CITY,division_TALISAY CITY,division_TANAUAN CITY,division_TANGUB CITY,division_TANJAY CITY,division_TARLAC,division_TARLAC CITY,division_TAWITAWI,division_TUGUEGARAO CITY,division_URDANETA CITY,division_VALENCIA CITY,division_VALENZUELA CITY,division_VIGAN CITY,division_ZAMBALES,division_ZAMBOANGA DEL SUR,division_ZAMBOANGA SIBUGAY
0,1587,14,0.697243,3861951,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2681,22,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1223,13,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3347,30,2.177679,3004627,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2555,39,1.426553,4771371,11.657282,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
print(y.shape)
y.head()

(15037,)


0    0
1    1
2    1
3    1
4    1
Name: school_type, dtype: int64

In [8]:
def auto_ml(X, y, models_dict, scaler=None, cv=None, res_t=None):
    """Apply resampling for imbalanced data"""
    results = {}
    results2 = {}

    # log start time
    total_start = time.time()

    for model_name, model in tqdm(models_dict.items()):
        
        ################# this portion can be edited ###############
        train_scores = []
        val_scores = []

        train_recall = []
        val_recall = []

        ###########################################################

        for train_index, val_index in tqdm(cv.split(X, y)):

            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Apply scaling
            if scaler is not None:
                X_train['population_as_of_may_2020'] = scaler.fit_transform(X_train['population_as_of_may_2020'].values.reshape(-1, 1))
                X_val['population_as_of_may_2020']   = scaler.fit_transform(X_val['population_as_of_may_2020'].values.reshape(-1, 1))
#                 X_train = scaler.fit_transform(X_train)
#                 X_val   = scaler.transform(X_val)
            else:
                pass

            # Apply resampling
            if res_t is not None:
                s = time.time()
                X_train, y_train = res_t.fit_resample(X_train, y_train)
                print(f'Resampling done in {time.time() - s}')
            else:
                pass

            start_time = time.time()

            # fit
            model.fit(X_train, y_train)

            # predict
            train_preds = model.predict(X_train)
            val_preds = model.predict(X_val)
            
            ################# this portion can be edited ###############

            # classification accuracy
            train_scores.append(model.score(X_train, y_train))
            val_scores.append(model.score(X_val, y_val))

            # recall
            train_recall.append(recall_score(y_train, train_preds))
            val_recall.append(recall_score(y_val, val_preds))

            end_time = time.time()

            results[model_name] = {
                'Train Accuracy': np.round(np.mean(train_scores) * 100, 2),
                'Val Accuracy': np.round(np.mean(val_scores) * 100, 2),
                'Train Recall': np.round(np.mean(train_recall) * 100, 2),
                'Val Recall': np.round(np.mean(val_recall) * 100, 2),
                'Run Time': end_time - start_time
            }

            results2[model_name] = {
                'Train Accuracy': '{:.2f}%'.format(np.round(np.mean(train_scores)*100, 2)),
                'Val Accuracy': '{:.2f}%'.format(np.round(np.mean(val_scores)*100, 2)),
                'Train Recall': '{:.2f}%'.format(np.round(np.mean(train_recall)*100, 2)),
                'Val Recall': '{:.2f}%'.format(np.round(np.mean(val_recall)*100, 2)),
                'Run Time': end_time - start_time
            }
            
            ###########################################################

    results = pd.DataFrame(results).T
    results2 = pd.DataFrame(results2).T
    return [results, results2]

In [9]:
models_dict = {'DecisionTreeClassifier': DecisionTreeClassifier(random_state=890, max_depth = 5)}

### BASELINE

In [10]:
auto_run = auto_ml(X_trainval, y_trainval, models_dict,
                   scaler=StandardScaler(),
                   cv=StratifiedKFold(n_splits=5))
auto = auto_run[0]
auto_run[1]

  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  1.03it/s][A
2it [00:01,  1.81it/s][A
3it [00:01,  2.35it/s][A
4it [00:01,  2.78it/s][A
5it [00:02,  2.47it/s][A
100%|██████████| 1/1 [00:02<00:00,  2.03s/it]


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
DecisionTreeClassifier,0.178191,98.93%,99.97%,98.70%,99.82%


### WITH SAMPLING

In [11]:
res_list = [('Random UnderSamp', RandomUnderSampler(random_state=890)),
            ('Random OverSamp', RandomOverSampler(random_state=890)),
            ('TomekLinks', TomekLinks(sampling_strategy='not majority')), #undersample
            ('NearMiss', NearMiss())]


outputs = []
for title, res in res_list:
    print(colored(title, 'red', attrs=['bold']).center(120, "-"))
    
    var = auto_ml(X_trainval, y_trainval,
                  models_dict,
                  scaler=StandardScaler(),
                  cv=StratifiedKFold(n_splits=5), res_t=res)
    
    outputs.append(var[0])
    display(var[1])

---------------------------------------------[1m[31mRandom UnderSamp[0m----------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

Resampling done in 0.07578849792480469



1it [00:00,  4.91it/s][A

Resampling done in 0.057917118072509766



2it [00:00,  5.29it/s][A

Resampling done in 0.055014610290527344



3it [00:00,  5.44it/s][A

Resampling done in 0.061304330825805664



4it [00:00,  5.36it/s][A

Resampling done in 0.06740593910217285


5it [00:00,  5.27it/s][A
100%|██████████| 1/1 [00:00<00:00,  1.05it/s]







Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
DecisionTreeClassifier,0.042578,77.59%,84.44%,69.98%,70.16%


----------------------------------------------[1m[31mRandom OverSamp[0m----------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

Resampling done in 0.24759817123413086



1it [00:00,  1.52it/s][A

Resampling done in 0.23944520950317383



2it [00:01,  1.51it/s][A

Resampling done in 0.24400639533996582



3it [00:02,  1.47it/s][A

Resampling done in 0.18718242645263672



4it [00:02,  1.50it/s][A

Resampling done in 0.22448945045471191



5it [00:03,  1.49it/s][A
100%|██████████| 1/1 [00:03<00:00,  3.35s/it]


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
DecisionTreeClassifier,0.369241,72.72%,54.94%,55.01%,54.76%


------------------------------------------------[1m[31mTomekLinks[0m-------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

Resampling done in 2.647775173187256



1it [00:02,  2.93s/it][A

Resampling done in 2.213833808898926



2it [00:05,  2.70s/it][A


Resampling done in 2.1242568492889404


3it [00:07,  2.57s/it][A

Resampling done in 2.069859743118286



4it [00:10,  2.49s/it][A


Resampling done in 2.165234088897705


5it [00:12,  2.54s/it][A
100%|██████████| 1/1 [00:12<00:00, 12.71s/it]


Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
DecisionTreeClassifier,0.218819,99.44%,99.98%,98.74%,99.90%


-------------------------------------------------[1m[31mNearMiss[0m--------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  3.56it/s][A

Resampling done in 0.1394963264465332
Resampling done in 0.13414597511291504



2it [00:00,  3.70it/s][A
3it [00:00,  3.73it/s][A

Resampling done in 0.14177799224853516



4it [00:01,  3.51it/s][A

Resampling done in 0.1735067367553711



5it [00:01,  3.58it/s][A
100%|██████████| 1/1 [00:01<00:00,  1.40s/it]

Resampling done in 0.14746308326721191





Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall
DecisionTreeClassifier,0.054135,87.22%,94.63%,24.30%,23.71%


In [12]:
perfs = pd.DataFrame()

In [13]:
auto_run[0]['Type'] = 'Base'
perfs = pd.concat([auto_run[0], perfs.loc[:]]).reset_index()
perfs.drop(labels=['index'], axis=1, inplace=True)
perfs

Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall,Type
0,0.178191,98.93,99.97,98.7,99.82,Base


In [14]:
outputs[0]['Type'] = 'Random Under Sampling'
perfs = pd.concat([outputs[0], perfs.loc[-1:]]).reset_index()
perfs.drop(labels=['index'], axis=1, inplace=True)
perfs

Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall,Type
0,0.042578,77.59,84.44,69.98,70.16,Random Under Sampling
1,0.178191,98.93,99.97,98.7,99.82,Base


In [15]:
outputs[1]['Type'] = 'TomekLinks'
perfs = pd.concat([outputs[1], perfs.loc[:]]).reset_index()
perfs.drop(labels=['index'], axis=1, inplace=True)
perfs

Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall,Type
0,0.369241,72.72,54.94,55.01,54.76,TomekLinks
1,0.042578,77.59,84.44,69.98,70.16,Random Under Sampling
2,0.178191,98.93,99.97,98.7,99.82,Base


In [16]:
outputs[2]['Type'] = 'NearMiss'
perfs = pd.concat([outputs[1], perfs.loc[:]]).reset_index()
perfs.drop(labels=['index'], axis=1, inplace=True)
perfs

Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall,Type
0,0.369241,72.72,54.94,55.01,54.76,TomekLinks
1,0.369241,72.72,54.94,55.01,54.76,TomekLinks
2,0.042578,77.59,84.44,69.98,70.16,Random Under Sampling
3,0.178191,98.93,99.97,98.7,99.82,Base


In [17]:
perfs['Acc Difference'] = perfs['Train Accuracy'] - perfs['Val Accuracy']
perfs['Recall Difference'] = perfs['Train Recall'] - perfs['Val Recall']
perfs

Unnamed: 0,Run Time,Train Accuracy,Train Recall,Val Accuracy,Val Recall,Type,Acc Difference,Recall Difference
0,0.369241,72.72,54.94,55.01,54.76,TomekLinks,17.71,0.18
1,0.369241,72.72,54.94,55.01,54.76,TomekLinks,17.71,0.18
2,0.042578,77.59,84.44,69.98,70.16,Random Under Sampling,7.61,14.28
3,0.178191,98.93,99.97,98.7,99.82,Base,0.23,0.15


------------

### Hypertune

In [18]:
def train_dt(X, y, metric, res_t=None, hypertune=False, resample=False):
    params = {"max_depth": range(1, 11)
             }
    new_params = {'decisiontreeclassifier__' +
                  key: params[key] for key in params}

    skf = StratifiedKFold(n_splits=5)
    
    if resample is True:
        imba_pipeline = make_pipeline(StandardScaler(),
                                      res_t,
                                      DecisionTreeClassifier(random_state=890))
    else:
        imba_pipeline = make_pipeline(StandardScaler(),
                                      DecisionTreeClassifier(random_state=890))

    if hypertune is True:
        clf = GridSearchCV(imba_pipeline, param_grid=new_params,
                           cv=skf, scoring=metric, return_train_score=True,
                          verbose=10)
        start_time = time.time()
        clf.fit(X, y)
        run_time = (time.time() - start_time)
        model = clf.best_estimator_.fit(X, y)
        return ['DecisionTreeClassifier', np.mean(clf.cv_results_['mean_train_score']),
                clf.best_score_, 'Best parameters = {0}'.format(
                    clf.best_params_),
                run_time, model]
    else:
        clf = imba_pipeline
        start_time = time.time()
        clf.fit(X, y)
        run_time = (time.time() - start_time)
        cv_scores = cross_validate(estimator=clf, X=X, y=y,
                                   scoring=metric, cv=skf, n_jobs=-1,
                                   return_train_score=True)

        return ['DecisionTreeClassifier', np.mean(cv_scores['train_score']),
                np.mean(cv_scores['test_score']), run_time, clf]

In [19]:
start_time = time.time()
dt_tk_rec = train_dt(X_trainval, y_trainval, 'recall', RandomUnderSampler(sampling_strategy = 'majority', random_state=890),
                       hypertune=True, resample=True)

start_time = time.time()
dt_tk_acc = train_dt(X_trainval, y_trainval, 'accuracy', RandomUnderSampler(sampling_strategy = 'majority', random_state=890),
                      hypertune=True, resample=True)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 1/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.906, test=0.906) total time=   0.0s
[CV 2/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 2/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.906, test=0.907) total time=   0.0s
[CV 3/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 3/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.098, test=0.093) total time=   0.0s
[CV 4/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 4/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.988, test=0.987) total time=   0.0s
[CV 5/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 5/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.988, test=0.986) total time=  

[CV 4/5; 9/10] END decisiontreeclassifier__max_depth=9;, score=(train=0.452, test=0.451) total time=   0.0s
[CV 5/5; 9/10] START decisiontreeclassifier__max_depth=9........................
[CV 5/5; 9/10] END decisiontreeclassifier__max_depth=9;, score=(train=0.572, test=0.579) total time=   0.0s
[CV 1/5; 10/10] START decisiontreeclassifier__max_depth=10......................
[CV 1/5; 10/10] END decisiontreeclassifier__max_depth=10;, score=(train=0.669, test=0.657) total time=   0.0s
[CV 2/5; 10/10] START decisiontreeclassifier__max_depth=10......................
[CV 2/5; 10/10] END decisiontreeclassifier__max_depth=10;, score=(train=0.716, test=0.729) total time=   0.0s
[CV 3/5; 10/10] START decisiontreeclassifier__max_depth=10......................
[CV 3/5; 10/10] END decisiontreeclassifier__max_depth=10;, score=(train=0.693, test=0.681) total time=   0.1s
[CV 4/5; 10/10] START decisiontreeclassifier__max_depth=10......................
[CV 4/5; 10/10] END decisiontreeclassifier__max_d

[CV 3/5; 8/10] END decisiontreeclassifier__max_depth=8;, score=(train=0.632, test=0.621) total time=   0.0s
[CV 4/5; 8/10] START decisiontreeclassifier__max_depth=8........................
[CV 4/5; 8/10] END decisiontreeclassifier__max_depth=8;, score=(train=0.847, test=0.842) total time=   0.0s
[CV 5/5; 8/10] START decisiontreeclassifier__max_depth=8........................
[CV 5/5; 8/10] END decisiontreeclassifier__max_depth=8;, score=(train=0.597, test=0.614) total time=   0.0s
[CV 1/5; 9/10] START decisiontreeclassifier__max_depth=9........................
[CV 1/5; 9/10] END decisiontreeclassifier__max_depth=9;, score=(train=0.653, test=0.643) total time=   0.0s
[CV 2/5; 9/10] START decisiontreeclassifier__max_depth=9........................
[CV 2/5; 9/10] END decisiontreeclassifier__max_depth=9;, score=(train=0.711, test=0.724) total time=   0.0s
[CV 3/5; 9/10] START decisiontreeclassifier__max_depth=9........................
[CV 3/5; 9/10] END decisiontreeclassifier__max_depth=9;

In [20]:
cols = ['', 'Train Accuracy', 'Val Accuracy', 'Train Recall', 'Val Recall']
hyper = pd.DataFrame(columns=cols)
hyper.loc[0] = [dt_tk_rec[0],
                np.round(dt_tk_acc[1]*100, 6),
                np.round(dt_tk_acc[2]*100, 6),
                np.round(dt_tk_rec[1]*100, 6),
                np.round(dt_tk_rec[2]*100, 6)]

hyper2 = pd.DataFrame(columns=cols)
hyper2.loc[0] = [dt_tk_rec[0],
                 '{:.2f}%'.format(np.round(dt_tk_acc[1]*100, 2)),
                 '{:.2f}%'.format(np.round(dt_tk_acc[2]*100, 2)),
                 '{:.2f}%'.format(np.round(dt_tk_rec[1]*100, 2)),
                 '{:.2f}%'.format(np.round(dt_tk_rec[2]*100, 2))]
hyper2.style.hide_index()

Unnamed: 0,Train Accuracy,Val Accuracy,Train Recall,Val Recall
DecisionTreeClassifier,67.73%,77.04%,67.68%,77.57%


### Holdout

In [None]:
del rh, y_preds, rh_holdout_score, base_var, base_model, holdout_score

In [21]:
# re-fitting for the holdout

rh = dt_tk_rec[-1]
rus = RandomUnderSampler()
X_trainval, y_trainval = rus.fit_resample(X_trainval, y_trainval)

rh.fit(X_trainval, y_trainval)
y_preds = rh.predict(X_holdout)
rh_holdout_score = recall_score(y_holdout, y_preds)

# rh = dt_tk_rec[-1]
# rh.fit(X_trainval, y_trainval)
# y_preds = rh.predict(X_holdout)
# rh_holdout_score = recall_score(y_holdout, y_preds)

In [22]:
# cross-validation
base_var = train_dt(X_trainval, y_trainval, 'recall', hypertune=True, resample=True)

# re-fitting for the holdout
base_model = base_var[-1]
base_model.fit(X_trainval, y_trainval)
y_preds = base_model.predict(X_holdout)
holdout_score = recall_score(y_holdout, y_preds)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 1/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.972, test=1.000) total time=   0.0s
[CV 2/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 2/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.981, test=0.963) total time=   0.0s
[CV 3/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 3/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.981, test=0.963) total time=   0.0s
[CV 4/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 4/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.972, test=1.000) total time=   0.0s
[CV 5/5; 1/10] START decisiontreeclassifier__max_depth=1........................
[CV 5/5; 1/10] END decisiontreeclassifier__max_depth=1;, score=(train=0.981, test=0.963) total time=  

[CV 2/5; 10/10] END decisiontreeclassifier__max_depth=10;, score=(train=0.833, test=0.741) total time=   0.0s
[CV 3/5; 10/10] START decisiontreeclassifier__max_depth=10......................
[CV 3/5; 10/10] END decisiontreeclassifier__max_depth=10;, score=(train=0.954, test=0.704) total time=   0.0s
[CV 4/5; 10/10] START decisiontreeclassifier__max_depth=10......................
[CV 4/5; 10/10] END decisiontreeclassifier__max_depth=10;, score=(train=0.954, test=0.778) total time=   0.0s
[CV 5/5; 10/10] START decisiontreeclassifier__max_depth=10......................
[CV 5/5; 10/10] END decisiontreeclassifier__max_depth=10;, score=(train=0.926, test=0.852) total time=   0.0s


In [23]:
cols = ['Method', 'Train Recall', 'Val Recall', 'Holdout Recall']
summary = pd.DataFrame(columns=cols)
summary.loc[0] = ['No Resampling',
                 '{:.2f}%'.format(np.round(base_var[1]*100, 2)),
                 '{:.2f}%'.format(np.round(base_var[2]*100, 2)),
                 '{:.2f}%'.format(np.round(holdout_score*100, 2))]
summary.loc[3] = ['Resampling x Hypertune',
                 '{:.2f}%'.format(np.round(dt_tk_rec[1]*100, 2)),
                 '{:.2f}%'.format(np.round(dt_tk_rec[2]*100, 2)),
                 '{:.2f}%'.format(np.round(rh_holdout_score*100, 2))]
summary.style.hide_index()

Method,Train Recall,Val Recall,Holdout Recall
No Resampling,75.28%,97.78%,94.37%
Resampling x Hypertune,67.68%,77.57%,94.37%


---

### Save Model

In [27]:
model = rh['decisiontreeclassifier']

In [28]:
pickle.dump(model, open(r'C:\Users\Jamie\esk\esk_sprint2\project_sprint2\data_exploration\dt_school_type.pkl', 'wb'))

----