In [1]:
#import relevant libraries
import numpy as np
import pandas as pd

In [2]:
# Load the combined dataset into pandas.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
data = pd.read_csv("combined_csv.csv")
# Keep both names bound: later cells read the frame through `df`.
df = data

In [3]:
# Collect the header labels as a plain Python list and show them
column_names = df.columns.tolist()
print(column_names)

['Number', 'Year', 'Decision Date Fiscal Year', 'Decision Month', 'Review Type', 'Decision Category', 'Decision Type Group', 'Decision Type', 'Decision Purpose', 'APR Qualifier', 'Final Decision', 'Grant Rate Qualifier', 'Elder Assisted Hearing Flag', 'Hearing Facility Region', 'Hearing Facility PB Office', 'Jurisdiction', 'Sentence Type', 'Gender', 'Race', 'Race Group', 'Major Offence Group']


Columns to be removed: 
1. APR Qualifier, due to its high percentage of missing values.
2. Decision Type, as it and Decision Type Group are highly correlated. Based on the documentation, Decision Type Group is the broader variable of the two.
3. Race, as it and Race Group are highly correlated. Based on the documentation, Race Group is the broader variable of the two. Number is also dropped, as it is only the record number and is not related to the data.
4. Final Decision, as it is highly correlated with the dependent variable Grant Rate Qualifier. Based on the documentation, Grant Rate Qualifier is the broader variable of the two.

Refer to EDA for more details

In [4]:
# Remove the record number and the redundant / sparse columns, then preview the result
df_EDA = df.drop(
    columns=["Number", "APR Qualifier", "Final Decision", "Decision Type", "Race"]
)
df_EDA.head()

Unnamed: 0,Year,Decision Date Fiscal Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Grant Rate Qualifier,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,2000,2000-2001,Dec,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,ONT,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,SCHE 1 W/O SEX
1,2001,2000-2001,Jan,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,ONT,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
2,2000,2000-2001,May,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
3,2000,2000-2001,Jun,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX
4,2000,2000-2001,Aug,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,PRA,EDMONTON,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX


#feature selection using RFE from SKLearn
#RFE is suitable as this analysis will be making use of classification methods

In [5]:
# Preview only the columns still stored as text (object dtype)
df_EDA.select_dtypes(include="object").head()

Unnamed: 0,Decision Date Fiscal Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Grant Rate Qualifier,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,2000-2001,Dec,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,ONT,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,SCHE 1 W/O SEX
1,2000-2001,Jan,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,ONT,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
2,2000-2001,May,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
3,2000-2001,Jun,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX
4,2000-2001,Aug,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,PRA,EDMONTON,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX


In [6]:
# Most of the variables are non-numeric, and RFE works on numeric inputs,
# so the variables need to be converted.

# Cast "Year" to integer, then list every column's dtype
df_EDA = df_EDA.astype({"Year": int})
print(df_EDA.dtypes)

Year                             int32
Decision Date Fiscal Year       object
Decision Month                  object
Review Type                     object
Decision Category               object
Decision Type Group             object
Decision Purpose                object
Grant Rate Qualifier            object
Elder Assisted Hearing Flag    float64
Hearing Facility Region         object
Hearing Facility PB Office      object
Jurisdiction                    object
Sentence Type                   object
Gender                          object
Race Group                      object
Major Offence Group             object
dtype: object


In [7]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every predictor column so that only the target
# ("Grant Rate Qualifier") remains as text.
# "Decision Date Fiscal Year" is included here: it is an object column and
# would otherwise reach the later numeric-only steps still as strings.
columns_to_encode = [
    "Year",
    "Decision Date Fiscal Year",
    "Decision Month",
    "Review Type",
    "Decision Category",
    "Decision Type Group",
    "Decision Purpose",
    "Elder Assisted Hearing Flag",
    "Hearing Facility Region",
    "Hearing Facility PB Office",
    "Jurisdiction",
    "Sentence Type",
    "Gender",
    "Race Group",
    "Major Offence Group",
]

# One fitted encoder per column, so each integer code can be mapped back to
# its original label later via label_encoders[column].classes_.
label_encoders = {}
for column in columns_to_encode:
    # A fresh encoder per column; `LbEn` stays bound to the last one fitted
    # ("Major Offence Group"), as it was when a single encoder was re-fitted.
    LbEn = LabelEncoder()
    df_EDA[column] = LbEn.fit_transform(df_EDA[column])
    label_encoders[column] = LbEn

In [8]:
# Confirm the conversion: list each column's dtype after label encoding
df_EDA.dtypes

Year                            int64
Decision Date Fiscal Year       int32
Decision Month                  int32
Review Type                     int32
Decision Category               int32
Decision Type Group             int32
Decision Purpose                int32
Grant Rate Qualifier           object
Elder Assisted Hearing Flag     int64
Hearing Facility Region         int32
Hearing Facility PB Office      int32
Jurisdiction                    int32
Sentence Type                   int32
Gender                          int32
Race Group                      int32
Major Offence Group             int32
dtype: object

In [9]:
# Names of the columns that are still object dtype after encoding
df_EDA.select_dtypes(include="object").columns

Index(['Grant Rate Qualifier'], dtype='object')

In [10]:
# Build the feature matrix by removing the dependent variable,
# "Grant Rate Qualifier", from the encoded frame
X = df_EDA.drop(columns="Grant Rate Qualifier")
X

Unnamed: 0,Year,Decision Date Fiscal Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,0,0,5,1,0,2,2,0,1,1,0,0,1,1,3
1,1,0,11,1,0,2,2,0,1,1,0,0,1,2,3
2,0,0,17,1,0,2,2,0,0,1,0,0,1,2,3
3,0,0,13,1,0,2,2,0,0,1,0,0,1,3,3
4,0,0,3,1,0,2,2,0,3,0,0,0,1,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176371,17,17,18,1,0,3,2,1,3,0,0,0,1,0,3
176372,17,17,18,6,0,3,2,0,0,1,0,0,1,3,3
176373,17,17,1,6,0,3,2,0,3,2,0,0,1,3,4
176374,17,17,20,6,0,3,2,0,3,0,0,0,0,0,3


In [11]:
# Target vector, followed by how many rows fall in each class
Y = df_EDA["Grant Rate Qualifier"]
print(Y.value_counts())

Granted/Directed/Continued    103270
Denied/Not Directed            73106
Name: Grant Rate Qualifier, dtype: int64


In [12]:
# Based on the EDA, "Grant Rate Qualifier" is imbalanced, so the classes are
# rebalanced with SMOTEENN.
from imblearn.combine import SMOTEENN

# Fixed seed so the resampled dataset (and everything computed from it) is
# reproducible across kernel restarts; 0 matches the seed used for the
# train/test split.
smtn = SMOTEENN(random_state=0)

In [13]:
# Fit the SMOTEENN sampler on the feature matrix X and target Y.
# NOTE(review): the following cell calls fit_resample(X, Y), which presumably
# repeats this fitting step — confirm whether this separate fit is needed.
smtn.fit(X,Y)

SMOTEENN()

In [14]:
# Resample to rebalance the classes.
# The inputs are rebuilt from df_EDA rather than read from X and Y, so
# re-running this cell does not resample data that was already resampled.
X, Y = smtn.fit_resample(
    df_EDA.drop(columns="Grant Rate Qualifier"),
    df_EDA["Grant Rate Qualifier"],
)
Y.value_counts()

Granted/Directed/Continued    61553
Denied/Not Directed           53418
Name: Grant Rate Qualifier, dtype: int64

In [15]:
# Hold out 20% of the rows as a test set, using a fixed seed.
# NOTE(review): X and Y were already resampled before this split, so the test
# set contains resampled rows — consider splitting first and resampling only
# the training portion.
from sklearn.model_selection import train_test_split

XTrain, XTest, YTrain, YTest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Recursive feature elimination with cross-validation (RFECV): removes one
# feature per step and scores each candidate subset by 5-fold accuracy.
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the selected feature set is the same on every run;
# an unseeded tree can break ties between splits differently each time.
model = DecisionTreeClassifier(random_state=0)
rfecv = RFECV(estimator=model, step=1, cv=5, scoring="accuracy")
rfecv = rfecv.fit(XTrain, YTrain)

print("The optimal number of features:", rfecv.n_features_)
print("Best features:", XTrain.columns[rfecv.support_])

The optimal number of features: 13
Best features: Index(['Year', 'Decision Date Fiscal Year', 'Decision Month', 'Review Type',
       'Decision Type Group', 'Decision Purpose',
       'Elder Assisted Hearing Flag', 'Hearing Facility Region',
       'Hearing Facility PB Office', 'Jurisdiction', 'Sentence Type',
       'Race Group', 'Major Offence Group'],
      dtype='object')
