# Data Load & Initial Dimensionalty Reduction

In [1]:
#import relevant libraries
import numpy as np
import pandas as pd

In [2]:
#load dataset into pandas
data = pd.read_csv("combined_csv.csv")
df = pd.DataFrame(data)

In [3]:
# Get the list of all column names from headers
column_names = list(df.columns.values)
print(column_names)

['Number', 'Year', 'Decision Date Fiscal Year', 'Decision Month', 'Review Type', 'Decision Category', 'Decision Type Group', 'Decision Type', 'Decision Purpose', 'APR Qualifier', 'Final Decision', 'Grant Rate Qualifier', 'Elder Assisted Hearing Flag', 'Hearing Facility Region', 'Hearing Facility PB Office', 'Jurisdiction', 'Sentence Type', 'Gender', 'Race', 'Race Group', 'Major Offence Group']


Columns to be removed: 
1. APR Qualifer due to higher percentage of missing values.
2. Decision type as it and decision type group are highly correlated based on the documentation, deicision type group is the boarder variable of the two
3. Remove Race and race group is highly correlated. Based on the documentation, race group is the boarder variable of the two number is dropped as it shows the record number is not related to the data.
4. Final Decision as it is highly correlated to the dependent variable Grant Rate Qualifier. Based on the documentation, Grant Rate Qualifer is the boarder variable of the two number.
5. Decision Date Fiscal Year as it is the same as Year.

Refer to EDA for more details

In [4]:
df_EDA = df.drop(["Number","APR Qualifier", "Final Decision","Decision Type", "Race","Decision Date Fiscal Year"], axis = 1)
df_EDA.head()

Unnamed: 0,Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Grant Rate Qualifier,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,2000,Dec,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,ONT,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,SCHE 1 W/O SEX
1,2001,Jan,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,ONT,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
2,2000,May,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
3,2000,Jun,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX
4,2000,Aug,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,0.0,PRA,EDMONTON,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX


# Balancing the data

In [5]:
#since of all the variables are non-numeric and to balance the data, it needs to be encoded.
#the variables need to be converted.

In [6]:
# check for non-numeric data types
df_EDA.select_dtypes("object").head()

Unnamed: 0,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Grant Rate Qualifier,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,Dec,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,ONT,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,SCHE 1 W/O SEX
1,Jan,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,ONT,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
2,May,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
3,Jun,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX
4,Aug,PANEL,PRE,FULL PAROLE,REGULAR,Granted/Directed/Continued,PRA,EDMONTON,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX


In [7]:
from sklearn.preprocessing import LabelEncoder
LbEn = LabelEncoder()
df_EDA["Year"] = LbEn.fit_transform(df_EDA["Year"])
df_EDA["Decision Month"] = LbEn.fit_transform(df_EDA["Decision Month"])
df_EDA["Review Type"] = LbEn.fit_transform(df_EDA["Review Type"])
df_EDA["Decision Category"] = LbEn.fit_transform(df_EDA["Decision Category"])
df_EDA["Decision Type Group"] = LbEn.fit_transform(df_EDA["Decision Type Group"])
df_EDA["Decision Purpose"] = LbEn.fit_transform(df_EDA["Decision Purpose"])
df_EDA["Elder Assisted Hearing Flag"] = LbEn.fit_transform(df_EDA["Elder Assisted Hearing Flag"])
df_EDA["Hearing Facility Region"] = LbEn.fit_transform(df_EDA["Hearing Facility Region"])
df_EDA["Hearing Facility PB Office"] = LbEn.fit_transform(df_EDA["Hearing Facility PB Office"])
df_EDA["Jurisdiction"] = LbEn.fit_transform(df_EDA["Jurisdiction"])
df_EDA["Sentence Type"] = LbEn.fit_transform(df_EDA["Sentence Type"])
df_EDA["Gender"] = LbEn.fit_transform(df_EDA["Gender"])
df_EDA["Race Group"] = LbEn.fit_transform(df_EDA["Race Group"])
df_EDA["Major Offence Group"] = LbEn.fit_transform(df_EDA["Major Offence Group"])

In [8]:
#check if types have been converted
df_EDA.dtypes

Year                            int64
Decision Month                  int32
Review Type                     int32
Decision Category               int32
Decision Type Group             int32
Decision Purpose                int32
Grant Rate Qualifier           object
Elder Assisted Hearing Flag     int64
Hearing Facility Region         int32
Hearing Facility PB Office      int32
Jurisdiction                    int32
Sentence Type                   int32
Gender                          int32
Race Group                      int32
Major Offence Group             int32
dtype: object

In [9]:
df_EDA.select_dtypes("object").columns

Index(['Grant Rate Qualifier'], dtype='object')

In [10]:
#base on the EDA, we know the "Grant Rate Qualifer" is unbalanced.
from imblearn.combine import SMOTEENN
smtn = SMOTEENN(random_state = 1)

In [11]:
#isolate dependent variable, which is Grant Rate Qualifer
X = df_EDA.drop(["Grant Rate Qualifier"], axis=1)
X

Unnamed: 0,Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,0,5,1,0,2,2,0,1,1,0,0,1,1,3
1,1,11,1,0,2,2,0,1,1,0,0,1,2,3
2,0,17,1,0,2,2,0,0,1,0,0,1,2,3
3,0,13,1,0,2,2,0,0,1,0,0,1,3,3
4,0,3,1,0,2,2,0,3,0,0,0,1,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176371,17,18,1,0,3,2,1,3,0,0,0,1,0,3
176372,17,18,6,0,3,2,0,0,1,0,0,1,3,3
176373,17,1,6,0,3,2,0,3,2,0,0,1,3,4
176374,17,20,6,0,3,2,0,3,0,0,0,0,0,3


In [12]:
Y = df_EDA["Grant Rate Qualifier"]
print(df_EDA["Grant Rate Qualifier"].value_counts())

Granted/Directed/Continued    103270
Denied/Not Directed            73106
Name: Grant Rate Qualifier, dtype: int64


In [13]:
# Training the model
smtn.fit(X,Y)

SMOTEENN(random_state=1)

In [14]:
# Making samples
X, Y = smtn.fit_resample(X,Y)
Y.value_counts()
X

Unnamed: 0,Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,0,17,1,0,2,2,0,2,1,0,0,1,1,2
1,0,2,1,0,2,2,0,2,1,0,0,1,2,3
2,0,2,1,0,2,2,0,4,1,0,0,1,2,2
3,0,19,1,0,2,2,0,4,1,0,0,1,4,4
4,0,2,1,0,2,2,0,4,1,0,0,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116036,17,15,6,0,3,2,0,3,2,0,0,1,3,4
116037,17,1,6,0,3,2,0,4,1,0,1,1,0,1
116038,17,18,6,0,3,2,0,0,1,0,0,1,3,3
116039,17,20,6,0,3,2,0,3,0,0,0,0,0,3


In [15]:
Y

0                Denied/Not Directed
1                Denied/Not Directed
2                Denied/Not Directed
3                Denied/Not Directed
4                Denied/Not Directed
                     ...            
116036    Granted/Directed/Continued
116037    Granted/Directed/Continued
116038    Granted/Directed/Continued
116039    Granted/Directed/Continued
116040    Granted/Directed/Continued
Name: Grant Rate Qualifier, Length: 116041, dtype: object

# Feature Selection - RFE (wrapper method - backwards elimination)

In [16]:
#Implementing RFE algorithm <- fixed with whole dataset; wrapper - backwards elimination
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
rfecv = RFECV(estimator= model, step = 1, cv = 5, scoring="accuracy")
rfecv = rfecv.fit(X, Y)

print("The optimal number of features:", rfecv.n_features_)
print("Best features:", X.columns[rfecv.support_])

The optimal number of features: 3
Best features: Index(['Review Type', 'Decision Type Group', 'Decision Purpose'], dtype='object')


In [17]:
#encoded results for neural networks & save
y_enc = LbEn.fit_transform(Y)
y_enc = pd.DataFrame(y_enc)
y_enc.columns = ['Grant Rate Qualifier']
df2_enc = pd.DataFrame(X, columns = ['Review Type', 'Decision Type Group', 'Decision Purpose'])
df2_enc = pd.concat([df2_enc, y_enc], axis=1, join="inner")
df2_enc.to_csv('data_optimized_RFE_NN.csv', index=False)

In [18]:
#decode results for X so it maintains balanced dataset.
x_dec = X.copy()
for column in X.columns:
    LbEn.fit(df[column])
    x_dec[column] = LbEn.inverse_transform(X[column])
x_dec

Unnamed: 0,Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,2000,May,PANEL,PRE,FULL PAROLE,REGULAR,0.0,PAC,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,NON SCHEDULE
1,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,PAC,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
2,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,NON SCHEDULE
3,2000,Nov,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,OTHER/UNKNOWN,SCHE 1 WITH SEX
4,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,SCHE 1 W/O SEX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116036,2017,MAY,PAPER,PRE,UTA,REGULAR,0.0,PRA,SASKATOON,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 WITH SEX
116037,2017,AUG,PAPER,PRE,UTA,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,INDETERMINATE,MALE,ABORIGINAL,MURDER 2
116038,2017,NOV,PAPER,PRE,UTA,REGULAR,0.0,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX
116039,2017,OCT,PAPER,PRE,UTA,REGULAR,0.0,PRA,EDMONTON,FEDERAL,DETERMINATE,FEMALE,ABORIGINAL,SCHE 1 W/O SEX


In [19]:
#create new dataframe with optimal features.
df2 = pd.DataFrame(x_dec, columns = ['Review Type', 'Decision Type Group', 'Decision Purpose'])
df2 = pd.concat([df2, Y], axis=1, join="inner")
df2

Unnamed: 0,Review Type,Decision Type Group,Decision Purpose,Grant Rate Qualifier
0,PANEL,FULL PAROLE,REGULAR,Denied/Not Directed
1,PANEL,FULL PAROLE,REGULAR,Denied/Not Directed
2,PANEL,FULL PAROLE,REGULAR,Denied/Not Directed
3,PANEL,FULL PAROLE,REGULAR,Denied/Not Directed
4,PANEL,FULL PAROLE,REGULAR,Denied/Not Directed
...,...,...,...,...
116036,PAPER,UTA,REGULAR,Granted/Directed/Continued
116037,PAPER,UTA,REGULAR,Granted/Directed/Continued
116038,PAPER,UTA,REGULAR,Granted/Directed/Continued
116039,PAPER,UTA,REGULAR,Granted/Directed/Continued


In [20]:
#save csv version of dataframe.
df2.to_csv('data_optimized_RFE.csv', index=False)

# Feature Selection (embedded method - lasso)

In [21]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import StratifiedKFold

In [22]:
#Lasso requires encoded Y
Y_2= LbEn.fit_transform(Y)

In [23]:
#Lasso + Stratified Cross Validation
skf = StratifiedKFold(n_splits=10)
lasso = LassoCV(cv=skf, random_state=42).fit(X, Y_2)
print('Selected Features:', list(X.columns[np.where(lasso.coef_!=0)[0]]))

Selected Features: ['Year', 'Decision Month', 'Review Type', 'Decision Type Group', 'Decision Purpose', 'Elder Assisted Hearing Flag', 'Hearing Facility Region', 'Jurisdiction', 'Sentence Type', 'Gender', 'Race Group', 'Major Offence Group']


In [24]:
#save results as a list
var = list(X.columns[np.where(lasso.coef_!=0)[0]])
var

['Year',
 'Decision Month',
 'Review Type',
 'Decision Type Group',
 'Decision Purpose',
 'Elder Assisted Hearing Flag',
 'Hearing Facility Region',
 'Jurisdiction',
 'Sentence Type',
 'Gender',
 'Race Group',
 'Major Offence Group']

In [25]:
#encoded results for neural networks & save
df3_enc = pd.DataFrame(X, columns = var)
df3_enc = pd.concat([df3_enc, y_enc], axis=1, join="inner")
df3_enc.to_csv('data_optimized_lasso_NN.csv', index=False)

In [26]:
#decode results for X so it maintains balanced dataset.
x_dec2 = X.copy()
for column in X.columns:
    LbEn.fit(df[column])
    x_dec2[column] = LbEn.inverse_transform(X[column])
x_dec2

Unnamed: 0,Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,2000,May,PANEL,PRE,FULL PAROLE,REGULAR,0.0,PAC,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,NON SCHEDULE
1,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,PAC,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
2,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,NON SCHEDULE
3,2000,Nov,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,OTHER/UNKNOWN,SCHE 1 WITH SEX
4,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,SCHE 1 W/O SEX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116036,2017,MAY,PAPER,PRE,UTA,REGULAR,0.0,PRA,SASKATOON,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 WITH SEX
116037,2017,AUG,PAPER,PRE,UTA,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,INDETERMINATE,MALE,ABORIGINAL,MURDER 2
116038,2017,NOV,PAPER,PRE,UTA,REGULAR,0.0,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX
116039,2017,OCT,PAPER,PRE,UTA,REGULAR,0.0,PRA,EDMONTON,FEDERAL,DETERMINATE,FEMALE,ABORIGINAL,SCHE 1 W/O SEX


In [27]:
#create new dataframe with optimal features. Dropping Decision Category
df2 = pd.DataFrame(x_dec2, columns = var)
df2 = pd.concat([df2, Y], axis=1, join="inner")
df2

Unnamed: 0,Year,Decision Month,Review Type,Decision Type Group,Decision Purpose,Elder Assisted Hearing Flag,Hearing Facility Region,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group,Grant Rate Qualifier
0,2000,May,PANEL,FULL PAROLE,REGULAR,0.0,PAC,FEDERAL,DETERMINATE,MALE,ASIAN,NON SCHEDULE,Denied/Not Directed
1,2000,Apr,PANEL,FULL PAROLE,REGULAR,0.0,PAC,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX,Denied/Not Directed
2,2000,Apr,PANEL,FULL PAROLE,REGULAR,0.0,QUE,FEDERAL,DETERMINATE,MALE,BLACK,NON SCHEDULE,Denied/Not Directed
3,2000,Nov,PANEL,FULL PAROLE,REGULAR,0.0,QUE,FEDERAL,DETERMINATE,MALE,OTHER/UNKNOWN,SCHE 1 WITH SEX,Denied/Not Directed
4,2000,Apr,PANEL,FULL PAROLE,REGULAR,0.0,QUE,FEDERAL,DETERMINATE,MALE,ASIAN,SCHE 1 W/O SEX,Denied/Not Directed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116036,2017,MAY,PAPER,UTA,REGULAR,0.0,PRA,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 WITH SEX,Granted/Directed/Continued
116037,2017,AUG,PAPER,UTA,REGULAR,0.0,QUE,FEDERAL,INDETERMINATE,MALE,ABORIGINAL,MURDER 2,Granted/Directed/Continued
116038,2017,NOV,PAPER,UTA,REGULAR,0.0,ATL,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX,Granted/Directed/Continued
116039,2017,OCT,PAPER,UTA,REGULAR,0.0,PRA,FEDERAL,DETERMINATE,FEMALE,ABORIGINAL,SCHE 1 W/O SEX,Granted/Directed/Continued


In [28]:
#save csv version of dataframe.
df2.to_csv('data_optimized_lasso.csv', index=False)

# Feature Selection (filter method - chi-square)

In [29]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [30]:
chi2_features = SelectKBest(chi2)
X_kbest_features = chi2_features.fit_transform(X, Y)

In [31]:
print('Original feature number:', X.shape[1])
print('Reduced feature number:', X_kbest_features.shape[1])

Original feature number: 14
Reduced feature number: 10


In [32]:
#get feature names
mask = chi2_features.get_support(indices=True)
new_features = [] # The list of your K best features

for bool_val, feature in zip(mask, X):
    if bool_val:
        new_features.append(feature)

In [33]:
#encoded results for neural networks & save
df4_enc = pd.DataFrame(X, columns = new_features)
df4_enc = pd.concat([df4_enc, y_enc], axis=1, join="inner")
df4_enc.to_csv('data_optimized_chi_NN.csv', index=False)

In [34]:
#decode results for X so it maintains balanced dataset.
x_dec3 = X.copy()
for column in X.columns:
    LbEn.fit(df[column])
    x_dec3[column] = LbEn.inverse_transform(X[column])
x_dec3

Unnamed: 0,Year,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Sentence Type,Gender,Race Group,Major Offence Group
0,2000,May,PANEL,PRE,FULL PAROLE,REGULAR,0.0,PAC,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,NON SCHEDULE
1,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,PAC,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,SCHE 1 W/O SEX
2,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,BLACK,NON SCHEDULE
3,2000,Nov,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,OTHER/UNKNOWN,SCHE 1 WITH SEX
4,2000,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,ASIAN,SCHE 1 W/O SEX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116036,2017,MAY,PAPER,PRE,UTA,REGULAR,0.0,PRA,SASKATOON,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 WITH SEX
116037,2017,AUG,PAPER,PRE,UTA,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,INDETERMINATE,MALE,ABORIGINAL,MURDER 2
116038,2017,NOV,PAPER,PRE,UTA,REGULAR,0.0,ATL,NOT APPLICABLE,FEDERAL,DETERMINATE,MALE,CAUCASIAN,SCHE 1 W/O SEX
116039,2017,OCT,PAPER,PRE,UTA,REGULAR,0.0,PRA,EDMONTON,FEDERAL,DETERMINATE,FEMALE,ABORIGINAL,SCHE 1 W/O SEX


In [35]:
#create new dataframe with optimal features.
df3 = pd.DataFrame(x_dec3, columns = new_features)
df3 = pd.concat([df3, Y], axis=1, join="inner")
df3

Unnamed: 0,Decision Month,Review Type,Decision Category,Decision Type Group,Decision Purpose,Elder Assisted Hearing Flag,Hearing Facility Region,Hearing Facility PB Office,Jurisdiction,Grant Rate Qualifier
0,May,PANEL,PRE,FULL PAROLE,REGULAR,0.0,PAC,NOT APPLICABLE,FEDERAL,Denied/Not Directed
1,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,PAC,NOT APPLICABLE,FEDERAL,Denied/Not Directed
2,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,Denied/Not Directed
3,Nov,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,Denied/Not Directed
4,Apr,PANEL,PRE,FULL PAROLE,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,Denied/Not Directed
...,...,...,...,...,...,...,...,...,...,...
116036,MAY,PAPER,PRE,UTA,REGULAR,0.0,PRA,SASKATOON,FEDERAL,Granted/Directed/Continued
116037,AUG,PAPER,PRE,UTA,REGULAR,0.0,QUE,NOT APPLICABLE,FEDERAL,Granted/Directed/Continued
116038,NOV,PAPER,PRE,UTA,REGULAR,0.0,ATL,NOT APPLICABLE,FEDERAL,Granted/Directed/Continued
116039,OCT,PAPER,PRE,UTA,REGULAR,0.0,PRA,EDMONTON,FEDERAL,Granted/Directed/Continued


In [36]:
#save csv version of dataframe.
df3.to_csv('data_optimized_chi.csv', index=False)