In [60]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
#import warnings
#warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

For this lab, you will be using the .CSV files provided in the files_for_lab folder. These are cleaned versions of the learningSet data from the Case Study 'Healthcare for All'.
Begin a new Jupyter Notebook after Forking and Cloning this Repo.

DATAFRAME

In [95]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')
data = pd.concat([numerical, categorical, targets], axis = 1)
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

Apply the Random Forests algorithm but this time only by upscaling the data to deal with the imbalance.

In [96]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1) 

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(object)

# we OneHotEncode the categoricals so we can use the same dataset to perform a regression later (in the lab).
# it is not needed for a DecisionTree or RandomForest model
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out()) # needed to avoid error
X = pd.concat([numericalX, encoded_categorical], axis = 1)

# Note: we need to do train/test split before downsampling, and then only downsample the training set - Why?
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [97]:
from sklearn.utils import resample

In [98]:
# for upsampling we need to temporarily concat X_train and y_train
trainset = pd.concat([X_train, y_train], axis=1)
# quicker way to upsample category 1:

category_0 = trainset[trainset['TARGET_B'] == 0]
category_1 = trainset[trainset['TARGET_B'] == 1]
category_1 = resample(category_1,replace=True, n_samples = len(category_0))
print(category_0.shape)
print(category_1.shape)

trainset_new = pd.concat([category_0, category_1], axis = 0)
trainset_new = trainset_new.sample(frac =1) #randomize the rows
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']
# #data = data.reset_index(drop=True)
print(X_train.shape)

(72486, 356)
(72486, 356)
(144972, 355)


In [99]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

In [100]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf.fit(X_train, y_train)
all_feat_train_score = clf.score(X_train, y_train)
print(all_feat_train_score)
all_feat_test_score = clf.score(X_test, y_test)
print(all_feat_test_score)

y_pred = clf.predict(X_test)
display(y_test.value_counts())
all_confusion_matrix = confusion_matrix(y_test, y_pred)
display(all_confusion_matrix)

0.6234307314515907
0.5979143740502018


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10853,  7230],
       [  443,   557]])

Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

FIRST! ENCONDE AND MIN-MAX

In [None]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(object)

#ONE ENCODE CATEGORICALS
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out()) # needed to avoid error
X = pd.concat([numericalX, encoded_categorical], axis = 1)

#MIN-MAX NUMERICAL NORMALIZED WITH ENCODED_CATEGORICAL, NOW ARE DATAFRAME IT'S READY TO TAKE A RFE.
from sklearn.preprocessing import MinMaxScaler
transformer= MinMaxScaler().fit(numericalX)
numericalX = pd.DataFrame(transformer.transform(numericalX), columns=numericalX.columns)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

#TO MAKE IT 100% RIGHT IT SHOULD HAVE BEEN SPLIT INTO TRAIN AND TEST, FIT THE MODEL WITH TRAIN AND THEN TEST, BUT THAT WAS DONE YESTERDAY, JUST TRYING NEW FEATURE SELECTION WITHOUT GOING CRAZY


Recrusive Feature Elimination

In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
X.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_D,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,STATE_other,HOMEOWNR_U,GENDER_M,GENDER_other,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.608247,0.666667,1.0,0.0,0.0,0.393939,0.343434,0.181818,0.10101,0.020202,0.011494,0.555556,0.010051,0.011108,0.009378,0.0,0.353535,0.656566,0.474747,0.535354,0.929293,0.010101,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113402,0.0,0.0,0.0,0.464286,0.571429,0.607143,0.47619,0.595238,0.642857,0.333333,0.313131,0.424242,0.272727,0.111111,0.141414,0.181818,0.171717,0.131313,0.111111,0.151515,0.121212,0.111111,0.343434,0.252525,0.181818,0.262626,0.10101,0.232323,0.181818,0.333333,0.494949,0.282828,0.121212,0.040404,0.616162,0.070707,0.164384,0.191919,0.304615,0.394286,0.979798,0.959596,0.020202,0.020202,0.0,0.0,0.070707,0.070707,0.0,0.079833,0.105833,0.230769,0.153846,0.868687,0.141414,0.969697,0.040404,0.070707,0.383838,0.808081,0.707071,0.323232,0.848485,0.161616,0.060606,0.04,0.050505,0.090909,0.151515,0.030303,0.226667,0.505051,0.252525,0.0,0.0,0.0,0.020202,0.070707,0.131313,0.272727,0.474747,0.0,0.010101,0.616162,0.682353,0.677778,0.245902,0.1,0.020202,0.0,0.0,0.141414,0.010101,0.0,0.0,0.020202,0.050505,0.171717,0.737374,0.0,0.271889,0.77412,0.204667,0.212,0.232667,0.252,0.073818,0.131313,0.232323,0.232323,0.232323,0.151515,0.010101,0.0,0.0,0.010101,0.040404,0.252525,0.242424,0.262626,0.171717,0.020202,0.0,0.0,0.020202,0.282828,0.040404,0.515152,0.010101,0.464646,0.545455,0.030303,0.888889,0.080808,0.0,0.0,0.0,0.0,0.0,0.0,0.040404,0.010101,0.131313,0.155556,0.210526,0.020202,0.454545,0.565657,0.646465,0.505051,0.646465,0.444444,0.626263,0.535354,1.0,0.0,0.0,0.090909,0.030303,0.080808,0.131313,0.090909,0.0,0.054545,0.090909,0.030303,0.151515,0.191919,0.050505,0.040404,0.030303,0.0,0.030303,0.414141,0.010101,0.0,0.070707,0.131313,0.060606,0.050505,0.0,0.059701,0.090909,0.040404,0.010101,0.030303,0.10101,0.020202,0.010101,0.070707,0.787879,0.020202,0.0,0.705882,0.161616,0.10101,0.393939,0.212121,0.216216,0.040404,0.030303,0.051546,0.20202,0.1,0.263889,0.040404,0.0,0.0,0.0,0.181818,0.393939,0.0,0.343434,0.232323,0.181818,0.161616,0.012048,0.040404,0.0,0.25,0.0,0.0,0.050505,0.018182,0.0,0.0,0.0,0.0,0.0,0.074074,0.0,0.030303,0.747475,0.888889,0.080808,0.0,0.040404,0.969697,0.777778,0.191919,0.419355,0.596154,0.050505,0.141414,0.141414,0.313131,0.545455,0.464646,0.0,0.0,0.909091,0.0,0.10101,0.0,0.0,0.0,0.333333,0.656566,0.40404,1.0,1.0,0.285714,0.4,0.10101,0.070707,0.433333,0.366492,0.315789,0.168831,0.023965,0.127119,0.341463,0.005,0.001401,0.01,0.003676,0.006465,0.498045,0.0,1.0,0.622951,0.673077,1.0,0.333333,0.428571,0.0,0.381443,1.0,0.772727,0.636364,0.863636,0.090909,0.0,1.0,0.927083,0.909091,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.4e-05,0.463918,0.833333,1.0,0.06639,0.0,0.151515,0.555556,0.111111,0.060606,0.020202,0.011494,1.0,0.036585,0.039552,0.02819,1.0,0.0,0.0,0.505051,0.505051,0.676768,0.0,0.0,0.313131,0.060606,0.181818,0.027778,0.060606,0.059701,0.304348,0.0,0.0,0.020619,0.0,0.012346,0.046512,0.404762,0.488095,0.511905,0.380952,0.5,0.535714,0.426667,0.333333,0.464646,0.212121,0.131313,0.141414,0.333333,0.232323,0.10101,0.040404,0.020202,0.111111,0.161616,0.363636,0.222222,0.151515,0.121212,0.010101,0.050505,0.040404,0.212121,0.757576,0.555556,0.232323,0.090909,0.69697,0.040404,0.041096,0.242424,0.487692,0.514286,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.911333,0.869667,0.923077,0.769231,0.969697,0.040404,0.979798,0.030303,0.090909,0.59596,0.949495,0.888889,0.555556,0.959596,0.050505,0.040404,0.02,0.030303,0.050505,0.040404,0.020202,0.24,0.444444,0.050505,0.0,0.0,0.0,0.979798,0.989899,0.989899,0.989899,1.0,0.949495,0.0,0.838384,0.894118,0.811111,0.344262,0.125,0.0,0.0,0.0,0.040404,0.0,0.0,0.0,0.919192,0.919192,0.919192,0.949495,0.478632,0.019969,0.911464,0.725333,0.730667,0.684,0.691333,0.207279,0.020202,0.060606,0.020202,0.050505,0.151515,0.141414,0.26,0.163934,0.333333,0.020202,0.050505,0.020202,0.050505,0.151515,0.141414,0.28,0.10101,0.323232,0.060606,0.020202,0.666667,0.030303,0.565657,0.444444,0.090909,0.808081,0.141414,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.020202,0.266667,0.421053,0.121212,0.717172,0.707071,0.838384,0.585859,0.818182,0.575758,0.646465,0.575758,1.0,1.0,0.0,0.222222,0.242424,0.040404,0.212121,0.131313,0.046512,0.018182,0.060606,0.0,0.040404,0.010101,0.0,0.030303,0.010101,0.0,0.060606,0.131313,0.010101,0.03125,0.080808,0.181818,0.111111,0.040404,0.030303,0.059701,0.10101,0.070707,0.111111,0.010101,0.060606,0.020202,0.010101,0.161616,0.69697,0.050505,0.020202,0.941176,0.050505,0.050505,0.121212,0.212121,0.189189,0.30303,0.20202,0.14433,0.242424,0.133333,0.333333,0.10101,0.0,0.0,0.0,0.080808,0.151515,0.0,0.555556,0.10101,0.111111,0.0,0.0,0.020202,0.0,0.032609,0.021277,0.071429,0.020202,0.054545,0.014706,0.010101,0.0,0.057692,0.0,0.0,0.0,0.424242,0.393939,0.505051,0.070707,0.272727,0.161616,1.0,0.929293,0.535354,0.16129,0.192308,0.020202,0.262626,0.565657,0.979798,1.0,0.0,0.0,0.0,0.969697,0.0,0.040404,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.952381,0.8,0.060606,0.050505,0.183333,0.146597,0.315789,0.155844,0.00359,0.008475,0.02439,0.01,0.004004,0.025,0.016544,0.014399,0.77451,0.0,0.333333,0.0,0.25,1.0,0.0,0.785714,0.0,0.536082,0.090909,0.818182,0.818182,0.909091,1.0,0.0,1.0,0.96875,0.818182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.4e-05,0.624862,0.333333,0.111111,0.008299,0.0,0.20202,0.292929,0.333333,0.060606,0.080808,0.011494,0.111111,0.070931,0.085837,0.075389,0.0,0.020202,0.989899,0.494949,0.515152,0.969697,0.020202,0.0,0.0,0.020202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020619,0.0,0.0,0.0,0.416667,0.511905,0.547619,0.440476,0.535714,0.583333,0.306667,0.353535,0.40404,0.252525,0.131313,0.20202,0.191919,0.161616,0.131313,0.10101,0.080808,0.151515,0.141414,0.30303,0.222222,0.191919,0.252525,0.10101,0.232323,0.212121,0.353535,0.444444,0.222222,0.060606,0.020202,0.636364,0.090909,0.123288,0.191919,0.281538,0.362857,0.69697,0.69697,0.010101,0.060606,0.050505,0.030303,0.030303,0.030303,0.0,0.082833,0.091,0.153846,0.076923,0.787879,0.222222,0.939394,0.070707,0.181818,0.363636,0.767677,0.656566,0.30303,0.868687,0.141414,0.070707,0.04,0.050505,0.111111,0.171717,0.030303,0.226667,0.606061,0.181818,0.0,0.010101,0.0,0.0,0.010101,0.060606,0.181818,0.505051,0.0,0.040404,0.363636,0.576471,0.566667,0.229508,0.125,0.040404,0.020202,0.242424,0.111111,0.020202,0.030303,0.060606,0.0,0.020202,0.090909,0.444444,0.0,0.431644,0.587968,0.167333,0.194667,0.194667,0.226667,0.066329,0.323232,0.181818,0.20202,0.151515,0.121212,0.020202,0.0,0.0,0.010101,0.20202,0.191919,0.242424,0.181818,0.161616,0.020202,0.0,0.0,0.010101,0.282828,0.080808,0.313131,0.111111,0.383838,0.626263,0.080808,0.747475,0.222222,0.0,0.0,0.0,0.0,0.0,0.020202,0.020202,0.010101,0.212121,0.211111,0.315789,0.060606,0.616162,0.656566,0.737374,0.59596,0.707071,0.565657,0.787879,0.626263,0.828283,1.0,0.040404,0.10101,0.050505,0.020202,0.060606,0.121212,0.0,0.018182,0.090909,0.050505,0.181818,0.20202,0.050505,0.070707,0.060606,0.0,0.111111,0.333333,0.040404,0.046875,0.020202,0.121212,0.030303,0.030303,0.020202,0.0,0.070707,0.080808,0.030303,0.030303,0.060606,0.070707,0.010101,0.080808,0.747475,0.030303,0.010101,0.705882,0.222222,0.20202,0.282828,0.161616,0.162162,0.050505,0.030303,0.010309,0.232323,0.033333,0.222222,0.060606,0.0,0.0,0.0,0.10101,0.212121,0.0,0.282828,0.232323,0.323232,0.080808,0.012048,0.141414,0.032258,0.054348,0.0,0.0,0.070707,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.020202,0.848485,0.969697,0.030303,0.0,0.0,0.929293,0.656566,0.292929,0.290323,0.423077,0.030303,0.121212,0.232323,0.505051,0.69697,0.313131,0.0,0.0,0.0,0.060606,0.353535,0.444444,0.0,0.151515,0.222222,0.777778,0.171717,0.979798,0.929293,0.428571,0.4,0.060606,0.050505,0.416667,0.308901,0.315789,0.168831,0.019954,0.110169,0.341463,0.002,0.002202,0.005,0.011029,0.006204,0.078617,1.0,1.0,0.967213,0.807692,1.0,0.333333,0.5,0.0,0.0,0.090909,0.727273,0.909091,0.772727,0.545455,0.0,1.0,0.9375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.71134,0.0,0.444444,0.008299,0.0,0.232323,0.141414,0.313131,0.030303,0.0,0.034483,0.0,0.006484,0.006732,0.006186,0.0,0.080808,0.929293,0.545455,0.464646,0.616162,0.0,0.0,0.111111,0.323232,0.272727,0.027778,0.0,0.0,0.0,0.0,0.0,0.319588,0.0,0.0,0.011628,0.380952,0.47619,0.52381,0.404762,0.511905,0.559524,0.333333,0.454545,0.353535,0.20202,0.151515,0.252525,0.171717,0.171717,0.121212,0.070707,0.070707,0.20202,0.171717,0.30303,0.141414,0.191919,0.252525,0.111111,0.232323,0.232323,0.272727,0.505051,0.30303,0.151515,0.080808,0.636364,0.090909,0.082192,0.232323,0.306154,0.404286,0.858586,0.838384,0.030303,0.040404,0.010101,0.0,0.020202,0.0,0.020202,0.166667,0.2105,0.153846,0.076923,0.484848,0.525253,0.939394,0.070707,0.060606,0.363636,0.737374,0.616162,0.30303,0.848485,0.161616,0.060606,0.06,0.030303,0.212121,0.121212,0.040404,0.173333,0.363636,0.131313,0.0,0.0,0.0,0.10101,0.252525,0.505051,0.69697,0.929293,0.10101,0.151515,0.424242,0.647059,0.555556,0.245902,0.125,0.040404,0.0,0.090909,0.424242,0.040404,0.0,0.050505,0.010101,0.080808,0.171717,0.343434,0.997863,0.102919,0.978434,0.257333,0.258667,0.264,0.282,0.086693,0.272727,0.121212,0.040404,0.262626,0.222222,0.050505,0.0,0.0,0.040404,0.353535,0.050505,0.060606,0.121212,0.30303,0.060606,0.0,0.0,0.050505,0.222222,0.141414,0.262626,0.20202,0.464646,0.545455,0.030303,0.585859,0.363636,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.0,0.171717,0.144444,0.197368,0.0,0.434343,0.69697,0.818182,0.535354,0.686869,0.454545,0.333333,0.313131,0.0,1.0,0.232323,0.171717,0.030303,0.0,0.060606,0.060606,0.0,0.0,0.131313,0.424242,0.121212,0.0,0.0,0.0,0.424242,0.0,0.060606,0.030303,0.0,0.0,0.0,0.232323,0.030303,0.030303,0.060606,0.0,0.030303,0.030303,0.030303,0.030303,0.030303,0.0,0.030303,0.060606,0.878788,0.0,0.0,0.705882,0.282828,0.121212,0.141414,0.272727,0.27027,0.030303,0.050505,0.0,0.191919,0.033333,0.236111,0.0,0.0,0.0,0.0,0.131313,0.232323,0.0,0.141414,0.40404,0.313131,0.161616,0.0,0.010101,0.0,0.141304,0.0,0.0,0.040404,0.0,0.0,0.0,0.069767,0.0,0.0,0.0,0.0,0.292929,0.676768,0.565657,0.414141,0.030303,0.0,0.949495,0.434343,0.272727,0.129032,0.730769,0.0,0.10101,0.191919,0.393939,0.454545,0.555556,0.0,0.0,0.454545,0.222222,0.171717,0.0,0.0,0.161616,0.232323,0.777778,0.222222,0.939394,0.89899,0.761905,0.4,0.060606,0.060606,0.433333,0.324607,0.315789,0.168831,0.010135,0.063559,0.170732,0.002,0.001201,0.01,0.008272,0.005534,0.899764,1.0,1.0,0.655738,0.826923,1.0,0.333333,0.285714,0.0,0.28866,0.0,0.545455,0.909091,0.863636,0.909091,0.0,1.0,0.90625,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.793814,0.333333,0.222222,0.248963,0.010101,0.282828,0.090909,0.535354,0.262626,0.030303,0.022989,1.0,0.025532,0.026382,0.021495,1.0,0.0,0.0,0.464646,0.545455,0.020202,0.989899,0.0,0.0,0.010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.392857,0.535714,0.595238,0.428571,0.547619,0.595238,0.36,0.343434,0.434343,0.232323,0.141414,0.212121,0.131313,0.151515,0.20202,0.121212,0.050505,0.131313,0.151515,0.343434,0.191919,0.191919,0.313131,0.070707,0.272727,0.161616,0.262626,0.575758,0.363636,0.242424,0.141414,0.424242,0.171717,0.123288,0.333333,0.361538,0.461429,1.0,0.989899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096,0.099,0.307692,0.230769,0.909091,0.10101,0.979798,0.030303,0.0,0.424242,0.828283,0.494949,0.222222,0.929293,0.080808,0.20202,0.06,0.171717,0.090909,0.232323,0.010101,0.013333,0.010101,0.0,0.381818,0.585859,0.191919,0.0,0.010101,0.020202,0.161616,0.676768,0.0,0.020202,0.454545,0.611765,0.588889,0.262295,0.15,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.252525,0.585859,0.747475,0.838384,0.534188,0.195084,0.599319,0.16,0.166667,0.195333,0.214,0.056359,0.242424,0.292929,0.232323,0.131313,0.040404,0.040404,0.0,0.0,0.020202,0.212121,0.30303,0.222222,0.161616,0.040404,0.050505,0.0,0.0,0.030303,0.353535,0.080808,0.111111,0.141414,0.20202,0.808081,0.040404,0.737374,0.222222,0.010101,0.010101,0.0,0.0,0.0,0.030303,0.010101,0.020202,0.010101,0.266667,0.355263,0.030303,0.767677,0.616162,0.737374,0.515152,0.656566,0.494949,0.808081,0.313131,0.818182,1.0,0.10101,0.171717,0.080808,0.020202,0.060606,0.151515,0.069767,0.127273,0.222222,0.020202,0.090909,0.0,0.070707,0.020202,0.020202,0.0,0.060606,0.010101,0.050505,0.03125,0.020202,0.121212,0.020202,0.070707,0.060606,0.059701,0.151515,0.292929,0.040404,0.030303,0.262626,0.030303,0.020202,0.070707,0.494949,0.121212,0.010101,0.705882,0.161616,0.20202,0.30303,0.131313,0.081081,0.121212,0.050505,0.020619,0.262626,0.033333,0.277778,0.070707,0.010309,0.010101,0.012821,0.151515,0.282828,0.133333,0.090909,0.161616,0.535354,0.20202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010101,0.656566,1.0,0.0,0.0,0.0,0.909091,0.454545,0.181818,0.806452,0.653846,0.0,0.010101,0.030303,0.060606,0.333333,0.676768,0.0,0.0,0.090909,0.141414,0.727273,0.030303,0.0,0.0,1.0,0.010101,0.212121,1.0,0.969697,0.285714,0.4,0.070707,0.111111,0.7,0.570681,0.526316,0.311688,0.025443,0.152542,0.195122,0.003,0.002002,0.015,0.012868,0.005586,0.037079,1.0,0.333333,0.409836,0.288462,1.0,0.333333,0.214286,0.0,0.206186,0.0,0.818182,0.818182,0.954545,0.0,0.5,0.0,0.822917,0.181818,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: TARGET_B, dtype: int64

In [None]:
num_features_to_select = 10
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=num_features_to_select)

In [None]:
#rfe.fit(X, y)
#22 MINUTES 11s IN A MACBOOK AIR 8GB RAM, DON'T RUN IT

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
feature_ranking = rfe.support_
feature_ranking 

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [None]:
selected_columns = X.columns[feature_ranking]
selected_features_X = X[selected_columns]
selected_features_X

Unnamed: 0,CHILC2,HHD9,LASTGIFT,AVGGIFT,RFA_2F,ODATEW_YR,ODATEW_MM,MINRDATE_YR,TARGET_D,RFA_2A_G
0,0.111111,0.050505,0.010,0.006465,1.000000,0.428571,0.0,0.772727,0.00,0.0
1,0.161616,0.030303,0.025,0.014399,0.333333,0.785714,0.0,0.818182,0.00,1.0
2,0.141414,0.050505,0.005,0.006204,1.000000,0.500000,0.0,0.727273,0.00,0.0
3,0.171717,0.030303,0.010,0.005534,1.000000,0.285714,0.0,0.545455,0.00,0.0
4,0.151515,0.171717,0.015,0.005586,0.333333,0.214286,0.0,0.818182,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...
95407,0.181818,0.101010,0.025,0.023745,0.000000,0.928571,0.0,0.954545,0.00,1.0
95408,0.161616,0.010101,0.020,0.018738,0.000000,0.928571,0.0,0.954545,0.00,0.0
95409,0.171717,0.141414,0.010,0.007009,0.666667,0.857143,0.0,0.954545,0.00,0.0
95410,0.161616,0.060606,0.018,0.010875,1.000000,0.214286,0.0,0.681818,0.09,0.0


Re-run the Random Forest algorithm to determine if the Feature Selection has improved the results.

In [101]:
y1 = data['TARGET_B']
X1 = selected_features_X

In [102]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=0)

In [103]:
# for upsampling we need to temporarily concat X_train and y_train
trainset1 = pd.concat([X1_train, y1_train], axis=1)
# quicker way to upsample category 1:

category_0_1 = trainset1[trainset1['TARGET_B'] == 0]
category_1_1 = trainset1[trainset1['TARGET_B'] == 1]
category_1_1 = resample(category_1_1,replace=True, n_samples = len(category_0_1))
print(category_0_1.shape)
print(category_1_1.shape)

trainset_new_1 = pd.concat([category_0_1, category_1_1], axis = 0)
trainset_new_1 = trainset_new_1.sample(frac =1) #randomize the rows
X1_train = trainset_new_1.drop(['TARGET_B'], axis=1)
y1_train = trainset_new_1['TARGET_B']
# #data = data.reset_index(drop=True)
print(X1_train.shape)

(72486, 11)
(72486, 11)
(144972, 10)


In [104]:
X1_train = pd.DataFrame(X1_train)
X1_test = pd.DataFrame(X1_test)

y1_train_regression = X1_train['TARGET_D']
y1_test_regression = X1_test['TARGET_D']

# Now we can remove the column target d from the set of features
X1_train = X1_train.drop(['TARGET_D'], axis = 1)
X1_test = X1_test.drop(['TARGET_D'], axis = 1)

In [110]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf1 = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf1.fit(X1_train, y1_train)
selected_train_score = clf1.score(X1_train, y1_train)
print(selected_train_score)
selected_test_score = clf1.score(X1_test, y1_test)
print(selected_test_score)

y1_pred = clf1.predict(X1_test)
selected_confusion_matrix = confusion_matrix(y1_test, y1_pred)
display(y1_test.value_counts())
display(selected_confusion_matrix)

0.5927489446237895
0.6168841377141959


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[11249,  6834],
       [  477,   523]])

DISCUSSION COMPARING RESULTS: UPSAMPLING BOTH, MAKING AN RFE ON THE SECOND ONE

In [107]:
all_feat_train_score = clf.score(X_train, y_train)
print(all_feat_train_score)
all_feat_test_score = clf.score(X_test, y_test)
print(all_feat_test_score)

display(y_test.value_counts())
all_confusion_matrix = confusion_matrix(y_test, y_pred)
display(all_confusion_matrix)

0.6234307314515907
0.5979143740502018


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10853,  7230],
       [  443,   557]])

In [111]:
selected_train_score = clf1.score(X1_train, y1_train)
print(selected_train_score)
selected_test_score = clf1.score(X1_test, y1_test)
print(selected_test_score)

selected_confusion_matrix = confusion_matrix(y1_test, y1_pred)
display(y1_test.value_counts())
display(selected_confusion_matrix)

0.5927489446237895
0.6168841377141959


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[11249,  6834],
       [  477,   523]])

We get a similar explanatory power (0.6), in the selected feautures we are better in predicting who had a 1 (who donated), and has a lower error when predicting who hasn't donated. With all the columns, we have less errors but we get also a lower number when predicting donors (1/1)