In [1]:
#libreries

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold


In [2]:
#reading the data

numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

display(numerical.head(5))
display(categorical.head(5))
display(target.head(5))

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3


Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


In [3]:
#concat all the data in one

donors = pd.concat([numerical, categorical, target], axis=1)
donors.head(5)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0


In [4]:
donors.isna().sum().sum() #we don't have nulls

0

- Apply the Random Forests algorithm but this time only by upscaling the data to deal with the imbalance.

In [5]:
X = donors.drop(columns=['TARGET_B','TARGET_D'])
y = donors['TARGET_B']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
#Check for Imbalance

print("Imbalance Check:")
print(y_train.value_counts())

Imbalance Check:
0    72469
1     3860
Name: TARGET_B, dtype: int64


In [8]:
train_data = pd.concat([X_train, y_train], axis=1)

In [9]:
class_0 = train_data[train_data['TARGET_B'] == 0]
class_1 = train_data[train_data['TARGET_B'] == 1]

In [10]:
#Now to deal with the imbalance, we are going to Oversampling the train data just in the ones that have donated. 

class_1_upsampled = resample(class_1, 
                                   replace=True,    
                                   n_samples=len(class_0))

In [11]:
upsampled_data = pd.concat([class_0, class_1_upsampled])
upsampled_data = upsampled_data.sample(frac =1)

In [12]:
upsampled_data['TARGET_B'].value_counts() #Looks better!

1    72469
0    72469
Name: TARGET_B, dtype: int64

In [13]:
upsampled_data.shape

(144938, 338)

In [14]:
X_train_upsampled = upsampled_data.drop('TARGET_B', axis=1)
y_train_upsampled = upsampled_data['TARGET_B']

In [15]:
# Checking the number of rows

print(X_train_upsampled.shape)
print(X_test.shape)
print(y_train_upsampled.shape)
print(y_test.shape)

(144938, 337)
(19083, 337)
(144938,)
(19083,)


In [16]:
#Scaling the data

In [17]:
X_train_num = X_train_upsampled.select_dtypes(include = np.number)
X_test_num  = X_test.select_dtypes(include = np.number)
X_train_cat = X_train_upsampled.select_dtypes(include = object)
X_test_cat  = X_test.select_dtypes(include = object)

In [18]:
# Before to try again the model, we will use MinMaxScaler on X train num

transformer = MinMaxScaler().fit(X_train_num) 
X_train_scaled_arr = transformer.transform(X_train_num)
X_train_scaled = pd.DataFrame(X_train_scaled_arr, columns=X_train_num.columns)
#X_train_scaled.head()
X_train_scaled.shape

(144938, 330)

In [19]:
# Before to try again the model, we will use MinMaxScaler on X test num

X_test_scaled_arr = transformer.transform(X_test_num)
X_test_scaled = pd.DataFrame(X_test_scaled_arr, columns=X_test_num.columns)
#X_test_scaled.head()
X_test_scaled.shape

(19083, 330)

In [20]:
# Before to try again the model, we will use OneHot on X_train_cat and in X_test_cat

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first')

# Fit and Transform X_train
encoded_train_cat = encoder.fit_transform(X_train_cat).toarray()
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)
onehot_encoded_train = pd.DataFrame(encoded_train_cat, columns=cols)

# Transform X_test
#encoder.set_params(handle_unknown='ignore')
encoded_test_cat = encoder.transform(X_test_cat).toarray()
onehot_encoded_test = pd.DataFrame(encoded_test_cat, columns=cols)

In [21]:
# Dataframe from X_train after being treated

X_train_treated = pd.concat([X_train_scaled, onehot_encoded_train], axis=1)
X_train_treated

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000000,0.814433,0.166667,0.444444,0.016598,0.020408,0.303030,0.424242,0.343434,0.111111,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.000028,0.752577,0.666667,1.000000,0.000000,0.000000,0.303030,0.262626,0.404040,0.080808,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.000014,0.701031,0.833333,1.000000,0.008299,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.000028,0.257732,0.166667,1.000000,0.000000,0.000000,0.333333,0.242424,0.434343,0.060606,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.000014,0.546392,0.333333,0.333333,0.020747,0.000000,0.383838,0.121212,0.313131,0.060606,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144933,0.000028,0.835052,0.666667,1.000000,0.000000,0.010204,0.323232,0.272727,0.434343,0.070707,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
144934,0.000000,0.711340,0.500000,0.333333,0.008299,0.000000,0.282828,0.282828,0.303030,0.070707,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
144935,0.000028,0.680412,0.666667,0.555556,0.029046,0.000000,0.333333,0.333333,0.292929,0.080808,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
144936,0.000014,0.711340,0.166667,1.000000,0.000000,0.000000,0.393939,0.272727,0.393939,0.070707,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [22]:
# Dataframe from X_test after being treated

X_test_treated = pd.concat([X_test_scaled, onehot_encoded_test], axis=1)
X_test_treated

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000014,0.624862,0.666667,1.000000,0.000000,0.061224,0.262626,0.484848,0.171717,0.040404,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.000014,0.546392,0.666667,1.000000,0.020747,0.020408,0.303030,0.474747,0.181818,0.030303,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.000000,0.624862,0.333333,0.555556,0.029046,0.040816,0.333333,0.222222,0.202020,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.000014,0.814433,0.333333,0.111111,0.128631,0.000000,0.292929,0.121212,0.606061,0.191919,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.000028,0.731959,0.666667,0.666667,0.004149,0.000000,0.373737,0.050505,0.575758,0.040404,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.000014,0.628866,0.333333,1.000000,0.000000,0.010204,0.202020,0.343434,0.161616,0.070707,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
19079,0.000014,0.608247,0.833333,1.000000,0.000000,0.000000,0.292929,0.171717,0.525253,0.030303,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19080,0.000000,0.624862,0.000000,1.000000,0.000000,0.000000,0.252525,0.292929,0.363636,0.040404,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
19081,0.389267,0.793814,0.666667,1.000000,0.000000,0.000000,0.282828,0.242424,0.303030,0.030303,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [23]:
y_train_upsampled.head()

87362    1
70996    0
515      1
9800     1
33512    1
Name: TARGET_B, dtype: int64

## Random forest 

In [24]:
#Now we are going to continue with feature selection using random forest

In [25]:
clf = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf.fit(X_train_treated, y_train_upsampled)
print(clf.score(X_train_treated, y_train_upsampled))
print(clf.score(X_test_treated, y_test))

0.6233078971698244
0.610386207619347


In [26]:
#The model scores 62% accuracy on the training data and a lower 68% on the testing data, what is not the best results


In [27]:
# evaluate the performance of the model by confusion matrix

y_pred = clf.predict(X_test_treated)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0    18100
1      983
Name: TARGET_B, dtype: int64

array([[11097,  7003],
       [  432,   551]])

In [28]:
# We have wrong predict a big amount, so our model is not very effective at the moment

- Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

In [29]:
X_train_treated.reset_index(drop=True, inplace=True)
y_train_upsampled.reset_index(drop=True, inplace=True)
X_test_treated.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [30]:
data_train = pd.concat([X_train_treated, y_train_upsampled], axis=1)
data_train.head(5)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,0.0,0.814433,0.166667,0.444444,0.016598,0.020408,0.30303,0.424242,0.343434,0.111111,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
1,2.8e-05,0.752577,0.666667,1.0,0.0,0.0,0.30303,0.262626,0.40404,0.080808,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
2,1.4e-05,0.701031,0.833333,1.0,0.008299,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3,2.8e-05,0.257732,0.166667,1.0,0.0,0.0,0.333333,0.242424,0.434343,0.060606,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
4,1.4e-05,0.546392,0.333333,0.333333,0.020747,0.0,0.383838,0.121212,0.313131,0.060606,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [31]:
data_test = pd.concat([X_test_treated, y_test], axis=1)
data_test.head(5)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,1.4e-05,0.624862,0.666667,1.0,0.0,0.061224,0.262626,0.484848,0.171717,0.040404,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,1.4e-05,0.546392,0.666667,1.0,0.020747,0.020408,0.30303,0.474747,0.181818,0.030303,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2,0.0,0.624862,0.333333,0.555556,0.029046,0.040816,0.333333,0.222222,0.20202,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,1.4e-05,0.814433,0.333333,0.111111,0.128631,0.0,0.292929,0.121212,0.606061,0.191919,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
4,2.8e-05,0.731959,0.666667,0.666667,0.004149,0.0,0.373737,0.050505,0.575758,0.040404,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [32]:
data_f = pd.concat([data_train,data_test], axis=0)
data_f

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,0.000000,0.814433,0.166667,0.444444,0.016598,0.020408,0.303030,0.424242,0.343434,0.111111,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
1,0.000028,0.752577,0.666667,1.000000,0.000000,0.000000,0.303030,0.262626,0.404040,0.080808,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
2,0.000014,0.701031,0.833333,1.000000,0.008299,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3,0.000028,0.257732,0.166667,1.000000,0.000000,0.000000,0.333333,0.242424,0.434343,0.060606,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
4,0.000014,0.546392,0.333333,0.333333,0.020747,0.000000,0.383838,0.121212,0.313131,0.060606,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.000014,0.628866,0.333333,1.000000,0.000000,0.010204,0.202020,0.343434,0.161616,0.070707,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
19079,0.000014,0.608247,0.833333,1.000000,0.000000,0.000000,0.292929,0.171717,0.525253,0.030303,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
19080,0.000000,0.624862,0.000000,1.000000,0.000000,0.000000,0.252525,0.292929,0.363636,0.040404,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
19081,0.389267,0.793814,0.666667,1.000000,0.000000,0.000000,0.282828,0.242424,0.303030,0.030303,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0


In [33]:
data_f.isna().sum().sum() #not nulls!

0

In [34]:
X_data_f = data_f.drop('TARGET_B', axis=1)
y_data_f = data_f['TARGET_B']

In [35]:
X_data_f.shape

(164021, 354)

## Recursive Feature Elimination

In [36]:
#First we are going to try to select our features with RFE

In [37]:
warnings.filterwarnings('ignore')

logreg = LogisticRegression(max_iter=10)
rfe = RFE(logreg, n_features_to_select=20, verbose=False)
rfe.fit(X_data_f, y_data_f)

In [38]:
#rfe.ranking_

In [39]:
# After we run the algorithm, it labels the top features as 1 and the rest are marked in an increasing order of importance.

df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(X_data_f).columns
df
df[df['Rank']==1]
#df['Rank'].value_counts()

Unnamed: 0,Rank,Column_name
2,1,INCOME
21,1,ETH1
61,1,HHAGE3
64,1,HHN3
71,1,MARR4
110,1,HVP1
115,1,HVP6
135,1,DMA
266,1,POBC2
305,1,CARDGIFT


In [40]:
#we are going to select this new features and see the results in our model

selected_features = df[df['Rank']==1]
selected_features
#len(selected_features)

Unnamed: 0,Rank,Column_name
2,1,INCOME
21,1,ETH1
61,1,HHAGE3
64,1,HHN3
71,1,MARR4
110,1,HVP1
115,1,HVP6
135,1,DMA
266,1,POBC2
305,1,CARDGIFT


In [41]:
selected_features = selected_features['Column_name']
col_rfe = data_f[selected_features]
col_rfe

Unnamed: 0,INCOME,ETH1,HHAGE3,HHN3,MARR4,HVP1,HVP6,DMA,POBC2,CARDGIFT,RFA_2F,ODATEW_YR,DOB_MM,MAXRDATE_YR,LASTDATE_YR,FIRSTDATE_YR,STATE_FL,RFA_2A_F,RFA_2A_G,DOMAIN_A_U
0,0.166667,0.808081,0.272727,0.474747,0.222222,0.010101,0.000000,0.681044,0.747475,0.170732,1.000000,0.500000,0.000000,0.909091,0.5,0.937500,0.0,0.0,0.0,0.0
1,0.666667,0.959596,0.323232,0.383838,0.171717,0.010101,0.010101,0.720772,0.767677,0.195122,1.000000,0.214286,0.000000,0.772727,0.0,0.906250,0.0,0.0,0.0,0.0
2,0.833333,0.979798,0.050505,0.646465,0.121212,0.070707,0.010101,0.594779,0.494949,0.048780,0.000000,0.571429,0.000000,0.909091,0.0,0.947917,0.0,0.0,1.0,0.0
3,0.166667,1.000000,0.262626,0.232323,0.252525,0.010101,0.000000,0.691260,0.767677,0.073171,1.000000,0.857143,0.000000,0.909091,0.5,0.989583,0.0,0.0,1.0,0.0
4,0.333333,0.969697,0.252525,0.212121,0.212121,0.000000,0.000000,0.625426,0.888889,0.170732,1.000000,0.571429,0.000000,0.909091,0.0,0.947917,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.333333,0.303030,0.070707,0.696970,0.353535,0.000000,0.000000,0.978434,0.494949,0.000000,0.000000,0.928571,0.000000,0.954545,0.5,1.000000,0.0,0.0,1.0,1.0
19079,0.833333,0.959596,0.252525,0.383838,0.202020,0.989899,0.959596,0.911464,0.383838,0.024390,0.000000,0.928571,0.000000,0.954545,0.5,1.000000,0.0,1.0,0.0,0.0
19080,0.000000,0.878788,0.343434,0.373737,0.161616,0.000000,0.000000,0.786606,0.797980,0.024390,0.000000,0.928571,0.090909,0.909091,0.0,0.989583,0.0,1.0,0.0,0.0
19081,0.666667,1.000000,0.333333,0.434343,0.202020,0.000000,0.000000,0.708286,0.949495,0.170732,0.000000,0.357143,0.000000,0.954545,0.5,0.916667,0.0,1.0,0.0,0.0


- Re-run the Random Forest algorithm to determine if the Feature Selection has improved the results.

## Random forest 2

In [42]:
X_col_rfe = col_rfe
y_col_rfe = data_f['TARGET_B']

In [43]:
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(X_col_rfe, y_col_rfe, test_size=0.2)


In [44]:
clf_rfe = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf_rfe.fit(X_train_rfe, y_train_rfe)
print(clf_rfe.score(X_train_rfe, y_train_rfe))
print(clf_rfe.score(X_test_rfe, y_test_rfe))

0.6096969881721741
0.6048773052888279


In [45]:
#It hasn't improve at all (before we had 62% accuracy on the training data and 68% on the testing data)

## VarianceThreshold

In [46]:
#So we are going to try with VarianceThreshold (it removes all the low variance features from the dataset that are of no great use in modeling)


In [47]:
X_data_f = data_f.drop('TARGET_B', axis=1)
y_data_f = data_f['TARGET_B']

In [48]:
var_threshold = 0.02
sel = VarianceThreshold(threshold=(var_threshold))

In [49]:
sel = sel.fit(X_data_f)

In [50]:
sel.get_support()
var_list=list(sel.get_support())
droplist_var=[col[0] for col in zip(X_data_f.columns, var_list) if col[1] == False]
print(droplist_var)
len(droplist_var)

['TCODE', 'HIT', 'MALEMILI', 'MALEVET', 'LOCALGOV', 'STATEGOV', 'FEDGOV', 'POP901', 'POP902', 'POP903', 'POP90C4', 'POP90C5', 'ETH3', 'ETH4', 'ETH5', 'ETH6', 'ETH7', 'ETH8', 'ETH9', 'ETH10', 'ETH11', 'ETH12', 'ETH13', 'ETH14', 'ETH15', 'ETH16', 'AGE901', 'AGE902', 'AGE903', 'AGE904', 'AGE905', 'AGE906', 'AGE907', 'CHIL1', 'CHIL2', 'CHIL3', 'AGEC1', 'AGEC2', 'AGEC3', 'AGEC4', 'AGEC5', 'AGEC6', 'AGEC7', 'CHILC1', 'CHILC2', 'CHILC3', 'CHILC4', 'CHILC5', 'HHAGE1', 'HHAGE2', 'HHAGE3', 'HHN1', 'HHN2', 'HHN4', 'HHN5', 'HHN6', 'MARR1', 'MARR2', 'MARR3', 'MARR4', 'HHP1', 'HHP2', 'DW3', 'DW7', 'DW8', 'DW9', 'HU3', 'HU4', 'HHD1', 'HHD4', 'HHD5', 'HHD6', 'HHD7', 'HHD8', 'HHD9', 'HHD10', 'HHD11', 'HHD12', 'ETHC1', 'ETHC3', 'ETHC4', 'ETHC5', 'ETHC6', 'HUR1', 'RHP1', 'RHP2', 'RHP3', 'RHP4', 'HUPA1', 'HUPA4', 'HUPA5', 'HUPA7', 'DMA', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'IC7', 'IC8', 'IC9', 'IC10', 'IC11', 'IC12', 'IC13', 'IC14', 'IC15', 'IC16', 'IC17', 'IC18', 'IC19', 'IC20', 'IC21', 'IC22', 'IC23', 'H

240

In [51]:
X_data_f_1 = X_data_f.drop(droplist_var, axis=1)
X_data_f_1

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.814433,0.166667,0.444444,0.424242,0.343434,1.000000,0.0,1.0,0.0,0.808081,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.752577,0.666667,1.000000,0.262626,0.404040,0.222222,0.0,0.0,1.0,0.959596,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.701031,0.833333,1.000000,0.000000,0.000000,1.000000,0.0,0.0,1.0,0.979798,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.257732,0.166667,1.000000,0.242424,0.434343,1.000000,1.0,0.0,0.0,1.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.546392,0.333333,0.333333,0.121212,0.313131,1.000000,1.0,0.0,0.0,0.969697,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.628866,0.333333,1.000000,0.343434,0.161616,1.000000,1.0,0.0,0.0,0.303030,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
19079,0.608247,0.833333,1.000000,0.171717,0.525253,1.000000,1.0,0.0,0.0,0.959596,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19080,0.624862,0.000000,1.000000,0.292929,0.363636,1.000000,0.0,0.0,1.0,0.878788,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
19081,0.793814,0.666667,1.000000,0.242424,0.303030,0.333333,0.0,0.0,1.0,1.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [52]:
len(data_f['TARGET_B'])

164021

## Ramdon Forest classifer

In [53]:
# we are going to try again random forest and see if the results have improve

In [54]:
X_var = X_data_f_1
y_var = data_f['TARGET_B']

In [55]:
X_train_var, X_test_var, y_train_var, y_test_var = train_test_split(X_var, y_var, test_size=0.2)


In [56]:
clf_rfe = RandomForestClassifier(max_depth=10, 
                             min_samples_split=20, 
                             min_samples_leaf =20, 
                             max_samples=0.8)
clf_rfe.fit(X_train_var, y_train_var)
print(clf_rfe.score(X_train_var, y_train_var))
print(clf_rfe.score(X_test_var, y_test_var))

0.8064412876478478
0.7923487273281512


In [57]:
# Yes! It has improve (before we had 62% accuracy on the training data and 62% on the testing data)

- Discuss the output and its impact in the business scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the business?

In [58]:
predictions = clf_rfe.predict(X_test_var)
predictions

array([0, 0, 0, ..., 1, 1, 0])

In [59]:
from sklearn.metrics import confusion_matrix

In [60]:
array = confusion_matrix(y_test_var, predictions)
print('             Predicted Labels')
print('             |   A    |     B')
print('---------------------------------')
print('True label A | ',array[0][0],'|   ', array[0][1])
print('---------------------------------')
print('           B | ',array[1][0],' |   ', array[1][1])

             Predicted Labels
             |   A    |     B
---------------------------------
True label A |  16902 |    1262
---------------------------------
           B |  5550  |    9091


In [61]:
# we still have many false positive and false negative. 

In [62]:
predictions_1 = clf_rfe.predict(X_var)
predictions_1

array([1, 1, 0, ..., 0, 0, 0])

In [63]:
len(predictions_1)

164021

In [64]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_var, predictions_1)
precision = precision_score(y_var, predictions_1)
recall = recall_score(y_var, predictions_1)
f1 = f1_score(y_var, predictions_1)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Accuracy: 0.8036227068485133
Precision: 0.8917958656330749
Recall: 0.6390159559984752
F1 score: 0.7445353891056755


In [65]:
# These are the best results after trying different feature selection methods, so we are going to select this last feature selecion for our model. 


In [66]:
X_data_f_1['PREDICTIONS']= list(predictions_1)
X_data_f_2 = pd.concat([X_data_f_1, data_f['TARGET_B']], axis=1)
X_data_f_2

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,PREDICTIONS,TARGET_B
0,0.814433,0.166667,0.444444,0.424242,0.343434,1.000000,0.0,1.0,0.0,0.808081,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,1
1,0.752577,0.666667,1.000000,0.262626,0.404040,0.222222,0.0,0.0,1.0,0.959596,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,0
2,0.701031,0.833333,1.000000,0.000000,0.000000,1.000000,0.0,0.0,1.0,0.979798,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1
3,0.257732,0.166667,1.000000,0.242424,0.434343,1.000000,1.0,0.0,0.0,1.000000,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1
4,0.546392,0.333333,0.333333,0.121212,0.313131,1.000000,1.0,0.0,0.0,0.969697,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.628866,0.333333,1.000000,0.343434,0.161616,1.000000,1.0,0.0,0.0,0.303030,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
19079,0.608247,0.833333,1.000000,0.171717,0.525253,1.000000,1.0,0.0,0.0,0.959596,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
19080,0.624862,0.000000,1.000000,0.292929,0.363636,1.000000,0.0,0.0,1.0,0.878788,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0
19081,0.793814,0.666667,1.000000,0.242424,0.303030,0.333333,0.0,0.0,1.0,1.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0


In [67]:
X_data_f_2.to_csv('donors_with_targetB_predictions.csv', index=False)

# Lab | Final regression model in "Health Care for All" Case

At this point, we have created a model to predict who will make a donation and who won't (Classification Model). But, what about the ammount of money that each person will give?

In this lab, subset those that have made a donation (Target B) and use that subset to create a model to predict how much money will they give (Target D) (Regression Model).


- Only look at people who have donated (Target B = 1)

In [68]:
donors

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.000000,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.000000,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.000000,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.000000,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,96,2,96,2,96,2,96,2,0,0.0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,96,3,96,3,96,3,96,3,0,0.0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,96,3,95,1,96,10,94,10,0,0.0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,90,11,96,8,97,1,86,12,1,18.0


In [69]:
donors_yes = donors[donors['TARGET_B'] == 1].reset_index(drop=True)

- Use this new dataframe to create a model to predict how much they will donate (Target D)

In [70]:
X = donors_yes.drop(columns=['TARGET_D'], axis = 1)
y = donors_yes['TARGET_D']


In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [72]:
#Check for Imbalance

print("Imbalance Check:")
print(y_train.value_counts())

Imbalance Check:
10.0     765
15.0     469
20.0     454
5.0      395
25.0     327
        ... 
102.0      1
55.0       1
150.0      1
4.5        1
48.0       1
Name: TARGET_D, Length: 67, dtype: int64


In [73]:
#Scaling the data

In [74]:
X_train_num = X_train.select_dtypes(include = np.number)
X_test_num  = X_test.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(include = object)
X_test_cat  = X_test.select_dtypes(include = object)

In [75]:
# Before to try the model, we will use MinMaxScaler on X train num

transformer = MinMaxScaler().fit(X_train_num) 
X_train_scaled_arr = transformer.transform(X_train_num)
X_train_scaled = pd.DataFrame(X_train_scaled_arr, columns=X_train_num.columns)
#X_train_scaled.head()
X_train_scaled.shape

(3874, 331)

In [76]:
# Before to try the model, we will use MinMaxScaler on X test num

X_test_scaled_arr = transformer.transform(X_test_num)
X_test_scaled = pd.DataFrame(X_test_scaled_arr, columns=X_test_num.columns)
#X_test_scaled.head()
X_test_scaled.shape

(969, 331)

In [77]:
# Before to try the model, we will use OneHot on X_train_cat and in X_test_cat

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first')

# Fit and Transform X_train
encoded_train_cat = encoder.fit_transform(X_train_cat).toarray()
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)
onehot_encoded_train = pd.DataFrame(encoded_train_cat, columns=cols)

# Transform X_test
encoder.set_params(handle_unknown='ignore')
encoded_test_cat = encoder.transform(X_test_cat).toarray()
onehot_encoded_test = pd.DataFrame(encoded_test_cat, columns=cols)

In [78]:
# Dataframe from X_train after being treated

X_train_treated = pd.concat([X_train_scaled, onehot_encoded_train], axis=1)
X_train_treated

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000000,0.698795,0.000000,1.000000,0.0,0.0,0.139241,0.000000,0.707071,0.416667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.000051,0.361446,1.000000,1.000000,0.0,0.0,0.367089,0.474747,0.242424,0.229167,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.000026,0.722892,0.666667,1.000000,0.0,0.0,0.303797,0.232323,0.171717,0.229167,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.000000,0.361446,0.000000,0.000000,0.0,0.0,0.227848,0.515152,0.171717,0.125000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.903614,0.000000,1.000000,0.0,0.0,0.417722,0.232323,0.484848,0.104167,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3869,0.000026,0.325301,0.666667,1.000000,0.0,0.0,0.430380,0.131313,0.353535,0.104167,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3870,0.000051,0.686747,0.166667,1.000000,0.0,0.0,0.481013,0.272727,0.272727,0.145833,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3871,0.000000,0.481928,0.666667,1.000000,0.0,0.0,0.151899,0.000000,0.363636,0.125000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3872,0.000026,0.481928,0.500000,0.888889,0.0,0.0,0.316456,0.575758,0.131313,0.104167,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [79]:
# Dataframe from X_test after being treated

X_test_treated = pd.concat([X_test_scaled, onehot_encoded_test], axis=1)
X_test_treated

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000718,0.506024,0.666667,1.000000,0.000000,0.031250,0.278481,0.323232,0.111111,0.020833,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000051,0.807229,0.166667,0.666667,0.087500,0.000000,0.291139,0.242424,0.282828,0.104167,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.000026,0.590361,0.833333,1.000000,0.000000,0.000000,0.303797,0.434343,0.222222,0.187500,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.000000,0.409639,0.166667,0.000000,0.008333,0.000000,0.620253,0.181818,0.242424,0.166667,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000051,0.698795,0.500000,0.444444,0.033333,0.000000,0.632911,0.232323,0.656566,0.354167,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,0.000000,0.120482,0.333333,0.222222,0.029167,0.000000,0.189873,0.111111,0.393939,0.125000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
965,0.000026,0.421687,0.000000,0.222222,0.016667,0.000000,0.316456,0.313131,0.323232,0.166667,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
966,0.000026,0.561586,0.833333,0.111111,0.050000,0.010417,0.379747,0.232323,0.363636,0.187500,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
967,0.000000,0.469880,0.666667,1.000000,0.000000,0.000000,0.303797,0.363636,0.202020,0.208333,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [80]:
# we can repeat the same process as before to select the features, but just to save time, we are not going to do it now. 

- Using the regression model, make predictions on all of the people our classification model predicted will donate.

In [81]:
#Now we are going to try 3 different models: DecisionTreeRegressor, LinearRegression and KNeighborsRegressor to see wich one gives us the best results


In [82]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler
import numpy as np

In [83]:
model1 = DecisionTreeRegressor()
model2 = LinearRegression()
model3 = KNeighborsRegressor()

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN']

scores = {}

for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_treated, y_train, cv=5))
    scores[model_name] = mean_score
print(scores)

{'Decision Tree Regressor': -0.1674814071640629, 'Linear Regression': 0.23428962182062513, 'KNN': 0.16536839507326304}


In [84]:
#Even if we don't have the best results, our Linear Regression Model is the best one. 

In [None]:
#Our Regression model really isn't great at predicting the amount donated.