In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')
display(numerical.head(5))
display(categorical.head(5))
display(target.head(5))

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3


Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


In [3]:
donors = pd.concat([numerical, categorical, target], axis=1)
donors.head(5)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0


In [4]:
# Confirming that is there any nulls.
donors.isna().sum().sum()

0

   + Apply the Random Forests algorithm but this time only by upscaling the data to deal with the imbalance.

In [5]:
X = donors.drop(columns=['TARGET_B','TARGET_D'])
y = donors['TARGET_B']

In [6]:
print(X.shape)
print(y.shape)

(95412, 337)
(95412,)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
#Imbalance check
print("Imbalance Check:")
print(y_train.value_counts())

Imbalance Check:
0    72419
1     3910
Name: TARGET_B, dtype: int64


In [9]:
train_data = pd.concat([X_train, y_train], axis=1)

In [10]:
from sklearn.utils import resample
class_0 = train_data[train_data['TARGET_B'] == 0]
class_1 = train_data[train_data['TARGET_B'] == 1]

In [11]:
class_1_upsampled = resample(class_1, 
                                   replace=True,    
                                   n_samples=len(class_0))

In [12]:
upsampled_data = pd.concat([class_0, class_1_upsampled])
upsampled_data = upsampled_data.sample(frac =1)

In [13]:
upsampled_data['TARGET_B'].value_counts()

1    72419
0    72419
Name: TARGET_B, dtype: int64

In [14]:
upsampled_data.shape

(144838, 338)

In [15]:
X_train_upsampled = upsampled_data.drop('TARGET_B', axis=1)
y_train_upsampled = upsampled_data['TARGET_B']

In [16]:
# Check the number of rows
print(X_train_upsampled.shape)
print(X_test.shape)
print(y_train_upsampled.shape)
print(y_test.shape)

(144838, 337)
(19083, 337)
(144838,)
(19083,)


In [17]:
X_train_num = X_train_upsampled.select_dtypes(include = np.number)
X_test_num  = X_test.select_dtypes(include = np.number)
X_train_cat = X_train_upsampled.select_dtypes(include = object)
X_test_cat  = X_test.select_dtypes(include = object)

In [18]:
from sklearn.preprocessing import MinMaxScaler

transformer = MinMaxScaler().fit(X_train_num) 
X_train_scaled_arr = transformer.transform(X_train_num)
X_train_scaled = pd.DataFrame(X_train_scaled_arr, columns=X_train_num.columns)
#X_train_scaled.head()
X_train_scaled.shape

(144838, 330)

In [19]:
transformer = MinMaxScaler().fit(X_test_num) 
X_test_scaled_arr = transformer.transform(X_test_num)
X_test_scaled = pd.DataFrame(X_test_scaled_arr, columns=X_test_num.columns)
#X_test_scaled.head()
X_test_scaled.shape

(19083, 330)

In [20]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first')

# Fit and Transform X_train
encoded_train_cat = encoder.fit_transform(X_train_cat).toarray()
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)
onehot_encoded_train = pd.DataFrame(encoded_train_cat, columns=cols)

# Transform X_test
encoder.set_params(handle_unknown='ignore')
encoded_test_cat = encoder.transform(X_test_cat).toarray()
onehot_encoded_test = pd.DataFrame(encoded_test_cat, columns=cols)

In [21]:
# Dataframe from X_train
X_train_treated = pd.concat([X_train_scaled, onehot_encoded_train], axis=1)
X_train_treated

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000014,0.804124,0.666667,0.888889,0.008299,0.0,0.252525,0.363636,0.363636,0.050505,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.013916,0.624862,0.500000,0.111111,0.000000,0.0,0.393939,0.262626,0.494949,0.090909,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.000389,0.624862,0.666667,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.000000,0.556701,0.666667,1.000000,0.000000,0.0,0.282828,0.545455,0.202020,0.080808,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000028,0.628866,1.000000,0.888889,0.029046,0.0,0.090909,0.484848,0.000000,0.050505,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144833,0.000014,0.515464,1.000000,1.000000,0.000000,0.0,0.424242,0.222222,0.383838,0.121212,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
144834,0.000028,0.624862,0.166667,0.444444,0.016598,0.0,0.262626,0.343434,0.131313,0.060606,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
144835,0.000000,0.670103,0.000000,0.333333,0.004149,0.0,0.292929,0.171717,0.404040,0.040404,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
144836,0.000028,0.443299,0.500000,1.000000,0.000000,0.0,0.424242,0.050505,0.686869,0.040404,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
# Dataframe from X_test
X_test_treated = pd.concat([X_test_scaled, onehot_encoded_test], axis=1)
X_test_treated

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000000,0.624862,0.666667,1.000000,0.000000,0.000000,0.404040,0.272727,0.303030,0.118644,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.000026,0.624862,0.166667,1.000000,0.000000,0.000000,0.262626,0.313131,0.181818,0.101695,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.000718,0.865979,0.833333,1.000000,0.000000,0.000000,0.111111,0.202020,0.383838,0.016949,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.000051,0.624862,0.500000,0.666667,0.012448,0.000000,0.292929,0.656566,0.101010,0.135593,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.000718,0.624862,0.666667,1.000000,0.000000,0.000000,0.585859,0.212121,0.484848,0.152542,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.000026,0.690722,0.833333,1.000000,0.000000,0.000000,0.262626,0.242424,0.343434,0.084746,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19079,0.000000,0.624862,0.000000,0.333333,0.016598,0.000000,0.363636,0.242424,0.383838,0.220339,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19080,0.000000,0.690722,0.166667,0.555556,0.120332,0.000000,0.333333,0.101010,0.515152,0.067797,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19081,0.000000,0.855670,0.333333,0.333333,0.078838,0.222222,0.444444,0.282828,0.222222,0.118644,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [23]:
#X_train_upsampled.head()
#X_train_upsampled.dtypes

In [24]:
y_train_upsampled.head()

81685    1
15885    0
80128    1
85862    0
23709    0
Name: TARGET_B, dtype: int64

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf.fit(X_train_treated, y_train_upsampled)
print(clf.score(X_train_treated, y_train_upsampled))
print(clf.score(X_test_treated, y_test))


0.6228752123061627
0.6904574752397422


In [26]:
y_pred = clf.predict(X_test_treated)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0    18150
1      933
Name: TARGET_B, dtype: int64

array([[12762,  5388],
       [  519,   414]], dtype=int64)

   + Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

In [27]:
X_train_treated.reset_index(drop=True, inplace=True)
y_train_upsampled.reset_index(drop=True, inplace=True)
X_test_treated.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [38]:
data_train = pd.concat([X_train_treated, y_train_upsampled], axis=1)
data_train.head(5)
#print(data_train.isna().sum().sum())

0


In [40]:
data_test = pd.concat([X_test_treated, y_test], axis=1)
data_test.head(5)
#print(data_test.isna().sum().sum())

0


In [43]:
data_fea = pd.concat([data_train,data_test], axis=0)
data_fea

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,0.000014,0.804124,0.666667,0.888889,0.008299,0.000000,0.252525,0.363636,0.363636,0.050505,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1,0.013916,0.624862,0.500000,0.111111,0.000000,0.000000,0.393939,0.262626,0.494949,0.090909,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,0.000389,0.624862,0.666667,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1
3,0.000000,0.556701,0.666667,1.000000,0.000000,0.000000,0.282828,0.545455,0.202020,0.080808,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.000028,0.628866,1.000000,0.888889,0.029046,0.000000,0.090909,0.484848,0.000000,0.050505,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.000026,0.690722,0.833333,1.000000,0.000000,0.000000,0.262626,0.242424,0.343434,0.084746,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
19079,0.000000,0.624862,0.000000,0.333333,0.016598,0.000000,0.363636,0.242424,0.383838,0.220339,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
19080,0.000000,0.690722,0.166667,0.555556,0.120332,0.000000,0.333333,0.101010,0.515152,0.067797,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
19081,0.000000,0.855670,0.333333,0.333333,0.078838,0.222222,0.444444,0.282828,0.222222,0.118644,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [44]:
data_fea.isna().sum().sum()

0

In [45]:
X_data_fea = data_fea.drop('TARGET_B', axis=1)
y_data_fea = data_fea['TARGET_B']

In [48]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

logreg = LogisticRegression(max_iter=10)
rfe = RFE(logreg, n_features_to_select=20, verbose=False)
rfe.fit(X_data_fea, y_data_fea)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [96]:
rfe.ranking_

array([275, 252,  21,  77, 181, 333, 197, 190, 163,  12,  51,  30, 168,
       245, 330, 180, 196, 155, 159,  66, 303,  95, 164, 272, 183, 313,
       118, 307, 179, 247, 276, 259, 325, 335, 278, 249, 316, 305, 182,
       231,  44,  36,  27,  93, 207,  69, 170, 134,   5,   1, 334, 234,
         4,  68, 293, 214,  72, 271, 122,  45, 269, 105, 292, 187, 150,
        91,  28,  41, 301, 254,  86, 109,   8,   1,  97,  96, 123, 161,
       113, 104, 232, 177, 290,  35,   7, 195, 194, 312, 209, 299, 219,
       167, 142,  37,   1, 112,   9, 311,  16,  14,  17, 319, 255,  67,
         3, 144, 165, 107, 110, 136,  78, 148, 115,  79,  20, 102, 185,
        32,  50, 169,   1,   1, 331, 137,  56,  62, 151, 140,  82, 251,
       149, 192, 114, 270,  71,   1, 193, 191,  59,  73, 132, 236, 280,
       224, 250, 318,  29,  61, 233, 258, 205,  64, 217, 328, 228,  18,
        65, 257, 241,  52, 329, 154, 152,  54, 237, 126,  58, 323,  55,
        84, 218, 267, 227, 296, 208, 262, 139, 189,  11,  39, 16

In [78]:
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(X_data_fea).columns
df

Unnamed: 0,Rank,Column_name
0,275,TCODE
1,252,AGE
2,21,INCOME
3,77,WEALTH1
4,181,HIT
...,...,...
349,100,GEOCODE2_D
350,327,DOMAIN_A_R
351,147,DOMAIN_A_S
352,160,DOMAIN_A_T


In [98]:
selected_features = df[df['Rank']==1]
selected_features
#len(selected_features)

Unnamed: 0,Rank,Column_name
49,1,AGEC3
73,1,HHP2
94,1,HHD3
120,1,RHP3
121,1,RHP4
135,1,DMA
193,1,OCC3
195,1,OCC5
198,1,OCC8
200,1,OCC10


In [83]:
selected_features = selected_features['Column_name']
col_rfe = data_fea[selected_features]
col_rfe

Unnamed: 0,AGEC3,HHP2,HHD3,RHP3,RHP4,DMA,OCC3,OCC5,OCC8,OCC10,EIC9,OEDC1,SEC3,AFC4,ANC7,AC2,NGIFTALL,RFA_2F,ODATEW_YR,MINRDATE_YR
0,0.232323,0.407143,0.727273,0.245902,0.100000,0.573212,0.020202,0.131313,0.111111,0.141414,0.080808,0.050505,0.066667,0.131313,0.020202,0.060606,0.059322,0.000000,0.230769,0.772727
1,0.222222,0.365714,0.686869,0.229508,0.100000,0.606129,0.020202,0.141414,0.090909,0.030303,0.121212,0.090909,0.066667,0.181818,0.080808,0.060606,0.012712,0.666667,0.846154,0.863636
2,0.202020,0.340000,0.505051,0.213115,0.100000,0.683314,0.040404,0.161616,0.121212,0.121212,0.050505,0.080808,0.066667,0.161616,0.040404,0.080808,0.055085,1.000000,0.538462,0.727273
3,0.181818,0.381429,0.494949,0.229508,0.125000,0.770715,0.010101,0.262626,0.131313,0.202020,0.101010,0.080808,0.100000,0.131313,0.101010,0.030303,0.025424,0.000000,0.384615,0.636364
4,0.222222,0.402857,0.505051,0.229508,0.125000,0.814983,0.000000,0.101010,0.080808,0.020202,0.020202,0.050505,0.033333,0.040404,0.040404,0.070707,0.050847,0.000000,0.153846,0.545455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.349206,0.586998,0.838384,0.516129,0.142857,0.573212,0.111111,0.147059,0.046875,0.094340,0.170213,0.084746,0.230769,0.218750,0.044444,0.160000,0.086420,0.000000,0.500000,0.533333
19079,0.365079,0.543021,0.525253,0.483871,0.238095,0.611805,0.037037,0.338235,0.187500,0.207547,0.042553,0.220339,0.153846,0.234375,0.111111,0.120000,0.074074,0.000000,0.500000,0.800000
19080,0.253968,0.543021,0.666667,0.483871,0.238095,0.683314,0.148148,0.485294,0.062500,0.169811,0.148936,0.067797,0.000000,0.234375,0.133333,0.160000,0.185185,1.000000,0.214286,0.466667
19081,0.285714,0.466539,0.606061,0.419355,0.190476,0.686720,0.000000,0.250000,0.109375,0.207547,0.063830,0.338983,0.230769,0.171875,0.111111,0.020000,0.061728,0.000000,0.571429,0.600000


   + Re-run the Random Forest algorithm to determine if the Feature Selection has improved the results.

In [84]:
X_col_rfe = col_rfe
y_col_rfe = data_fea['TARGET_B']

In [88]:
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(X_col_rfe, y_col_rfe, test_size=0.2)

In [89]:
clf_rfe = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf_rfe.fit(X_train_rfe, y_train_rfe)
print(clf_rfe.score(X_train_rfe, y_train_rfe))
print(clf_rfe.score(X_test_rfe, y_test_rfe))


0.6348523670082967
0.6326673783742566


# VarianceThreshold

In [90]:
from sklearn.feature_selection import VarianceThreshold

In [91]:
X_data_fea = data_fea.drop('TARGET_B', axis=1)
y_data_fea = data_fea['TARGET_B']

In [92]:
var_threshold = 0.02
sel = VarianceThreshold(threshold=(var_threshold))

In [93]:
sel = sel.fit(X_data_fea)

In [100]:
sel.get_support()
var_list=list(sel.get_support())
droplist_var=[col[0] for col in zip(X_data_fea.columns, var_list) if col[1] == False]
print(droplist_var)
len(droplist_var)

['TCODE', 'HIT', 'MALEMILI', 'MALEVET', 'LOCALGOV', 'STATEGOV', 'FEDGOV', 'POP901', 'POP902', 'POP903', 'POP90C4', 'POP90C5', 'ETH3', 'ETH4', 'ETH5', 'ETH6', 'ETH7', 'ETH8', 'ETH9', 'ETH10', 'ETH11', 'ETH12', 'ETH13', 'ETH14', 'ETH15', 'ETH16', 'AGE901', 'AGE902', 'AGE903', 'AGE904', 'AGE905', 'AGE906', 'AGE907', 'CHIL1', 'CHIL2', 'CHIL3', 'AGEC1', 'AGEC2', 'AGEC3', 'AGEC4', 'AGEC5', 'AGEC6', 'AGEC7', 'CHILC1', 'CHILC2', 'CHILC3', 'CHILC4', 'CHILC5', 'HHAGE1', 'HHAGE2', 'HHAGE3', 'HHN1', 'HHN2', 'HHN4', 'HHN5', 'HHN6', 'MARR1', 'MARR2', 'MARR3', 'MARR4', 'HHP1', 'HHP2', 'DW3', 'DW7', 'DW8', 'DW9', 'HU3', 'HU4', 'HHD1', 'HHD4', 'HHD5', 'HHD6', 'HHD7', 'HHD8', 'HHD9', 'HHD10', 'HHD11', 'HHD12', 'ETHC1', 'ETHC3', 'ETHC4', 'ETHC5', 'ETHC6', 'HUR1', 'RHP1', 'RHP2', 'RHP3', 'RHP4', 'HUPA1', 'HUPA4', 'HUPA5', 'HUPA7', 'DMA', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'IC7', 'IC8', 'IC9', 'IC10', 'IC11', 'IC12', 'IC13', 'IC14', 'IC15', 'IC16', 'IC17', 'IC18', 'IC19', 'IC20', 'IC21', 'IC22', 'IC23', 'H

240

In [105]:
X_data_fea_1 = X_data_fea.drop(droplist_var, axis=1)
X_data_fea_1

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.804124,0.666667,0.888889,0.363636,0.363636,0.888889,1.0,0.000000,0.000000,1.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.624862,0.500000,0.111111,0.262626,0.494949,1.000000,1.0,0.000000,0.000000,0.989899,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.624862,0.666667,1.000000,0.000000,0.000000,1.000000,0.0,0.323232,0.686869,0.949495,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.556701,0.666667,1.000000,0.545455,0.202020,0.666667,1.0,0.000000,0.000000,0.898990,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.628866,1.000000,0.888889,0.484848,0.000000,0.555556,0.0,0.959596,0.050505,0.535354,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.690722,0.833333,1.000000,0.242424,0.343434,1.000000,1.0,0.000000,0.000000,0.909091,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19079,0.624862,0.000000,0.333333,0.242424,0.383838,1.000000,1.0,0.000000,0.000000,0.868687,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19080,0.690722,0.166667,0.555556,0.101010,0.515152,0.555556,1.0,0.000000,0.000000,0.959596,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19081,0.855670,0.333333,0.333333,0.282828,0.222222,0.555556,0.0,1.000000,0.000000,1.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [120]:
len(data_fea['TARGET_B'])

163921

# Ramdonclassifer

In [106]:
X_var = X_data_fea_1
y_var = data_fea['TARGET_B']

In [107]:
X_train_var, X_test_var, y_train_var, y_test_var = train_test_split(X_var, y_var, test_size=0.2)

In [119]:
clf_rfe = RandomForestClassifier(max_depth=10, 
                             min_samples_split=20, 
                             min_samples_leaf =20, 
                             max_samples=0.8)
clf_rfe.fit(X_train_var, y_train_var)
print(clf_rfe.score(X_train_var, y_train_var))
print(clf_rfe.score(X_test_var, y_test_var))

0.8013741459248414
0.7872197651364954


   * Discuss the output and its impact in the business scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the business?

In [115]:
predictions = clf_rfe.predict(X_test_var)
predictions

array([1, 0, 1, ..., 0, 0, 1], dtype=int64)

In [118]:
from sklearn.metrics import confusion_matrix

In [117]:
array = confusion_matrix(y_test_var, predictions)
print('             Predicted Labels')
print('             |   A    |     B')
print('---------------------------------')
print('True label A | ',array[0][0],'|   ', array[0][1])
print('---------------------------------')
print('           B | ',array[1][0],' |   ', array[1][1])


             Predicted Labels
             |   A    |     B
---------------------------------
True label A |  15911 |    2223
---------------------------------
           B |  7475  |    7176
