In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [4]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

In [5]:
donors = pd.concat([numerical, categorical, target], axis=1)
donors.head(5)
#donors.shape

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0


# Filtering by Target B = 1

In [6]:
donors_b1 = donors[donors['TARGET_B']==1]
donors_b1
#donors_b1.shape

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
20,2,62.000000,3,8,10,2,25,40,27,11,...,88,1,94,4,96,3,87,1,1,4.0
30,0,61.611649,5,9,0,1,37,58,16,8,...,90,4,93,1,95,12,90,4,1,7.0
45,0,66.000000,5,9,5,0,33,24,39,6,...,93,12,94,4,96,2,87,4,1,5.0
78,0,69.000000,6,9,0,0,34,20,54,2,...,90,1,95,3,95,11,90,1,1,13.0
93,1,73.000000,1,7,10,0,21,53,8,5,...,92,9,95,9,95,9,92,9,1,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,2,45.000000,5,9,0,0,45,28,37,9,...,89,6,96,1,96,1,86,8,1,20.0
95309,0,51.000000,5,6,1,1,32,43,24,7,...,93,10,94,2,95,12,93,10,1,15.0
95398,0,86.000000,5,9,0,1,32,21,26,9,...,89,6,95,11,96,2,87,11,1,3.0
95403,0,58.000000,4,9,0,0,24,46,20,6,...,90,3,93,12,96,1,90,3,1,10.0


# X, y Split

In [7]:
X_1= donors_b1.drop(columns=['TARGET_B','TARGET_D'])
y_1= donors_b1['TARGET_D']

In [8]:
print(X_1.shape)
print(y_1.shape)

(4843, 337)
(4843,)


In [9]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2)

In [10]:
# Check the number of rows
print(X_train_1.shape)
print(X_test_1.shape)
print(y_train_1.shape)
print(y_test_1.shape)

(3874, 337)
(969, 337)
(3874,)
(969,)


In [11]:
X_train_num_1 = X_train_1.select_dtypes(include = np.number)
X_test_num_1 = X_test_1.select_dtypes(include = np.number)
X_train_cat_1 = X_train_1.select_dtypes(include = object)
X_test_cat_1 = X_test_1.select_dtypes(include = object)

# Scaler numerical_B1

In [12]:
from sklearn.preprocessing import MinMaxScaler

transformer_1 = MinMaxScaler().fit(X_train_num_1) 
X_train_scaled_arr_1 = transformer_1.transform(X_train_num_1)
X_train_scaled_1 = pd.DataFrame(X_train_scaled_arr_1, columns=X_train_num_1.columns)
#X_train_scaled_1.head()
X_train_scaled_1.shape

(3874, 330)

In [13]:
X_test_scaled_arr_1 = transformer_1.transform(X_test_num_1)
X_test_scaled_1 = pd.DataFrame(X_test_scaled_arr_1, columns=X_test_num_1.columns)
#X_test_scaled_1.head()
X_test_scaled_1.shape

(969, 330)

In [14]:
from sklearn.preprocessing import OneHotEncoder

encoder_1 = OneHotEncoder(drop='first')

# Fit and Transform X_train
encoded_train_cat_1 = encoder_1.fit_transform(X_train_cat_1).toarray()
cols_1 = encoder_1.get_feature_names_out(input_features=X_train_cat_1.columns)
onehot_encoded_train_1 = pd.DataFrame(encoded_train_cat_1, columns=cols_1)

# Transform X_test
encoder_1.set_params(handle_unknown='ignore')
encoded_test_cat_1 = encoder_1.transform(X_test_cat_1).toarray()
onehot_encoded_test_1 = pd.DataFrame(encoded_test_cat_1, columns=cols_1)

In [15]:
# Dataframe from X_train
X_train_treated_1 = pd.concat([X_train_scaled_1, onehot_encoded_train_1], axis=1)
X_train_treated_1
#X_train_treated_1.shape

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000051,0.936170,0.000000,1.000000,0.000000,0.000000,0.455696,0.131313,0.434343,0.166667,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.000026,0.776596,0.666667,1.000000,0.000000,0.000000,0.392405,0.242424,0.363636,0.312500,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.000000,0.612890,0.166667,0.333333,0.000000,0.010753,0.443038,0.494949,0.171717,0.083333,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.000026,0.612890,0.833333,0.666667,0.000000,0.000000,0.443038,0.252525,0.373737,0.145833,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.000026,0.612890,1.000000,1.000000,0.000000,0.010753,0.531646,0.393939,0.232323,0.229167,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3869,0.000051,0.702128,1.000000,1.000000,0.000000,0.064516,0.392405,0.404040,0.333333,0.020833,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3870,0.000000,0.776596,0.666667,1.000000,0.000000,0.000000,0.329114,0.121212,0.505051,0.166667,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3871,0.000026,0.553191,1.000000,1.000000,0.000000,0.000000,0.341772,0.707071,0.161616,0.020833,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3872,0.000000,0.351064,0.333333,0.000000,0.070833,0.000000,0.518987,0.000000,1.000000,0.229167,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [16]:
# Dataframe from X_test
X_test_treated_1 = pd.concat([X_test_scaled_1, onehot_encoded_test_1], axis=1)
X_test_treated_1
#X_test_treated_1.shape

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000000,0.612890,0.666667,1.000000,0.000000,0.075269,0.367089,0.363636,0.222222,0.062500,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.000000,0.872340,0.166667,0.111111,0.004167,0.000000,0.151899,0.292929,0.464646,0.187500,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.000026,0.612890,0.666667,1.000000,0.000000,0.000000,0.645570,0.202020,0.595960,0.187500,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.612890,0.666667,1.000000,0.000000,0.000000,0.113924,0.000000,0.505051,0.187500,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.000026,0.819149,0.666667,0.666667,0.012500,0.000000,0.240506,0.363636,0.191919,0.166667,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,0.000051,0.702128,0.666667,0.777778,0.033333,0.064516,0.481013,0.585859,0.202020,0.166667,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
965,0.025691,0.542553,0.666667,1.000000,0.000000,0.000000,0.240506,0.666667,0.080808,0.104167,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
966,0.000000,0.404255,0.833333,1.000000,0.004167,0.000000,0.278481,0.383838,0.202020,0.145833,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
967,0.000026,0.612890,1.000000,1.000000,0.000000,0.000000,0.303797,0.191919,0.434343,0.145833,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


# Concatenating the scaled and normalized X train and y train

In [17]:
y_train_1.reset_index(drop=True, inplace=True)

In [18]:
y_test_1.reset_index(drop=True, inplace=True)

In [19]:
train_set = pd.concat([X_train_treated_1, y_train_1], axis=1)
train_set

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_D
0,0.000051,0.936170,0.000000,1.000000,0.000000,0.000000,0.455696,0.131313,0.434343,0.166667,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,7.0
1,0.000026,0.776596,0.666667,1.000000,0.000000,0.000000,0.392405,0.242424,0.363636,0.312500,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35.0
2,0.000000,0.612890,0.166667,0.333333,0.000000,0.010753,0.443038,0.494949,0.171717,0.083333,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0
3,0.000026,0.612890,0.833333,0.666667,0.000000,0.000000,0.443038,0.252525,0.373737,0.145833,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,21.0
4,0.000026,0.612890,1.000000,1.000000,0.000000,0.010753,0.531646,0.393939,0.232323,0.229167,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3869,0.000051,0.702128,1.000000,1.000000,0.000000,0.064516,0.392405,0.404040,0.333333,0.020833,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0
3870,0.000000,0.776596,0.666667,1.000000,0.000000,0.000000,0.329114,0.121212,0.505051,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0
3871,0.000026,0.553191,1.000000,1.000000,0.000000,0.000000,0.341772,0.707071,0.161616,0.020833,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,15.0
3872,0.000000,0.351064,0.333333,0.000000,0.070833,0.000000,0.518987,0.000000,1.000000,0.229167,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,5.0


# Model comparation B1

In [20]:
from sklearn.tree import DecisionTreeRegressor
model1 = DecisionTreeRegressor()
from sklearn.linear_model import LinearRegression
model2 = LinearRegression()
from sklearn.neighbors import KNeighborsRegressor
model3 = KNeighborsRegressor()

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN']
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_treated_1, y_train_1, cv=5))
    scores[model_name] = mean_score
print(scores)

{'Decision Tree Regressor': 0.047751683984751644, 'Linear Regression': 0.3664730125716137, 'KNN': 0.14657893669406058}


# Validation with the cross validation score

In [21]:
val_scores = {}
for model, model_name in zip(model_pipeline,model_names):
    model.fit(X_train_treated_1, y_train_1)
    val_scores[model_name] = model.score(X_train_treated_1, y_train_1)
print(val_scores)

{'Decision Tree Regressor': 1.0, 'Linear Regression': 0.6064388651396942, 'KNN': 0.4276061982958108}


# Feature Selection

   * **Applying the variance on the dataset**

In [22]:
drop_list= ['TCODE', 'HIT', 'MALEMILI', 'MALEVET', 'LOCALGOV', 'STATEGOV', 'FEDGOV', 'POP901', 'POP902', 'POP903', 'POP90C4', 'POP90C5', 'ETH3', 'ETH4', 'ETH5', 'ETH6', 'ETH7', 'ETH8', 'ETH9', 'ETH10', 'ETH11', 'ETH12', 'ETH13', 'ETH14', 'ETH15', 'ETH16', 'AGE901', 'AGE902', 'AGE903', 'AGE904', 'AGE905', 'AGE906', 'AGE907', 'CHIL1', 'CHIL2', 'CHIL3', 'AGEC1', 'AGEC2', 'AGEC3', 'AGEC4', 'AGEC5', 'AGEC6', 'AGEC7', 'CHILC1', 'CHILC2', 'CHILC3', 'CHILC4', 'CHILC5', 'HHAGE1', 'HHAGE2', 'HHAGE3', 'HHN1', 'HHN2', 'HHN4', 'HHN5', 'HHN6', 'MARR1', 'MARR2', 'MARR3', 'MARR4', 'HHP1', 'HHP2', 'DW3', 'DW7', 'DW8', 'DW9', 'HU3', 'HU4', 'HHD1', 'HHD4', 'HHD5', 'HHD6', 'HHD7', 'HHD8', 'HHD9', 'HHD10', 'HHD11', 'HHD12', 'ETHC1', 'ETHC2', 'ETHC3', 'ETHC4', 'ETHC5', 'ETHC6', 'HUR1', 'RHP1', 'RHP2', 'RHP3', 'RHP4', 'HUPA1', 'HUPA4', 'HUPA5', 'HUPA7', 'DMA', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'IC7', 'IC8', 'IC9', 'IC10', 'IC11', 'IC12', 'IC13', 'IC14', 'IC15', 'IC16', 'IC17', 'IC18', 'IC19', 'IC20', 'IC21', 'IC22', 'IC23', 'HHAS2', 'HHAS4', 'MC3', 'TPE1', 'TPE2', 'TPE3', 'TPE4', 'TPE5', 'TPE6', 'TPE7', 'TPE8', 'TPE9', 'PEC1', 'TPE10', 'TPE11', 'TPE12', 'LFC1', 'LFC3', 'LFC5', 'LFC10', 'OCC1', 'OCC2', 'OCC3', 'OCC4', 'OCC5', 'OCC6', 'OCC7', 'OCC8', 'OCC9', 'OCC10', 'OCC11', 'OCC12', 'OCC13', 'EIC1', 'EIC2', 'EIC3', 'EIC4', 'EIC5', 'EIC6', 'EIC7', 'EIC8', 'EIC9', 'EIC10', 'EIC11', 'EIC12', 'EIC13', 'EIC14', 'EIC15', 'EIC16', 'OEDC1', 'OEDC2', 'OEDC3', 'OEDC4', 'OEDC5', 'OEDC6', 'OEDC7', 'EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6', 'EC7', 'EC8', 'SEC1', 'SEC2', 'SEC3', 'SEC4', 'SEC5', 'AFC1', 'AFC2', 'AFC3', 'AFC4', 'AFC5', 'AFC6', 'VC2', 'VC4', 'ANC1', 'ANC2', 'ANC3', 'ANC4', 'ANC5', 'ANC6', 'ANC7', 'ANC8', 'ANC9', 'ANC10', 'ANC11', 'ANC12', 'ANC13', 'ANC14', 'ANC15', 'POBC1', 'LSC2', 'LSC3', 'LSC4', 'VOC1', 'VOC3', 'HC1', 'HC3', 'HC9', 'HC10', 'HC12', 'HC14', 'HC15', 'HC16', 'HC20', 'HC21', 'AC1', 'AC2', 'NUMPROM', 'CARDPM12', 'NUMPRM12', 'RAMNTALL', 'NGIFTALL', 'CARDGIFT', 'MINRAMNT', 'MAXRAMNT', 'LASTGIFT', 'TIMELAG', 'AVGGIFT', 'ODATEW_MM', 'MINRDATE_YR', 'MAXRDATE_YR', 'FIRSTDATE_YR']

In [23]:
#Droppping the columns on the drio list var
train_set_selected = train_set.drop(drop_list, axis=1).reset_index(drop=True)

In [24]:
test_set_selected = X_test_treated_1.drop(drop_list, axis=1).reset_index(drop=True)

In [25]:
#X_test_treated_1.columns

In [26]:
#train_set.columns

# Compare models B1 after reducing features

In [27]:
X_train_1 = train_set_selected.drop(columns=['TARGET_D'])
y_train_1 = train_set_selected['TARGET_D'] 

In [28]:
from sklearn.tree import DecisionTreeRegressor
model1 = DecisionTreeRegressor()
from sklearn.linear_model import LinearRegression
model2 = LinearRegression()
from sklearn.neighbors import KNeighborsRegressor
model3 = KNeighborsRegressor()

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN']
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_1, y_train_1, cv=5))
    scores[model_name] = mean_score
print(scores)

{'Decision Tree Regressor': -0.2794731106073943, 'Linear Regression': 0.3509570773438523, 'KNN': 0.17613537870402954}


# Validation without cross value

In [29]:
val_scores = {}
for model, model_name in zip(model_pipeline,model_names):
    model.fit(X_train_1, y_train_1)
    val_scores[model_name] = model.score(X_train_1, y_train_1)
print(val_scores)

{'Decision Tree Regressor': 1.0, 'Linear Regression': 0.40568052937169685, 'KNN': 0.44803035598274044}


# Lineal Regression Model

In [30]:
#Check number of rows
print(X_train_1.shape) 
print(X_test_1.shape)
print(y_train_1.shape)
print(y_test_1.shape)

(3874, 113)
(969, 337)
(3874,)
(969,)


In [31]:
from sklearn import linear_model

lm = linear_model.LinearRegression()
lm.fit(X_train_1, y_train_1)

In [32]:
from sklearn.metrics import r2_score
predictions_rm = lm.predict(X_train_1)
len(predictions_rm)

3874

In [33]:
r2_score(y_train_1, predictions_rm)

0.40568052937169685

In [34]:
# Check in TEST set

In [35]:
predictions_test_rm = lm.predict(test_set_selected)
len(predictions_test_rm)

969

In [36]:
#list(predictions_test_rm)

# Review metrics for regression 

In [37]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_train_1, predictions_rm)
mae = mean_absolute_error(y_train_1, predictions_rm)
r2 = r2_score(y_train_1, predictions_rm)

print('Mean Squared Error (MSE):', mse)
print('Mean Absolute Error (MAE):', mae)
print('R-squared:', r2)

Mean Squared Error (MSE): 93.6906670959934
Mean Absolute Error (MAE): 5.085742287802536
R-squared: 0.40568052937169685


# Trying with DecisionTreeRegressor

In [38]:
from sklearn.tree import DecisionTreeRegressor

In [39]:
regr = DecisionTreeRegressor(max_depth=5)
regr.fit(X_train_1, y_train_1)

In [40]:
print("test data R2 score was: ",regr.score(test_set_selected, y_test_1))
print("train data R2 score was: ",regr.score(X_train_1, y_train_1))

test data R2 score was:  0.07797925021461793
train data R2 score was:  0.5363762110247615


# RandomForestRegressor

In [50]:
from sklearn.ensemble import RandomForestRegressor

forest_regressor = RandomForestRegressor(max_depth=10, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8,
                             random_state = 42)

In [55]:
forest_regressor.fit(X_train_1, y_train_1)
print(forest_regressor.score(X_train_1, y_train_1))

0.49870073219049604


# Test the model with my Predictions dataset

In [43]:
donors_with_targetB_pred = pd.read_csv('donors_with_targetB_predictions.csv')
donors_with_targetB_pred = donors_with_targetB_pred.drop(donors_with_targetB_pred['TARGET_B'])
donors_with_targetB_pred

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,PREDICTIONS,TARGET_B
2,0.628866,0.666667,1.000000,0.333333,0.383838,1.000000,1.000000,0.000000,0.000000,0.858586,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
3,0.546392,0.833333,0.555556,0.646465,0.131313,0.555556,1.000000,0.000000,0.000000,0.949495,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
4,0.624862,0.666667,1.000000,0.292929,0.383838,0.000000,0.000000,0.000000,1.000000,0.989899,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0
5,0.624862,0.666667,1.000000,0.323232,0.292929,1.000000,1.000000,0.000000,0.000000,0.919192,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
6,0.422680,0.666667,0.666667,0.333333,0.333333,1.000000,0.000000,0.525253,0.484848,0.767677,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164006,0.463918,0.166667,1.000000,0.313131,0.191919,0.444444,0.191919,0.000000,0.818182,0.979798,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
164007,0.628866,0.000000,1.000000,0.303030,0.202020,0.888889,0.000000,0.000000,1.000000,0.989899,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,1
164008,0.340206,0.166667,1.000000,0.424242,0.232323,1.000000,1.000000,0.000000,0.000000,0.878788,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
164009,0.731959,0.500000,1.000000,0.131313,0.434343,1.000000,1.000000,0.000000,0.000000,0.151515,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0


In [44]:
donors_with_targetB_pred.shape

(164009, 115)

In [45]:
donors_with_targetB_pred.PREDICTIONS.value_counts()

0    109005
1     55004
Name: PREDICTIONS, dtype: int64

   * Get just the Predictions == 1

In [46]:
filtered_df = donors_with_targetB_pred[donors_with_targetB_pred["PREDICTIONS"] == 1]
filtered_df

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,PREDICTIONS,TARGET_B
2,0.628866,0.666667,1.000000,0.333333,0.383838,1.000000,1.0,0.0,0.0,0.858586,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
7,0.432990,0.666667,1.000000,0.282828,0.222222,1.000000,0.0,0.0,1.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1
15,0.505155,0.666667,1.000000,0.171717,0.090909,1.000000,1.0,0.0,0.0,0.484848,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1
16,0.577320,0.000000,0.000000,0.000000,0.696970,0.000000,1.0,0.0,0.0,0.808081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1
26,0.298969,0.666667,0.888889,0.333333,0.393939,1.000000,1.0,0.0,0.0,0.919192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163901,0.835052,0.500000,0.666667,0.181818,0.585859,0.666667,1.0,0.0,0.0,0.979798,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
163902,0.556701,0.833333,1.000000,0.606061,0.010101,1.000000,1.0,0.0,0.0,0.909091,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0
163913,0.624862,0.666667,1.000000,0.000000,0.000000,0.222222,1.0,0.0,0.0,0.888889,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0
163917,0.567010,0.666667,1.000000,0.242424,0.353535,1.000000,1.0,0.0,0.0,0.888889,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0


In [68]:
filtered_df['PREDICTIONS'].value_counts()

1    55004
Name: PREDICTIONS, dtype: int64

   + predict the amount of donations for the predictions

In [57]:
filtered_pred = filtered_df.drop(["PREDICTIONS",'TARGET_B'], axis = 1)
filtered_pred

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
2,0.628866,0.666667,1.000000,0.333333,0.383838,1.000000,1.0,0.0,0.0,0.858586,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.432990,0.666667,1.000000,0.282828,0.222222,1.000000,0.0,0.0,1.0,1.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15,0.505155,0.666667,1.000000,0.171717,0.090909,1.000000,1.0,0.0,0.0,0.484848,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
16,0.577320,0.000000,0.000000,0.000000,0.696970,0.000000,1.0,0.0,0.0,0.808081,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26,0.298969,0.666667,0.888889,0.333333,0.393939,1.000000,1.0,0.0,0.0,0.919192,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163901,0.835052,0.500000,0.666667,0.181818,0.585859,0.666667,1.0,0.0,0.0,0.979798,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
163902,0.556701,0.833333,1.000000,0.606061,0.010101,1.000000,1.0,0.0,0.0,0.909091,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
163913,0.624862,0.666667,1.000000,0.000000,0.000000,0.222222,1.0,0.0,0.0,0.888889,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
163917,0.567010,0.666667,1.000000,0.242424,0.353535,1.000000,1.0,0.0,0.0,0.888889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [60]:
amount_predictions = forest_regressor.predict(filtered_pred)
amount_predictions

array([ 9.5372564 ,  9.04734484, 34.74502399, ...,  9.47441609,
        5.51767509, 15.7469981 ])

In [61]:
filtered_df["AMOUNT_PRED"] = amount_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["AMOUNT_PRED"] = amount_predictions


In [66]:
filtered_df.drop(['TARGET_B'], axis = 1)

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,PREDICTIONS,AMOUNT_PRED
2,0.628866,0.666667,1.000000,0.333333,0.383838,1.000000,1.0,0.0,0.0,0.858586,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,9.537256
7,0.432990,0.666667,1.000000,0.282828,0.222222,1.000000,0.0,0.0,1.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,9.047345
15,0.505155,0.666667,1.000000,0.171717,0.090909,1.000000,1.0,0.0,0.0,0.484848,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,34.745024
16,0.577320,0.000000,0.000000,0.000000,0.696970,0.000000,1.0,0.0,0.0,0.808081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,17.665949
26,0.298969,0.666667,0.888889,0.333333,0.393939,1.000000,1.0,0.0,0.0,0.919192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10.254748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163901,0.835052,0.500000,0.666667,0.181818,0.585859,0.666667,1.0,0.0,0.0,0.979798,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,14.582138
163902,0.556701,0.833333,1.000000,0.606061,0.010101,1.000000,1.0,0.0,0.0,0.909091,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,8.809422
163913,0.624862,0.666667,1.000000,0.000000,0.000000,0.222222,1.0,0.0,0.0,0.888889,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,9.474416
163917,0.567010,0.666667,1.000000,0.242424,0.353535,1.000000,1.0,0.0,0.0,0.888889,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,5.517675


In [67]:
filtered_df.to_csv("donors_with_predictions.csv", index = False)