In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from imblearn.over_sampling import RandomOverSampler 
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay, classification_report

In [None]:
# Show every column when a wide DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

## Reading the data

In [None]:
# Read 'categorical.csv' into the `cat` DataFrame.
cat = pd.read_csv(filepath_or_buffer='categorical.csv')

In [None]:
# Read 'numerical.csv' into the `num` DataFrame.
num = pd.read_csv(filepath_or_buffer='numerical.csv')

In [None]:
# Read 'target.csv' into the `target` DataFrame.
target = pd.read_csv(filepath_or_buffer='target.csv')

In [None]:
# (rows, columns) of the frame loaded from 'categorical.csv'.
cat.shape

In [None]:
# (rows, columns) of the frame loaded from 'numerical.csv'.
num.shape

In [None]:
# (rows, columns) of the frame loaded from 'target.csv'.
target.shape

In [None]:
# Distinct values per target column, counting NaN as its own value.
target.nunique(axis=0, dropna=False)

In [None]:
# Class balance of the binary target TARGET_B.
target.loc[:, 'TARGET_B'].value_counts()

In [None]:
# Boolean mask: True where TARGET_B equals 1.
target['TARGET_B'].eq(1)

In [None]:
# Peek at the TARGET_D column.
target.loc[:, 'TARGET_D']

In [None]:
# Distribution of TARGET_D.
# `sns.distplot` is deprecated; `histplot` with a density-scaled histogram and
# a KDE overlay draws the same kind of figure.
sns.histplot(target['TARGET_D'], bins=10, stat='density', kde=True);

### Calculating the average donation

In [None]:
# Mean of TARGET_D within each TARGET_B class.
# The column must be selected before aggregating: the first positional argument
# of GroupBy.mean is `numeric_only`, not a column name.
target.groupby('TARGET_B')['TARGET_D'].mean()

The average donation is about $15.6. We can use this figure as a proxy for the cost of a wrong prediction, i.e. the revenue lost when a real donor is missed.

### Checking for missing values

In [None]:
# Fraction (0-1, despite the column name) of missing values in each column of `num`.
nulls_percent_num = (
    (num.isna().sum() / len(num))
    .rename_axis('column_name')
    .reset_index(name='nulls_percentage')
)
nulls_percent_num

In [None]:
# Keep only the columns of `num` that actually contain missing values.
nulls_percent_num.loc[nulls_percent_num['nulls_percentage'].ne(0)]

In [None]:
# Fraction (0-1, despite the column name) of missing values in each column of `cat`.
# This must be computed on `cat`; computing it on `num` would only repeat the
# numerical-table check and leave the categorical table unchecked.
nulls_percent_cat = pd.DataFrame(cat.isna().sum() / len(cat)).reset_index()
nulls_percent_cat.columns = ['column_name', 'nulls_percentage']
nulls_percent_cat

In [None]:
# Keep only the rows of the summary whose missing-value fraction is non-zero.
nulls_percent_cat.query('nulls_percentage != 0')

In [None]:
# Count missing (True) versus present (False) cells over the whole `num` table.
# The method has to be called; a bare `value_counts` only displays the bound
# method. The mask is flattened so the tally is a two-row summary rather than
# one row per distinct row pattern.
pd.Series(num.isna().to_numpy().ravel()).value_counts()

In [None]:
# (row indices, column indices) of every missing cell in `num`.
np.nonzero(num.isna().to_numpy())

In [None]:
# (row indices, column indices) of cells in `num` holding an empty string.
# The vectorized `.eq` replaces the element-wise `applymap`, which is deprecated
# in recent pandas and much slower on a large frame.
np.where(num.eq(''))

## Dummifying the categorical data

In [None]:
# Column dtypes of `cat`, to see which columns still need encoding.
cat.dtypes

In [None]:
# Cardinality of every column in `cat`, counting NaN as its own value.
cat.nunique(axis=0, dropna=False)

In [None]:
# Distinct values of STATE.
cat.loc[:, 'STATE'].unique()

In [None]:
# Frequency of each HOMEOWNR value.
cat.loc[:, 'HOMEOWNR'].value_counts()

In [None]:
# Distinct values of GENDER.
cat.loc[:, 'GENDER'].unique()

In [None]:
# Distinct values of RFA_2R; this column is dropped later, before modelling.
cat.loc[:, 'RFA_2R'].unique()

In [None]:
# Distinct values of RFA_2A.
cat.loc[:, 'RFA_2A'].unique()

In [None]:
# Distinct values of GEOCODE2.
cat.loc[:, 'GEOCODE2'].unique()

In [None]:
# Distinct values of DOMAIN_A.
cat.loc[:, 'DOMAIN_A'].unique()

In [None]:
# The six columns that get one-hot encoded below.
cat.loc[:, ['STATE', 'HOMEOWNR', 'GENDER', 'GEOCODE2', 'RFA_2A', 'DOMAIN_A']]


In [None]:
# One-hot encode STATE.
encoder0 = OneHotEncoder()

encoder0.fit(cat[['STATE']])
col_transf0 = encoder0.transform(cat[['STATE']]).toarray()
# Prefix every dummy column with the feature name: the per-feature frames are
# concatenated side by side later, and bare category labels coming from
# different features could otherwise produce duplicate column names.
STATE = pd.DataFrame(
    col_transf0,
    columns=[f'STATE_{level}' for level in encoder0.categories_[0]],
    dtype=int,
)
STATE

In [None]:
# Fresh encoder for HOMEOWNR; show its class as a sanity check.
encoder = OneHotEncoder()
encoder.__class__

In [None]:
# Learn the HOMEOWNR categories (a one-column frame keeps the input 2-D).
encoder.fit(cat.loc[:, ['HOMEOWNR']])

In [None]:
# Categories learned for the single fitted column (HOMEOWNR).
encoder.categories_[0]

In [None]:
# Encode HOMEOWNR and densify the sparse result into a NumPy array.
homeownr_sparse = encoder.transform(cat.loc[:, ['HOMEOWNR']])
col_transf = homeownr_sparse.toarray()
col_transf

In [None]:
# (rows, number of one-hot columns) of the encoded HOMEOWNR array.
col_transf.shape

In [None]:
# Number of HOMEOWNR categories; should match the column count of `col_transf`.
homeownr_levels = encoder.categories_[0]
len(homeownr_levels)

In [None]:
# Wrap the dense one-hot matrix for HOMEOWNR in a DataFrame.
# Columns are prefixed with the feature name so they stay unique once the
# per-feature frames are concatenated (bare category labels coming from
# different features could otherwise collide).
HOMEOWNR = pd.DataFrame(
    col_transf,
    columns=[f'HOMEOWNR_{level}' for level in encoder.categories_[0]],
    dtype=int,
)
HOMEOWNR

In [None]:
# Fresh encoder for GENDER; show its class as a sanity check.
encoder1 = OneHotEncoder()
encoder1.__class__

In [None]:
# Learn the GENDER categories (a one-column frame keeps the input 2-D).
encoder1.fit(cat.loc[:, ['GENDER']])

In [None]:



# Encode GENDER and densify the sparse result into a NumPy array.
gender_sparse = encoder1.transform(cat.loc[:, ['GENDER']])
col_transf1 = gender_sparse.toarray()
col_transf1

In [None]:
# Wrap the dense one-hot matrix for GENDER in a DataFrame.
# Columns are prefixed with the feature name so they stay unique once the
# per-feature frames are concatenated (bare category labels coming from
# different features could otherwise collide).
GENDER = pd.DataFrame(
    col_transf1,
    columns=[f'GENDER_{level}' for level in encoder1.categories_[0]],
    dtype=int,
)
GENDER

In [None]:
# One-hot encode GEOCODE2.
encoder2 = OneHotEncoder()

encoder2.fit(cat[['GEOCODE2']])
col_transf2 = encoder2.transform(cat[['GEOCODE2']]).toarray()
# Prefix every dummy column with the feature name: the per-feature frames are
# concatenated side by side later, and bare category labels coming from
# different features could otherwise produce duplicate column names.
GEOCODE2 = pd.DataFrame(
    col_transf2,
    columns=[f'GEOCODE2_{level}' for level in encoder2.categories_[0]],
    dtype=int,
)
GEOCODE2





In [None]:
# One-hot encode RFA_2A.
encoder3 = OneHotEncoder()

encoder3.fit(cat[['RFA_2A']])
col_transf3 = encoder3.transform(cat[['RFA_2A']]).toarray()

# Prefix every dummy column with the feature name: the per-feature frames are
# concatenated side by side later, and bare category labels coming from
# different features could otherwise produce duplicate column names.
RFA_2A = pd.DataFrame(
    col_transf3,
    columns=[f'RFA_2A_{level}' for level in encoder3.categories_[0]],
    dtype=int,
)
RFA_2A


In [None]:
# One-hot encode DOMAIN_A.
encoder4 = OneHotEncoder()

encoder4.fit(cat[['DOMAIN_A']])
col_transf4 = encoder4.transform(cat[['DOMAIN_A']]).toarray()
# Prefix every dummy column with the feature name: the per-feature frames are
# concatenated side by side later, and bare category labels coming from
# different features could otherwise produce duplicate column names.
DOMAIN_A = pd.DataFrame(
    col_transf4,
    columns=[f'DOMAIN_A_{level}' for level in encoder4.categories_[0]],
    dtype=int,
)
DOMAIN_A


In [None]:
# Put the six one-hot frames side by side (rows align on the default index).
cat_encod = pd.concat(
    [
        STATE,
        HOMEOWNR,
        GENDER,
        GEOCODE2,
        RFA_2A,
        DOMAIN_A,
    ],
    axis='columns',
)

In [None]:
# (rows, total number of one-hot columns) after concatenation.
cat_encod.shape

In [None]:
# First five rows of the combined one-hot frame.
cat_encod.head(n=5)

In [None]:
# Remove the raw columns that were one-hot encoded above, plus RFA_2R, which is
# not used. Rebinding (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run is a no-op, not a KeyError.
cat = cat.drop(
    columns=['STATE', 'HOMEOWNR', 'GENDER', 'GEOCODE2', 'RFA_2A', 'DOMAIN_A', 'RFA_2R'],
    errors='ignore',
)

In [None]:
# First five rows of `cat` after dropping the raw categorical columns.
cat.head(n=5)

In [None]:
# Append the one-hot columns to the remaining columns of `cat`.
# NOTE: re-running this cell appends the dummy columns a second time.
cat = pd.concat(objs=(cat, cat_encod), axis='columns')

In [None]:
# (rows, columns) of `cat` once the one-hot columns have been appended.
cat.shape

## Train-test split

In [None]:
# Single modelling table: encoded categoricals, numericals and both targets.
data = pd.concat(objs=(cat, num, target), axis='columns')

In [None]:
# Classification label.
y = data.loc[:, 'TARGET_B']
# Everything except the label. TARGET_D is still among the features at this
# point; it is separated out further down, before the classifier is fitted.
X = data.drop(columns=['TARGET_B'])

In [None]:
# Hold out 25% of the rows for testing. With DataFrame/Series inputs,
# train_test_split already returns pandas objects with the original column
# labels, so no re-wrapping or column reassignment is needed.
# NOTE(review): TARGET_B is imbalanced; consider stratify=y so both splits keep
# the same class ratio (this would change the split and every downstream number).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

## Upsampling to correct the class imbalance in TARGET_B

We apply upsampling to the train set. 

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the TRAIN split only.
# The resampler has to be fed the original split (X_train, y_train); the
# *_SMOTE names are its outputs and do not exist before this call.
# X_train still contains TARGET_D here, so that column is resampled too; the
# next section separates it out of the feature set.
sm = SMOTE(random_state=100, k_neighbors=3)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

print(X_train_SMOTE.shape)
print(y_train_SMOTE.shape)


In [None]:
# Label counts in the training labels `y_train`.
y_train.value_counts()

In [None]:
# Label counts in the SMOTE-resampled training labels.
y_train_SMOTE.value_counts()

## Applying the random forest classifier

In [None]:
# From here on the training features are the SMOTE-resampled ones.
X_train, X_test = pd.DataFrame(X_train_SMOTE), pd.DataFrame(X_test)

# Keep TARGET_D aside as the target of a later regression step...
y_train_regression, y_test_regression = X_train['TARGET_D'], X_test['TARGET_D']

# ...and take it out of the classifier's feature set.
X_train = X_train.drop(columns='TARGET_D')
X_test = X_test.drop(columns='TARGET_D')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Shallow, regularised forest fitted on the SMOTE-resampled training data.
clf = RandomForestClassifier(
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=20,
    max_samples=0.8,
    random_state=42,
)
clf.fit(X_train, y_train_SMOTE)

# Accuracy on the (resampled) train split and on the untouched test split.
train_accuracy = clf.score(X_train, y_train_SMOTE)
test_accuracy = clf.score(X_test, y_test)
print("The accuracy for the Random Forest in the TRAIN set is {:.2f}".format(train_accuracy))
print("The accuracy for the Random Forest in the TEST  set is {:.2f}".format(test_accuracy))

# Test-set predictions, true class counts and the raw confusion matrix.
y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_val_score

# Same hyper-parameters as the fitted forest; the seed makes the fold scores
# reproducible between runs.
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf=20,
                             max_samples=0.8,
                             random_state=42)

# X_train holds the SMOTE-resampled rows at this point, so it must be paired
# with the resampled labels; the original y_train has a different length.
cross_val_scores = cross_val_score(clf, X_train, y_train_SMOTE, cv=10)

print("The mean accuracy of the folds was {:.2f}".format(np.mean(cross_val_scores)))

In [None]:
# Confusion matrix of the test-set predictions.
# Labels are derived from the data rather than from `clf.classes_`: `clf` is
# rebound to a new, unfitted estimator for cross-validation (cross_val_score
# only fits clones), and an unfitted estimator has no `classes_` attribute.
labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

# Same hyper-parameters as the fitted forest; the seed makes the fold scores
# reproducible between runs.
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf=20,
                             max_samples=0.8,
                             random_state=42)

# X_train_SMOTE still carries TARGET_D, the second target column. Leaving it
# among the features leaks the answer into the classifier, so it is dropped
# before scoring.
# NOTE(review): the folds are cut from already-resampled data, so synthetic
# neighbours of a validation row can sit in the training folds; resampling
# inside each fold would give a less optimistic estimate.
cross_val_scores = cross_val_score(
    clf,
    X_train_SMOTE.drop(columns=['TARGET_D']),
    y_train_SMOTE,
    cv=10,
)

print("The mean accuracy of the folds was {:.2f}".format(np.mean(cross_val_scores)))

In [None]:
# Per-fold accuracies from the most recent cross_val_score call.
cross_val_scores

### Discuss the output and its impact on the business scenario. Is the cost of a false positive equal to the cost of a false negative? How would you change your algorithm or data in order to maximize the return of the business?

We are predicting whether or not a person will donate. The most costly mistake is a false negative: the model predicts that a person will not donate when that person would in fact have donated.
The cost of this mistake is the donation that is lost because the person is never reached by mail. The bigger the lost donation, the higher the cost of the mistake; we can approximate it by the mean donation (15.6 dollars). Conversely, the cost of a false positive (the model predicts that the person will donate when in fact the person does not) is only the cost of the mailing (0.60 dollars).
The metric we would therefore like to bring as close to one as possible is recall: $\text{recall} = \frac{TP}{TP + FN}$, where $TP$ is the number of true positives and $FN$ the number of false negatives.