### Importing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import f_classif, VarianceThreshold, SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)

In [None]:
# Parse the CSV only once: `data` is kept untouched as the original reference,
# `df` is an independent (deep) copy that the cleaning steps below modify.
data = pd.read_csv('learningSet.csv') # original data
df = data.copy() # working data

### Fill null values by criteria

In [None]:
def fill_missing_with_mean(df, column):
    """Fill the missing values of ``df[column]`` with the ceiling of the column mean.

    The mean ignores missing values. ``df`` is modified in place by assigning
    the filled column back; nothing is returned.

    Assigning back (instead of ``df[column].fillna(..., inplace=True)``) is
    deliberate: the chained in-place form operates on an intermediate object
    and is not guaranteed to update ``df``.
    """
    mean_value = np.ceil(df[column].mean())
    df[column] = df[column].fillna(mean_value)

def fill_missing_with_median(df, column):
    """Replace missing entries of ``df[column]`` with the rounded-up column median.

    The median is computed over the non-missing values only. The filled
    column is written back into ``df``; the function returns nothing.
    """
    series = df[column]
    df[column] = series.fillna(np.ceil(series.median(skipna=True)))
    
def fill_missing_with_value(data, column, value):
    """Fill the missing values of ``data[column]`` with the constant ``value``.

    ``data`` is modified in place by assigning the filled column back; nothing
    is returned. The chained ``data[column].fillna(..., inplace=True)`` form is
    avoided because it works on an intermediate object and is not guaranteed
    to update ``data``.
    """
    data[column] = data[column].fillna(value)

# Column-by-column imputation of the working frame `df`.

# DOMAIN is rebuilt by concatenating the DOMAIN_A code with the DOMAIN_B code cast to str
# (A and B don't have nulls). The source columns are read from the untouched `data` frame.
df['DOMAIN'] = data['DOMAIN_A'] + data['DOMAIN_B'].astype(str)

# An empty SOLIH means that the customer set no limit on marketing solicitation
fill_missing_with_value(df, 'SOLIH', 365)

# 22% of INCOME is null, which makes null the leading category, so it gets its own label (0)
fill_missing_with_value(df, 'INCOME', 0)

# Fill with the mode
fill_missing_with_value(df, 'GENDER', 'F')

# Fill missing values with the (rounded-up) mean for 'DMA', 'CLUSTER2'
# NOTE(review): both names look like coded identifiers rather than quantities - confirm that a mean is meaningful here
for column in ['DMA', 'CLUSTER2']:
    fill_missing_with_mean(df, column)

# Fill 'TIMELAG' and 'NEXTDATE' with the ceiling of the median
for column in ['TIMELAG', 'NEXTDATE']:
    fill_missing_with_median(df, column)
    
# Fill CLUSTER with the mode - 2.5% nulls
# NOTE(review): the fill value is the *string* '40.0'; confirm the resulting column dtype is the intended one
fill_missing_with_value(df, 'CLUSTER', '40.0')

# Low relevance - DATASRCE records the source of the data and is 22% null; nulls get the label '0'
fill_missing_with_value(df, 'DATASRCE', '0')

# Home owner flag - 23% nulls, filled with 'U'
fill_missing_with_value(df, 'HOMEOWNR', 'U')

def map_geocode(x):
    """Keep the codes 'A', 'B', 'C' and 'D'; map every other value to 'O'."""
    known_codes = ('A', 'B', 'C', 'D')
    return x if x in known_codes else 'O'
df['GEOCODE2'] = df['GEOCODE2'].map(map_geocode)

# The columns below record reactions to magazines / publications. A null is
# read as "no reaction" (no reaction = no information), so it becomes 0.
columns_to_fill_zero = [
    "MBCRAFT", "MBGARDEN", "MBBOOKS", "MBCOLECT",
    "MAGFAML", "MAGFEM", "MAGMALE",
    "PUBGARDN", "PUBCULIN", "PUBHLTH", "PUBDOITY",
    "PUBNEWFN", "PUBPHOTO", "PUBOPP",
]
df[columns_to_fill_zero] = df[columns_to_fill_zero].fillna(0)


### Dropping by % of null values

In [None]:
def calculate_null_percentages(df, threshold=0.33):
    """Return the columns of ``df`` whose share of nulls is strictly above ``threshold``.

    The result is a two-column frame: ``column_name`` and ``nulls_percentage``
    (a fraction between 0 and 1, despite the name). Rows keep the positional
    index of the column inside ``df``.
    """
    null_share = df.isna().sum().div(len(df))
    table = null_share.rename_axis('column_name').reset_index(name='nulls_percentage')
    return table.loc[table['nulls_percentage'].gt(threshold)]
# Start the drop list with every column whose null share exceeds the default threshold (0.33)
drop_columns_list = list(set(calculate_null_percentages(df)['column_name']))


### Dropping by relevance

In [None]:
# Columns judged irrelevant for the model:
# OSOURCE - symbol definitions not provided, too many categories
# ZIP - we are including state already
# MAILCODE is a flag for correctness of the address; 98.5% are correct (94013/95412)
# NOEXCH is another flag: the "Do Not Exchange" flag (for list rental)
# AGEFLAG states whether the age is exact or inferred from the date-of-birth field
# VETERANS (Y/N)
irrelevant_list = ['OSOURCE', 'ZIP', 'MAILCODE', 'NOEXCH', 'AGEFLAG','VETERANS']
# ADATE_* = dates of past campaigns
adate_columns = [col for col in df.columns if "ADATE_" in col]

drop_columns_list += irrelevant_list + adate_columns


### Dropping by redundancy

In [None]:
# NOTE(review): 'MAILCODE' is also part of irrelevant_list, so it ends up in the drop list twice.
redundant_list = ['WEALTH2','MSA', 'ADI','MAILCODE', 'DOMAIN_A','DOMAIN_B', 'MDMAUD']
# NOTE(review): this scans `data.columns`, whereas the ADATE_ scan uses `df.columns`.
mdmaud_columns_rfa_columns = [col for col in data.columns if ("RFA_" in col or "MDMAUD_" in col)]

# RFA_* (redundant with RFA) and MDMAUD_* (redundant with MDMAUD) are all dropped.
# WEALTH2 may be redundant given the ICX census data.
# MSA, ADI, DMA are local market designations based on marketing communication. ADI and DMA give similar segmentations (but ranked differently); MSA takes only county clusters above 50,000 inhabitants.
# DOMAIN_A and DOMAIN_B are dropped because they are integrated in DOMAIN.
# MDMAUD is already captured by MAJOR.

drop_columns_list += redundant_list + mdmaud_columns_rfa_columns

In [None]:
# Remove every column collected in drop_columns_list from the working frame
df = df.drop(columns=drop_columns_list)

### Condensing groups

In [None]:
def map_gender(x):
    """Keep 'F' and 'M' as they are; collapse any other value into 'other'."""
    if x in ('F', 'M'):
        return x
    return 'other'
df['GENDER'] = df['GENDER'].map(map_gender)

# Row count per state, as a two-column table (state, count)
state_count = df['STATE'].value_counts().rename_axis('state').reset_index(name='count')
# States with fewer than 2500 rows are merged into a single 'other' group
other_states = state_count.loc[state_count['count'] < 2500, 'state'].tolist()

def map_state(x):
    """Return 'other' for a state listed in ``other_states``, otherwise ``x`` unchanged."""
    if x in other_states:
        return 'other'
    return x
df['STATE'] = df['STATE'].map(map_state)

### Mapping for rebuild categories.

In [None]:
# Bin AGE into five equal-width groups labelled '1'..'5'; rows with a missing
# age get their own label '6'. All labels are strings and the result is stored
# as a plain object column rather than a pandas 'category': the later
# select_dtypes('number') / select_dtypes('object') split ignores 'category'
# columns, which would silently remove AGE from the feature set.
names = ['1', '2', '3', '4', '5']
default = '6'
age_group = pd.cut(df['AGE'], bins=5, labels=names, include_lowest=True)
age_group = age_group.cat.add_categories([default]).fillna(default)
df['AGE'] = age_group.astype(object)

In [None]:
# Table of the columns that still contain nulls, with the number of nulls in each
null_count = df.isnull().sum().rename_axis('column').reset_index(name='nulls')
null_count = null_count.loc[null_count['nulls'].gt(0)]
null_count

### Select data by type: 
### Separate into numerical and categorical

In [None]:
# Both targets side by side
Y = df[['TARGET_B', 'TARGET_D']]
# Numeric feature columns (the two targets are removed from the features)
numerical = df.select_dtypes('number').drop(columns = ['TARGET_B', 'TARGET_D'])
# Object-dtype columns. A column whose dtype is neither numeric nor object
# (for example pandas 'category') ends up in neither frame.
categorical = df.select_dtypes('object')
# In this summary the division by data type is done in a second phase
print(numerical.shape)
print(categorical.shape)

### Space values

In [None]:
# The substring whose occurrences are counted (a single space)
string_to_count = ' '

def safe_count(x):
    """Count occurrences of ``string_to_count`` in ``x``; non-string values count as 0."""
    return x.count(string_to_count) if isinstance(x, str) else 0

# Count the spaces in every cell, then total them per column
space_totals = df.applymap(safe_count).sum()

# Keep only the columns that contain at least one space
counts = space_totals[space_totals > 0]

# Series: column name -> number of spaces found in that column
print(counts)

# According to further documentation 

# A blank PEPSTRFL means the donor has no PEP Star status.
# Binary columns whose blank value can be replaced by an explicit negative label: PEPSTRFL, RECINHSE, RECP3, RECPGVG, RECSWEEP, PVASTATE, MAJOR (pd.concat([data['MDMAUD'], df['MAJOR']],axis=1).value_counts())^1, COLLECT1, BIBLE, CATLG, HOMEE, PETS,CDPLAY, STEREO,PCOWNERS,PHOTO,CRAFTS,FISHER,GARDENIN,BOATS,WALKER,KIDSTUFF,CARDS,PLATES
# Non-binary columns that are still eligible for a negative-label substitution: CHILD* (with N), SOLP3 (replace with 365), and HOMEOWNR, whose negative label 'U' is already defined.
# ^1 The MAJOR column description contains an error: X indicates major donors, whereas the documentation says the opposite; comparing against MDMAUD makes this clear.

In [None]:
# Cells that are exactly one space are treated as "blank" and given an explicit label.
# Replace space values with "365" in the "SOLP3" column
categorical['SOLP3'] = categorical['SOLP3'].replace(' ', '365')
# Replace space values with "U" in the "HOMEOWNR" column
categorical['HOMEOWNR'] = categorical['HOMEOWNR'].replace(' ', 'U')

def replace_space_with_N(columns, dataframe):
    """Turn every cell equal to a single space into 'N' in the given columns.

    ``dataframe`` is modified in place (each listed column is reassigned) and
    the same object is returned. Columns not listed are left untouched.
    """
    def _blank_to_n(value):
        if value == ' ':
            return 'N'
        return value

    for name in columns:
        dataframe[name] = dataframe[name].map(_blank_to_n)
    return dataframe

# Flag-like columns in which a blank is replaced by 'N'
columns_to_replace = [
    'PEPSTRFL', 'RECINHSE', 'RECP3', 'RECPGVG', 'RECSWEEP', 'PVASTATE',
    'MAJOR', 'COLLECT1', 'BIBLE', 'CATLG', 'HOMEE', 'PETS',
    'CDPLAY', 'STEREO', 'PCOWNERS', 'PHOTO', 'CRAFTS', 'FISHER',
    'GARDENIN', 'BOATS', 'WALKER', 'KIDSTUFF', 'CARDS', 'PLATES',
]

# Every column whose name starts with "CHILD" gets the same treatment
child_columns = [name for name in categorical.columns if name.startswith('CHILD')]

# Run the replacement on the 'categorical' frame, one column group at a time
for column_group in (columns_to_replace, child_columns):
    categorical = replace_space_with_N(column_group, categorical)

In [None]:
# Re-count the spaces per column on the cleaned categorical frame to see what is left
counts_2 = categorical.applymap(safe_count).sum()

# Keep only the columns that still contain at least one space
counts = counts_2[counts_2 > 0]

# Series: column name -> number of spaces remaining in that column
print(counts)

### Check distribution

In [None]:
import pandas as pd
import plotly.express as px

# Draw one interactive histogram per numerical column to inspect its distribution
for column in numerical.columns:
    fig = px.histogram(numerical, x=column)
    fig.show()

### Split

In [None]:
# Combine features; y is the TARGET_B column
X = pd.concat([numerical, categorical], axis=1)
y = df['TARGET_B']

# NOTE: X2 is another name for the same object as X, not a copy
X2 = X
# y2 is the TARGET_D column
y2 = df['TARGET_D']

# Split data into train and test sets (70% / 30%).
# Both calls use the same features, test_size and random_state, so the
# TARGET_B and TARGET_D splits are expected to contain the same rows in the same order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

In [None]:
# Separate each split into object-dtype (categorical) and numeric columns.
# The categorical part is cast to str so that every cell has the same type before encoding.
X_train_cat = X_train.select_dtypes(include = object).astype(str)
X_train_num =X_train.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(include = object).astype(str)
X_test_num =X_test.select_dtypes(include = np.number)

### Encode and scale

In [None]:
# Encode categorical features.
# The encoder is fitted on the training split only. handle_unknown='ignore'
# makes transform() produce all-zero dummy columns (instead of raising) for a
# category that shows up in the test split but never appeared in training.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(X_train_cat)

# The output column names depend only on the fitted encoder, so they are
# computed once and shared by the train and test frames.
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)

# Building the frames without an explicit index gives each a fresh 0..n-1
# index, which the positional concatenation below relies on.
X_train_cat_encode = pd.DataFrame(encoder.transform(X_train_cat).toarray(), columns=cols)
X_test_cat_encode = pd.DataFrame(encoder.transform(X_test_cat).toarray(), columns=cols)

# Scale numerical features (min/max are learned from the training split only)
transformer = MinMaxScaler().fit(X_train_num)

X_train_num_norm = transformer.transform(X_train_num)
X_train_num_scale = pd.DataFrame(X_train_num_norm, columns=X_train_num.columns)

X_test_num_norm = transformer.transform(X_test_num)
X_test_num_scale = pd.DataFrame(X_test_num_norm, columns=X_test_num.columns)

# Concatenate numeric and encoded parts; reset the targets to the same 0..n-1 index
X_train_processed = pd.concat([X_train_num_scale, X_train_cat_encode], axis=1)
y_train.reset_index(drop=True, inplace=True)

X_test_processed = pd.concat([X_test_num_scale, X_test_cat_encode], axis=1)
y_test.reset_index(drop=True, inplace=True)

### Feature selection (Variance Threshold)

In [None]:
# Keep only the features whose variance exceeds a fixed threshold.
# This filter never looks at the target variable.

var_threshold = 0.13
selector_var = VarianceThreshold(threshold=var_threshold).fit(X_train_processed)
X_train_var = selector_var.transform(X_train_processed)
X_test_var = selector_var.transform(X_test_processed)

# Positional indices of the retained columns
indices_var = selector_var.get_support(indices=True)
len(indices_var)

In [None]:
# selector_var.variances_ > var_threshold
# drop_list = [col[0] for col in zip(X_train.columns,var_list) if col[1] == False]

### Feature selection (SelectKBest)

In [None]:
# Keep the 20 features with the highest chi-squared score against the target.
# chi2 only accepts non-negative values and is best suited to categorical features.

selector_chi2 = SelectKBest(score_func=chi2, k=20).fit(X_train_processed, y_train)
X_train_chi2 = selector_chi2.transform(X_train_processed)
X_test_chi2 = selector_chi2.transform(X_test_processed)

# Positional indices of the retained columns
indices_chi2 = selector_chi2.get_support(indices=True)
len(indices_chi2)

### Feature selection (ANOVA F-test)

In [None]:
# Keep the 20 features with the highest ANOVA F-score against the target.
# A univariate test: it may miss non-linear relationships. Usable for numerical and categorical features.

selector_f = SelectKBest(score_func=f_classif, k=20).fit(X_train_processed, y_train)
X_train_f_test = selector_f.transform(X_train_processed)
X_test_f_test = selector_f.transform(X_test_processed)

# Positional indices of the retained columns
indices_f = selector_f.get_support(indices=True)
len(indices_f)

### Feature selection (Random Forest)

In [None]:
# Rank features by the importance scores of a fitted Random Forest classifier.
# Computationally intensive. Works for numerical and categorical features and can capture both linear and non-linear relationships.

forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X_train_processed, y_train)
importances = forest.feature_importances_


# argsort is ascending, so reverse it and keep the first 20 positions
indices_rf = np.argsort(importances)[::-1][:20]  # Top 20 features

### Feature comparison table

In [None]:
# Largest number of features kept by any of the four selection methods
max_length = max(len(indices_var), len(indices_chi2), len(indices_f), len(indices_rf))

def pad_list(lst, length, pad_value):
    """Return a new list: ``lst`` followed by enough ``pad_value`` items to reach ``length``.

    A list that is already ``length`` items or longer comes back as an
    unchanged copy. ``lst`` itself is never modified.
    """
    missing = length - len(lst)
    return lst + [pad_value for _ in range(missing)]

# Build the side-by-side comparison table in a single pass. For each selection
# method: look up the selected column names, sort them alphabetically, and
# right-pad with None so that all four table columns have max_length rows.
selected_by_method = {
    'Variance Threshold': indices_var,
    'SelectKBest chi2': indices_chi2,
    'ANOVA F-test': indices_f,
    'Random Forest': indices_rf,
}

sorted_comparison_df = pd.DataFrame({
    method: pad_list(sorted(X_train_processed.columns[positions]), max_length, None)
    for method, positions in selected_by_method.items()
})

print(sorted_comparison_df)


### Looking at multicollinearity

In [None]:
# Create the X_train datasets based on selected features.
# Each frame is a positional column subset of the processed data, so it keeps
# the column names (these assignments replace the earlier X_*_var / X_*_chi2 values).
X_train_var = X_train_processed.iloc[:, indices_var]
X_train_rf = X_train_processed.iloc[:, indices_rf]
X_train_f = X_train_processed.iloc[:, indices_f]
X_train_chi2 = X_train_processed.iloc[:, indices_chi2]

# Same column subsets for the test split
X_test_var = X_test_processed.iloc[:, indices_var]
X_test_rf = X_test_processed.iloc[:, indices_rf]
X_test_f = X_test_processed.iloc[:, indices_f]
X_test_chi2 = X_test_processed.iloc[:, indices_chi2]

# Define a function to compute and plot the correlation matrix
def plot_correlation_matrix(X, title):
    """Plot an annotated heatmap of the Pearson correlations of ``X`` plus the target.

    The module-level ``y_train`` is appended as an extra column before the
    correlations are computed, so the target appears in the matrix as well.

    Args:
        X: feature frame whose columns are correlated.
        title: text shown as the plot title.
    """
    corr_matrix = pd.concat([X,y_train], axis=1).corr(method='pearson')
    plt.figure(figsize=(20, 15))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title(title)
    plt.show()

# One correlation heatmap per selected-feature training set
correlation_plots = [
    (X_train_var, 'Variance Threshold Feature Correlation Matrix'),
    (X_train_rf, 'Random Forest Feature Correlation Matrix'),
    (X_train_f, 'ANOVA F-test Feature Correlation Matrix'),
    (X_train_chi2, 'SelectKBest chi2 Feature Correlation Matrix'),
]
for selected_features, plot_title in correlation_plots:
    plot_correlation_matrix(selected_features, plot_title)


### Drop highest multi-collinearity.

In [None]:
# High correlation between features in the following cases:
# X_train_var: POP90C1, HVP4
# X_train_rf: MINRDATE, FISTDATE, CARDPROM, POP903, POP902, NGIFTALL
# X_train_f: HV2, HVP4, CARDPROM, FISTDATE,FISTDATE_YR 
# X_train_chi2: HV2, HV3, HV4, HV6

In [None]:
def drop_columns(df, columns_to_drop):
    """Return a copy of ``df`` without the listed columns.

    Names that are not present in ``df`` are skipped silently. ``df`` itself
    is left unchanged.
    """
    return df.drop(columns=columns_to_drop, errors='ignore')

# Each feature set has one list of highly correlated columns, removed
# identically from its train and test frames.

# Variance-threshold feature set
var_collinear = ['POP90C1', 'HVP4']
X_train_var = drop_columns(X_train_var, var_collinear)
X_test_var = drop_columns(X_test_var, var_collinear)

# Random-forest feature set
rf_collinear = ['MINRDATE', 'FISTDATE', 'CARDPROM', 'POP903', 'POP902', 'NGIFTALL']
X_train_rf = drop_columns(X_train_rf, rf_collinear)
X_test_rf = drop_columns(X_test_rf, rf_collinear)

# ANOVA F-test feature set
f_collinear = ['HV2', 'HVP4', 'CARDPROM', 'FISTDATE', 'FISTDATE_YR']
X_train_f = drop_columns(X_train_f, f_collinear)
X_test_f = drop_columns(X_test_f, f_collinear)

# chi2 feature set
chi2_collinear = ['HV2', 'HV3', 'HV4', 'HV6']
X_train_chi2 = drop_columns(X_train_chi2, chi2_collinear)
X_test_chi2 = drop_columns(X_test_chi2, chi2_collinear)


## Pre-Model

In [None]:
import time
%time

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def evaluate_model(X_train, X_test, y_train, y_test, model_name, random_state=42):
    """Fit a random forest on one feature set and report how it performs.

    Prints the train and test accuracy, then plots the confusion matrix of
    the test predictions.

    Args:
        X_train, X_test: feature matrices for fitting and for evaluation.
        y_train, y_test: the matching target vectors.
        model_name: label used in the printed lines and in the plot title.
        random_state: seed of the forest. Fixing it makes repeated runs, and
            therefore the comparison between feature sets, reproducible.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    clf = RandomForestClassifier(
        n_estimators=100,      # More trees can lead to better performance but also to longer training times
        max_depth=20,          # Deeper trees capture more complex patterns; too much depth can lead to overfitting
        min_samples_split=2,   # Minimum number of samples required to split an internal node; lower values allow more specific splits
        min_samples_leaf=1,    # Minimum number of samples required at a leaf node; smaller leaves give more complex decision paths
        random_state=random_state,
    )
    clf.fit(X_train, y_train)
    train_accuracy = clf.score(X_train, y_train)
    test_accuracy = clf.score(X_test, y_test)
    print(f'{model_name} - TrainSet Accuracy: ', train_accuracy)
    print(f'{model_name} - TestSet Accuracy: ', test_accuracy)

    # Compute confusion matrix on the test split
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    # Plotting the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix for {model_name}')
    plt.show()

    return train_accuracy, test_accuracy

# Evaluate the same classifier setup on each of the four selected-feature sets,
# always against the TARGET_B train/test targets
evaluate_model(X_train_var, X_test_var, y_train, y_test, 'X_train_var')
evaluate_model(X_train_rf, X_test_rf, y_train, y_test, 'X_train_rf')
evaluate_model(X_train_f, X_test_f, y_train, y_test, 'X_train_f')
evaluate_model(X_train_chi2, X_test_chi2, y_train, y_test, 'X_train_chi2')

### Check and fix Target imbalance

In [None]:
# Class counts of TARGET_B in the training split
print(y_train.value_counts())
# TARGET_D is split into "zero" versus "positive": count the training rows in
# each group (target_d_0 previously held the total number of rows instead).
target_d_1 = int((y2_train > 0).sum())
target_d_0 = int((y2_train == 0).sum())
print('TARGET_D\n', target_d_0, '\n', target_d_1)

In [None]:
from sklearn.utils import resample

# Oversample the TRAINING split only, so no test row is duplicated into the
# data the model learns from.
# X_train_processed, y_train and y2_train all stem from splits made with the
# same features, test_size and random_state, so once the old index labels are
# dropped they line up row by row.
# A separate name is used for the feature table so that the working frame
# `df` is not overwritten.
train_features = X_train_processed.reset_index(drop=True)
df_b = pd.concat([train_features, y_train.reset_index(drop=True)], axis=1)
df_d = pd.concat([train_features, y2_train.reset_index(drop=True)], axis=1)

# Separate the majority and minority classes
df_b_majority = df_b[df_b.TARGET_B == 0]
df_b_minority = df_b[df_b.TARGET_B == 1]

df_d_majority = df_d[df_d.TARGET_D == 0]
df_d_minority = df_d[df_d.TARGET_D > 0]


# Upsample the minority class (sampling with replacement) up to the majority size
df_b_minority_upsampled = resample(df_b_minority,
                                 replace=True,
                                 n_samples=len(df_b_majority),
                                 random_state=101)

df_d_minority_upsampled = resample(df_d_minority,
                                 replace=True,
                                 n_samples=len(df_d_majority),
                                 random_state=101)


# Combine majority class with upsampled minority class
df_b_upsampled = pd.concat([df_b_majority, df_b_minority_upsampled])
df_d_upsampled = pd.concat([df_d_majority, df_d_minority_upsampled])

# Display new class counts
print(df_b_upsampled.TARGET_B.value_counts())
print(df_d_upsampled.TARGET_D.value_counts())

# X and Y for TARGET B
X_upsampled = df_b_upsampled.drop('TARGET_B', axis=1)
y_upsampled = df_b_upsampled.TARGET_B

# X and Y for TARGET D
X2_upsampled = df_d_upsampled.drop('TARGET_D', axis=1)
y2_upsampled = df_d_upsampled.TARGET_D


## Model

In [None]:
# Train on the upsampled TARGET_B data (X_upsampled / y_upsampled) and score on
# the encoded + scaled test split. The names X_train_oversampled and
# y_train_oversampled are not defined anywhere in the notebook, and the raw
# X_test still holds unencoded string columns.
# NOTE(review): assumes X_upsampled has exactly the columns of X_test_processed - confirm.
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_upsampled, y_upsampled)
print('TrainSet = ',clf.score(X_upsampled, y_upsampled))
print('TestSet = ',clf.score(X_test_processed, y_test))

In [None]:

# The metric helpers are imported here because no earlier cell imports them
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# NOTE(review): `clf2` is not defined in the visible notebook; the classifier
# fitted in the Model cell is `clf`, and it needs the encoded/scaled test
# features rather than the raw X_test - confirm this is the intended model.
pred_RF = clf.predict(X_test_processed)

print('accuracy:', accuracy_score(y_test, pred_RF))
print("precision: ",precision_score(y_test,pred_RF))
print("recall: ",recall_score(y_test,pred_RF))
print("f1: ",f1_score(y_test,pred_RF))

### Check the confusion matrix

In [None]:
# plot_confusion_matrix is never imported and no longer exists in current
# scikit-learn; ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE(review): `clf2` is not defined in the visible notebook; `clf` with the
# encoded/scaled test features is used instead - confirm.
ConfusionMatrixDisplay.from_estimator(clf, X_test_processed, y_test, cmap=plt.cm.Blues)
plt.show()

# Now we have a model, let's run ALL of the data to gain insights for the business

In [None]:
X.dtypes

In [None]:
display(X)
display(y)

## Scale and encode ALL of the data for model

In [None]:
# Cast the categorical part to str, exactly as was done for the training split
# before the encoder was fitted; otherwise non-string cells would not match
# the categories the encoder learned.
Xcat = X.select_dtypes(include=object).astype(str)
Xnum = X.select_dtypes(include=np.number)
Xcat

In [None]:
#encoder = OneHotEncoder(drop='first').fit(Xcat)

# Reuse the already fitted encoder. get_feature_names() has been removed from
# scikit-learn; get_feature_names_out() is the replacement and is what the
# train/test encoding cell already uses.
cols = encoder.get_feature_names_out(input_features=Xcat.columns)

Xcat_encode = pd.DataFrame(encoder.transform(Xcat).toarray(),columns=cols)

Xcat_encode.reset_index(drop = True, inplace = True)
Xcat_encode

In [None]:
#transformer = MinMaxScaler().fit(Xnum)
Xnum_norm = transformer.transform(Xnum)
print(Xnum_norm.shape)
Xnum_scale = pd.DataFrame(Xnum_norm, index = Xnum.index, columns=Xnum.columns)
Xnum_scale.head()
Xnum_scale.reset_index(drop = True, inplace = True)

In [None]:
X_tarb = pd.concat([Xnum_scale,Xcat_encode], axis = 1)

In [None]:
X_tarb

# Keep same columns as before (top 25)

In [None]:
X2 = X_tarb[col_list]           #columns to keep -- top 25
X2.shape

In [None]:
X2 = X2.drop(col_to_drop, axis = 1)
X2.shape

## Re-running Classifier with all the data

### Run the classifier on all of the data, then look at the confusion matrix

In [None]:
##Using the upsampled classifier

print('All data = ',clf2.score(X2,y))


In [None]:
# plot_confusion_matrix is never imported and no longer exists in current
# scikit-learn; ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE(review): `clf2` is not defined in the visible notebook - confirm which fitted model is meant.
ConfusionMatrixDisplay.from_estimator(clf2, X2, y, cmap=plt.cm.Blues)
plt.show()

In [None]:
# The metric helpers are imported here because no earlier cell imports them
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# NOTE(review): `clf2` is not defined in the visible notebook - confirm which fitted model is meant.
pred_RF = clf2.predict(X2)

print('accuracy:', accuracy_score(y, pred_RF))
print("precision: ",precision_score(y,pred_RF))
print("recall: ",recall_score(y,pred_RF))
print("f1: ",f1_score(y,pred_RF))

### All data with target B predictions

In [None]:
# Work on a copy: plain assignment would make all_pred_tarB another name for X,
# and the two columns added below would then show up in the feature frame too.
all_pred_tarB = X.copy()
all_pred_tarB['predicted_B']= pred_RF
# NOTE(review): `target_d` is not defined in the visible notebook; y2 (the
# TARGET_D column, which shares X's index) is presumably what was meant - confirm.
all_pred_tarB['target_D'] = y2
all_pred_tarB

## Finding mean of Target D to understand data Target B results

#### We will use data_targetD for part 2 (regression) of the process

In [None]:
# Keep only the rows of actual donors (TARGET_B == 1).
# NOTE(review): `full_data` is not defined anywhere in the visible notebook - confirm where it is loaded.
data_targetD = full_data[full_data['TARGET_B']==1]
data_targetD

In [None]:
len(data_targetD['TARGET_D'])

In [None]:
data_targetD['TARGET_D'].value_counts()

In [None]:
data_targetD['TARGET_D'].mean()

# Reading the results

In [None]:
# plot_confusion_matrix is never imported and no longer exists in current
# scikit-learn; ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE(review): `clf2` is not defined in the visible notebook - confirm which fitted model is meant.
ConfusionMatrixDisplay.from_estimator(clf2, X2, y, cmap=plt.cm.Blues)
plt.show()

In [None]:
predict_y = clf2.predict(X2)

# 2x2 confusion matrix: rows are the actual class, columns the predicted class
array = confusion_matrix(y, predict_y)
# Q1 = [0][0], Q2 = [0][1], Q3 = [1][0], Q4 = [1][1]
(Q1, Q2), (Q3, Q4) = array
for quadrant in (Q1, Q2, Q3, Q4):
    print(quadrant)

In [None]:
# Monetary figures applied to the confusion-matrix quadrants
AVERAGE_DONATION = 15.6   # amount attributed to each actual donor
MAILING_COST = 0.68       # amount charged per predicted donor

donation_gained = Q4 * AVERAGE_DONATION          # predicted donors who do donate
unexpected_donation = Q3 * AVERAGE_DONATION      # actual donors predicted as non-donors
spend_marketing = (Q4 + Q2) * MAILING_COST       # everyone predicted to donate
no_return_marketing = Q2 * MAILING_COST          # predicted donors who do not donate
overall_revenue = (donation_gained + unexpected_donation) - spend_marketing

summary_lines = [
    ('Donation amount gained:', donation_gained),
    ('Unexpected Donations:', unexpected_donation),
    ('Amount spent (cost) on marketing:', spend_marketing),
    ('Amount lost on marketing:', no_return_marketing),
    ('Overall revenue (donation - cost):', overall_revenue),
]
for caption, amount in summary_lines:
    print(caption, round(amount, 2))

In [None]:
print('In the above plot we can see that there are    ',Q3,'    people of whom the model predicts they will not donate,')
print('when they actually would donate, AKA falsely predicted non-donators.')
print('We gain a donation amount of    ',unexpected_donation,'    from this group.')
print('There are also   ', Q2 ,'    people of whom the model predicts they will donate, when they actually do not.')
print('We  lose   ', no_return_marketing,'   on mailing costs from this group.')
print('When we look at the average donation amount of someone who actually donates is around 15.62 dollars.')
print('We raise   ', overall_revenue ,'   dollars with this model.')

## Results of the model 


Zooming in on the cost of sending the marketing packages: in this scenario we spend a total of 26158 dollars on marketing, of which 24208 dollars does not yield any donation. It is suggested that mailers be sent to our predicted donors as well as to people who have donated in the past.

We can clearly conclude that it is in the best interest of Healthcare 4 All to have a model with the lowest possible number of falsely predicted non-donators. With this model, the donation amount lost is almost equal to the unexpected donations gained. The model would need to be improved to reduce the number of falsely predicted non-donators, i.e. people who would actually donate.

# Moving on to create model to predict HOW MUCH they will donate.

# Predicting amount given among actual donators

#### Using target D dataset created earlier

In [None]:
data_targetD

#### Breaking into features and target

In [None]:
X_D = data_targetD.drop(['TARGET_D', 'TARGET_B', 'Unnamed: 0'], axis=1)
y_D = data_targetD['TARGET_D']
display(X_D)
display(y_D)

#### Train test split

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_D, y_D, test_size=0.2, random_state=42)

In [None]:
X_train_cat1 = X_train1.select_dtypes(include = object)
X_train_num1 =X_train1.select_dtypes(include = np.number)
X_test_cat1 = X_test1.select_dtypes(include = object)
X_test_num1 =X_test1.select_dtypes(include = np.number)

## Encoding

In [None]:
# Fit a new encoder on the categorical columns of the donor-only training split
encoder = OneHotEncoder(drop='first').fit(X_train_cat1)

# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is the replacement.
cols1 = encoder.get_feature_names_out(input_features=X_train_cat1.columns)

X_train_cat_encode1 = pd.DataFrame(encoder.transform(X_train_cat1).toarray(),columns=cols1)

X_train_cat_encode1.reset_index(drop = True, inplace = True)
X_train_cat_encode1

In [None]:
# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is the replacement.
cols = encoder.get_feature_names_out(input_features=X_test_cat1.columns)

# Encode the test split with the encoder fitted on the training split
X_test_cat_encode1 = pd.DataFrame(encoder.transform(X_test_cat1).toarray(),columns=cols)

X_test_cat_encode1.reset_index(drop = True, inplace = True)
X_test_cat_encode1

## Scaling

In [None]:
transformer = MinMaxScaler().fit(X_train_num1)
X_train_num_norm1 = transformer.transform(X_train_num1)
print(X_train_num_norm1.shape)
X_train_num_scale1 = pd.DataFrame(X_train_num_norm1, index = X_train_num1.index, columns=X_train_num1.columns)
X_train_num_scale1.head()
X_train_num_scale1.reset_index(drop = True, inplace = True)

In [None]:
X_test_num_norm1 = transformer.transform(X_test_num1)
print(X_test_num_norm1.shape)
X_test_num_scale1 = pd.DataFrame(X_test_num_norm1, index = X_test_num1.index, columns=X_test_num1.columns)
X_test_num_scale1.head()
X_test_num_scale1.reset_index(drop = True, inplace=True)
X_test_num_scale1

In [None]:
X_train1 = pd.concat([X_train_num_scale1,X_train_cat_encode1], axis=1)
X_train1.index
y_train1.reset_index(drop = True, inplace = True)

In [None]:
X_test1 = pd.concat([X_test_num_scale1,X_test_cat_encode1], axis=1)
X_test1
y_test1.reset_index(drop = True, inplace = True)

# Feature Selection

#### Variance - Not Used Code included

In [None]:
# from sklearn.feature_selection import VarianceThreshold
# Var_threshold = 0.02
# sel = VarianceThreshold(threshold=Var_threshold)
                        
# sel = sel.fit(X_train1)
# temp = sel.transform(X_train1)
# temp = pd.DataFrame(temp)
# print(X_train1.shape)
# print(temp.shape)
                        

In [None]:
# sel.variances_ > Var_threshold
# sel.get_support()
# var_list = list(sel.get_support())
# var_list
# print(var_list.count(True))
# print(var_list.count(False))

In [None]:
# zipped = list(zip(X_train.columns,var_list))
# len(zipped)

In [None]:
# drop_list = [col[0] for col in zip(X_train1.columns,var_list) if col[1] == False]
# print(drop_list)
# drop_list = drop_list +['HVP1','HVP2','HVP3','HVP5','HVP6','HV2']
# len(drop_list)

In [None]:
#drop_list

In [None]:
# X_train1 = X_train1.drop(drop_list, axis = 1)
# X_train1

In [None]:
# X_test1 = X_test1.drop(drop_list, axis = 1)
# X_test1

# PCA - Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(0.9)
pca.fit(X_train1)

X_train_pca = pca.transform(X_train1)
X_test_pca = pca.transform(X_test1)

corr_pc = pd.DataFrame(X_train_pca).corr()


In [None]:
exp_var_pca = pca.explained_variance_ratio_
cum_sum_eigenvalues = np.cumsum(exp_var_pca)


# Create the visualization plot
plt.bar(range(0,len(exp_var_pca)), exp_var_pca, 
        alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(0,len(cum_sum_eigenvalues)), cum_sum_eigenvalues, 
         where='mid',label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
print(X_train_pca.shape)
print(X_test_pca.shape)
print (y_train1.shape)
print(y_test1.shape)

## RFE option, not used.

In [None]:
# from sklearn.feature_selection import RFE
# from sklearn.tree import DecisionTreeClassifier
# # define the method
# rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=30)
# # fit the model
# rfe.fit(X_train, y_train)
# # transform the data
# X, y = rfe.transform(X_train, y_train)

# Run regression models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor


In [None]:
def models_automation(models, X_train1, y_train1,X_test1,y_test1):
    """Fit each model on the training data and print its train and test score.

    One line is printed per model, labelled with the model's class name. The
    models are fitted in place; nothing is returned.
    """
    for estimator in models:
        estimator.fit(X_train1, y_train1)
        label = estimator.__class__.__name__
        train_score = estimator.score(X_train1, y_train1)
        test_score = estimator.score(X_test1, y_test1)
        print(f"{label}: Train -> {train_score}, Test -> {test_score}")


In [None]:
model_list = [LinearRegression(),SGDRegressor(),KNeighborsRegressor(), MLPRegressor(),DecisionTreeRegressor(),RandomForestRegressor()]
models_automation(model_list, X_train_pca, y_train1,X_test_pca,y_test1)

### Optimize parameters for models

### Opted to use the non-PCA train

In [None]:
model_list = [LinearRegression(),SGDRegressor(),KNeighborsRegressor(n_neighbors =9), MLPRegressor(solver = 'lbfgs', max_iter = 1500),DecisionTreeRegressor(criterion="poisson"),
              RandomForestRegressor(min_samples_split = 7,
                                    min_samples_leaf = 6,
                                    max_samples = 0.4,
                                    max_depth = 14,
                                    n_jobs = -1)]
models_automation(model_list, X_train1, y_train1,X_test1,y_test1)

In [None]:
# Linear Regression / Random Forest ??

# Predict how much our predicted Donors will give.

#### Create dataframe of ONLY predicted donors

In [None]:
pred_donors = all_pred_tarB[all_pred_tarB['predicted_B']==1]
pred_donors

In [None]:
pred_B_X=pred_donors.drop(['predicted_B','target_D'], axis=1)
pred_B_y=pred_donors['target_D']

#### Split for scaling and encoding

In [None]:
pred_B_X_cat = pred_B_X.select_dtypes(include = object)
pred_B_X_num =pred_B_X.select_dtypes(include = np.number)

#### Scale - using existing scaler

In [None]:
pred_B_X_num_norm1 = transformer.transform(pred_B_X_num)
print(pred_B_X_num_norm1.shape)
pred_B_X_num_scale1 = pd.DataFrame(pred_B_X_num_norm1, index = pred_B_X_num.index, columns=pred_B_X_num.columns)
pred_B_X_num_scale1.head()
pred_B_X_num_scale1.reset_index(drop = True, inplace = True)

#### Encode - using existing encoder

In [None]:
# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is the replacement.
cols = encoder.get_feature_names_out(input_features=pred_B_X_cat.columns)

# Encode the predicted donors with the existing, already fitted encoder
pred_B_X_cat_encode1 = pd.DataFrame(encoder.transform(pred_B_X_cat).toarray(),columns=cols)

pred_B_X_cat_encode1.reset_index(drop = True, inplace = True)
pred_B_X_cat_encode1

#### Concatenate together for all Predicted X

In [None]:
pred_B_X = pd.concat([pred_B_X_num_scale1,pred_B_X_cat_encode1], axis=1)
pred_B_X.index
pred_B_y.reset_index(drop = True, inplace = True)

## Run same PCA Transformer - Not used but here to try

In [None]:
# pred_B_X_pca = pca.transform(pred_B_X)
# pred_B_X_pca.shape


### Run through model

#### First a Random Forest

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
rfr= RandomForestRegressor(min_samples_split = 7,
                                    min_samples_leaf = 6,
                                    max_samples = 0.4,
                                    max_depth = 14,
                                    n_jobs = -1).fit(X_train1, y_train1)

In [None]:
rfr_pred = rfr.predict(pred_B_X)
rfr_pred.shape

In [None]:
RMSE_rfr = np.sqrt(mean_squared_error(pred_B_y, rfr_pred))
RMSE_rfr

#### Second a Linear Regression

In [None]:
lm = LinearRegression().fit(X_train1, y_train1)

In [None]:
lm_pred = lm.predict(pred_B_X)

In [None]:
RMSE_lm = np.sqrt(mean_squared_error(pred_B_y, lm_pred))
RMSE_lm

## Considering that the mean donation was 15.6 dollars, our RMSE values look pretty far off. Let's check why.

## Looking at actual and predictions

In [None]:
compare_df = pd.DataFrame({'Actual Values':pred_B_y, 'RandForst Predicted Values':rfr_pred, 'LinRegress Predicted Values':lm_pred})

In [None]:
compare_df


In [None]:
compare_df[compare_df['Actual Values']!=0.0]

In [None]:
round(compare_df['RandForst Predicted Values'].sum(),2)

In [None]:
round(compare_df['Actual Values'].sum(),2)