In [None]:
# Published Google Sheets link (CSV output) that the notebook later reads
# with pandas.read_csv. The literal is split only to keep lines short.
data_url = (
    "https://docs.google.com/spreadsheets/d/e/"
    "2PACX-1vQzBYWDif8AqH47QpdaMsxZ0d3aXafgvL6EfnsUk6iN5QPCgrhvEky7hzI16iyfL3L2rfec3QX32JQj"
    "/pub?gid=0&single=true&output=csv"
)
# Echo the URL as the cell output
data_url

In [27]:
import pandas
import numpy as np

In [None]:
# Download the published CSV behind data_url into a DataFrame
data = pandas.read_csv(filepath_or_buffer=data_url)

# Function to convert currency strings such as "$1,234" to numbers
def convert_currency_to_int(value):
    """Convert a single currency-formatted cell value to a number.

    Only strings that contain ``$`` are touched. The dollar sign, thousands
    separators (``,``) and surrounding whitespace are removed before parsing.

    Parameters
    ----------
    value : object
        One cell value of a DataFrame (any type).

    Returns
    -------
    int
        When the cleaned text is a whole number (``"$1,234"`` -> ``1234``,
        ``"$100.00"`` -> ``100``).
    float
        When the cleaned text has a fractional part (``"$12.50"`` -> ``12.5``).
    object
        ``value`` itself, unchanged, when it is not a ``$`` string or when the
        cleaned text is not a finite number.
    """
    # Guard clause: leave everything that is not a "$..." string alone.
    if not (isinstance(value, str) and "$" in value):
        return value

    cleaned_value = value.replace("$", "").replace(",", "").strip()

    try:
        return int(cleaned_value)
    except ValueError:
        pass  # not a plain integer; it may still be a decimal amount

    try:
        amount = float(cleaned_value)
    except ValueError:
        return value  # not numeric at all: keep the original text

    # float() also accepts "nan" / "inf"; those are not currency amounts.
    if amount != amount or amount in (float("inf"), float("-inf")):
        return value

    # Keep whole amounts as int so "$100.00" and "$100" give the same result.
    return int(amount) if amount.is_integer() else amount

# Apply the conversion function to every cell of the DataFrame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use whichever method the installed pandas provides.
elementwise = data.map if hasattr(pandas.DataFrame, "map") else data.applymap
data = elementwise(convert_currency_to_int)

# Preview the data. Jupyter renders only the LAST expression of a cell, so
# intermediate results need an explicit display() (a builtin inside the
# IPython kernel). head() keeps the preview short.
display(data.head())

# display column names
display(data.columns)

# list data types (info() prints its own output)
data.info()

# generate descriptive stats for data (last expression, rendered by Jupyter)
data.describe()

In [29]:
# Columns treated as noise / redundant / missing data; handed to the PyCaret
# experiment as the features to ignore.
ignored_cols = [
    "DonorUniqueId",
    "PreferredAddressType",
    "DonorPostalCode",
    "DonorDateOfBirth",
    "CumulativeDonationAmount",
]

In [30]:
# Columns declared as categorical features for the PyCaret experiment
cat_cols = [
    "GenderIdentity",
    "IsMemberFlag",
    "IsAlumnusFlag",
    "IsParentFlag",
    "HasInvolvementFlag",
    "HasEmailFlag",
    "MaritalStatus",
    "WealthRating",
    "AcademicDegreeLevel",
]

In [31]:
# Columns declared as numeric features for the PyCaret experiment
num_cols = [
    "DonorAge",
    "ConsecutiveDonorYears",
    "LastFiscalYearDonation",
    "Donation2FiscalYearsAgo",
    "Donation3FiscalYearsAgo",
    "Donation4FiscalYearsAgo",
    "Donation5FiscalYearsAgo",
    "CurrentFiscalYearDonation",
]

In [38]:
# Name of the dependent / target / label column.
# The trailing period is part of the name: the same spelling is used in the
# pred_cols list further down.
y = "DonorIndicatorFlag."

In [None]:
# Installing Pycaret for Automated ML
!pip install -U --pre pycaret

In [None]:
# Installing sweetviz package for visualization and statistical analysis
!pip install sweetviz

In [13]:
# Importing the package
import sweetviz as sv

In [14]:
# Columns included in the EDA dataset: every numeric and categorical feature,
# followed by the target column and the cumulative donation total.
# Derived from num_cols, cat_cols and y (defined in earlier cells) instead of
# repeating the names, so this list cannot drift from the modelling setup.
# The resulting order is the same as the hand-written list it replaces.
pred_cols = [*num_cols, *cat_cols, y, 'CumulativeDonationAmount']

In [None]:
# Restrict the full dataset to the columns listed in pred_cols
pred_data = data.loc[:, pred_cols]

# Show the resulting subset as the cell output
pred_data

In [None]:
# Generating the EDA report object with sweetviz from the pred_data subset.
# Nothing is written to disk here; the HTML file is produced by the
# show_html() call in a later cell.
sv_report = sv.analyze(pred_data)

In [None]:
# Generate an HTML-formatted report named "Red Cross Report.html".
# NOTE(review): the path is relative, so the file presumably lands in the
# kernel's working directory — confirm.
sv_report.show_html("Red Cross Report.html")

In [None]:
# Installing dataprep package for data profiling
!pip install dataprep

# Importing dataprep package for data profiling
from dataprep.eda import create_report

# Generating the data profiling report
create_report(data).save()

In [None]:
# Importing the pycaret classification API and setting up the experiment.
# NOTE(review): the star import is kept because later cells call
# compare_models(), create_model() and predict_model() unqualified; prefer
# explicit imports once the full set of names in use is known.
from pycaret.classification import *

classification_model_setup = setup(
    data,
    target=y,                        # label column
    ignore_features=ignored_cols,    # columns excluded from training
    numeric_features=num_cols,
    categorical_features=cat_cols,
    train_size=0.7,                  # 70 % of the rows are used for training
    session_id=42,                   # fixed seed so the split and the model
                                     # leaderboard are reproducible across runs
)

In [39]:
# Invoking the ML algorithms via PyCaret's model comparison.
# The recorded output below ranks the candidates, with the Gradient Boosting
# Classifier ('gbc') in the first row.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.697,0.7569,0.697,0.821,0.6927,0.4402,0.5223,2.211
ada,Ada Boost Classifier,0.696,0.7553,0.696,0.8185,0.6918,0.4379,0.5187,1.034
lr,Logistic Regression,0.6915,0.7572,0.6915,0.7812,0.6911,0.4187,0.4734,1.886
lightgbm,Light Gradient Boosting Machine,0.689,0.7562,0.689,0.7687,0.6898,0.4099,0.4566,1.412
xgboost,Extreme Gradient Boosting,0.6887,0.7561,0.6887,0.7638,0.6899,0.4071,0.4503,0.743
qda,Quadratic Discriminant Analysis,0.6856,0.7549,0.6856,0.8234,0.679,0.4237,0.5144,0.709
dt,Decision Tree Classifier,0.6829,0.7374,0.6829,0.7299,0.6868,0.3799,0.4024,0.485
rf,Random Forest Classifier,0.6811,0.7554,0.6811,0.7202,0.6855,0.3705,0.3875,2.171
et,Extra Trees Classifier,0.6773,0.7348,0.6773,0.7247,0.6812,0.3693,0.3915,2.917
svm,SVM - Linear Kernel,0.6583,0.0,0.6583,0.6511,0.5946,0.2313,0.2753,1.163


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Using the best performing model: 'gbc' is the ID of the Gradient Boosting
# Classifier, the top row of the recorded compare_models() leaderboard.
# NOTE(review): the ID is hard-coded, so it will not follow the leaderboard if
# a re-run ranks a different model first.
model = create_model('gbc')

In [34]:
# Load the client dataset from its published Google Sheets CSV link.
# The URL literal is split only to keep lines short.
client_data = pandas.read_csv(
    "https://docs.google.com/spreadsheets/d/e/"
    "2PACX-1vQfRxnJl2drvo5fEUs8toEpIiniUlvr91eVYumIkCOKTewYMQ8f2WFY6XzKZyad51fOPZpxPcu9AOhf"
    "/pub?gid=420791075&single=true&output=csv"
)
# (rows, columns) of the loaded frame
client_data.shape

(6270, 22)

In [35]:
# Clean the client data with the same currency conversion that was applied to
# the training data, then predict with the trained model.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use whichever method the installed pandas provides.
client_elementwise = (
    client_data.map if hasattr(pandas.DataFrame, "map") else client_data.applymap
)
client_data = client_elementwise(convert_currency_to_int)

# Run the trained model over the cleaned client records
client_data_predictions = predict_model(model, client_data)

In [None]:
# Show the prediction output for the client dataset as the cell result
client_data_predictions

In [37]:
# Persist the predictions to a CSV file (relative path)
client_data_predictions.to_csv(path_or_buf="Red Cross -ML Algo.csv")