## Banking Campaign

Dataset Source: https://www.kaggle.com/datasets/prakharrathi25/banking-dataset-marketing-targets?select=train.csv

##### Import Necessary Libraries

In [0]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, DoubleType

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

##### Versions of Libraries, Modules, Frameworks Used in This Project

In [0]:
# `spark` is never created in this notebook; presumably it is the session object
# injected by the Databricks runtime — confirm before running elsewhere.
print("Apache Spark version:", spark.version)

Apache Spark version: 3.3.0


#### Create Functions Used Throughout This Project

##### Create Function to Ingest Dataset

In [0]:
def ingest_data_csv(
    file_location: str,
    schema: StructType,
    delimiter: str = ',',
) -> pyspark.sql.dataframe.DataFrame:
    '''
    Read one delimited text file into a Spark DataFrame using an explicit schema.

    Parameters:
        file_location: path handed to the reader's `load`.
        schema: column names and types applied to the file.
        delimiter: field separator passed as the reader's "sep" option.

    Returns:
        The DataFrame produced by the reader. The first row of the file is
        treated as a header and schema inference is switched off.
    '''
    reader_options = {
        "inferSchema": "false",
        "header": "true",
        "sep": delimiter,
    }

    reader = spark.read.format("csv").options(**reader_options).schema(schema)
    return reader.load(file_location)

##### Ingest Multiple Data Files as One Dataset (Which Calls 'ingest_data_csv' Function for Each Data File)

In [0]:
def ingest_multiple_datasets(data_files: [str], \
                             schema: StructType, \
                             delimiter: str = ',' \
                            ) -> pyspark.sql.dataframe.DataFrame:
    '''
    Read several CSV files that share one schema and stack them into a
    single DataFrame.

    Parameters:
        data_files: paths of the files to read, in the order they are appended.
        schema: schema applied to every file.
        delimiter: field separator forwarded to 'ingest_data_csv'.

    Returns:
        The union of all files. When `data_files` is empty the result is an
        empty DataFrame that still carries `schema`.
    '''
    # Seed with an empty frame so an empty file list still yields a DataFrame
    # with the expected columns.
    dataset = spark.createDataFrame([], schema)

    for file_location in data_files:
        # `union` matches columns by position, which is safe here because
        # every file is read with the same schema.
        dataset = dataset.union(ingest_data_csv(file_location, schema, delimiter))

    return dataset

##### Function to Remove Class Imbalance

In [0]:
def balance_dataset(dataset: pyspark.sql.dataframe.DataFrame, \
                    unique_label_values: [str], \
                    new_schema: StructType, \
                    samples: int = 20000, 
                    label_col: str = "label" \
                   ) -> pyspark.sql.dataframe.DataFrame:
    '''
    Resample every class so that each one holds roughly `samples` rows.

    Parameters:
        dataset: frame containing the label column.
        unique_label_values: label values to keep; each is resampled on its own.
        new_schema: schema of the returned frame (must match `dataset`).
        samples: target number of rows per class.
        label_col: name of the label column.

    Returns:
        The union of the resampled per-class frames. Classes larger than
        `samples` are undersampled without replacement; smaller classes are
        oversampled with replacement, so duplicate rows are expected. Sampling
        is fraction-based, so per-class counts only approximate `samples`.

    Raises:
        ValueError: if a requested label value has no rows in `dataset`
            (previously this surfaced as a ZeroDivisionError).
    '''
    balanced = spark.createDataFrame([], new_schema)

    for label_value in unique_label_values:
        # Rows belonging to the current class only.
        class_rows = dataset.where(F.col(label_col).isin(label_value))

        class_count = class_rows.count()
        if class_count == 0:
            raise ValueError(
                f"Cannot balance label {label_value!r}: "
                f"column {label_col!r} has no rows with that value."
            )

        # Fraction of the class to draw; rounding is kept so the sampled rows
        # stay reproducible for a given seed.
        ratio = round(samples / class_count, 4)
        if ratio > 1.0:
            # Oversample (with replacement)
            class_rows = class_rows.sample(True, ratio, seed=42)
        elif ratio < 1.0:
            # Undersample (without replacement)
            class_rows = class_rows.sample(False, ratio, seed=42)
        # ratio == 1.0: the class already has the requested size.

        balanced = balanced.union(class_rows)

    return balanced

##### Function to Find Lower & Upper Bound for Outlier Removal

In [0]:
def outlier_detector(dataset: "pyspark.sql.dataframe.DataFrame",
                     col: str,
                     iqr_multiplier: float = 1.5,
                     relative_error: float = 0.0
                    ) -> list:
    '''
    Compute outlier fences for one numeric column with the IQR rule.

    Parameters:
        dataset: frame providing `approxQuantile`.
        col: name of the numeric column to inspect.
        iqr_multiplier: how many IQRs beyond the quartiles the fences sit
            (1.5 by default, the classic rule).
        relative_error: forwarded to `approxQuantile`; 0.0 requests exact
            quartiles.

    Returns:
        [lower_bound, upper_bound] where
        lower_bound = Q1 - iqr_multiplier * IQR and
        upper_bound = Q3 + iqr_multiplier * IQR.

    Raises:
        ValueError: if the quartiles cannot be computed (for example the
            column has no rows).
    '''
    quartiles = dataset.approxQuantile(col, [0.25, 0.75], relative_error)
    if len(quartiles) != 2:
        raise ValueError(
            f"Could not compute quartiles for column {col!r}; "
            "is the column empty?"
        )

    q1, q3 = quartiles
    iqr = q3 - q1
    lower_bound = q1 - (iqr * iqr_multiplier)
    upper_bound = q3 + (iqr * iqr_multiplier)

    return [lower_bound, upper_bound]

##### Scaler Decision Function

In [0]:
def scaler_decider(features_column: str = "features", \
                   scaler: str = 'None' \
                  ) -> dict:
    '''
    Build the optional feature-scaling pipeline stage selected by name.

    Parameters:
        features_column: name of the assembled vector column to scale.
        scaler: one of 'min_max', 'max_abs', 'standard' or 'robust'. Any
            other value (including the default 'None') selects no scaling.

    Returns:
        A dict with two keys:
          "stage": a one-element list holding the configured scaler, or an
              empty list when no scaling was selected;
          "output_column": "scaled_features" when a scaler was built,
              otherwise `features_column` unchanged.
    '''
    scaled_column = "scaled_features"
    known_scalers = ('min_max', 'max_abs', 'standard', 'robust')

    # No scaler requested (or an unrecognised name): features pass through
    # untouched, so downstream code should keep reading the input column.
    if scaler not in known_scalers:
        return {"stage" : [], "output_column" : features_column}

    # Imported only when a scaler is actually built.
    from pyspark.ml.feature import MinMaxScaler, RobustScaler, MaxAbsScaler, StandardScaler

    if scaler == 'min_max':
        scaler_obj = MinMaxScaler(inputCol=features_column, outputCol=scaled_column)
    elif scaler == 'max_abs':
        scaler_obj = MaxAbsScaler(inputCol=features_column, outputCol=scaled_column)
    elif scaler == 'standard':
        scaler_obj = StandardScaler(inputCol=features_column, outputCol=scaled_column,
                                    withStd=True, withMean=True)
    else:
        # 'robust': RobustScaler with its library defaults.
        scaler_obj = RobustScaler(inputCol=features_column, outputCol=scaled_column)

    return {"stage" : [scaler_obj], "output_column" : scaled_column}

##### Function to Prepare Dataset for Analysis (Pipeline Stages, Pipeline, Fit, & Transform)

In [0]:
def prepare_data(dataset: pyspark.sql.dataframe.DataFrame, \
                         categorical_columns: [str], \
                         numerical_columns: [str], \
                         scaler: str = "None", \
                         label = "label" \
                ) -> pyspark.sql.dataframe.DataFrame:
    '''
    Build the feature-engineering pipeline, fit it on `dataset` and return
    the transformed frame.

    Pipeline stages, in order:
      1. For every categorical column: StringIndexer (-> "<col>_index")
         followed by OneHotEncoder (-> "<col>_class_vec").
      2. VectorAssembler combining all "<col>_class_vec" columns and the
         numerical columns into "features".
      3. Optionally, the scaler stage returned by 'scaler_decider'.

    Parameters:
        dataset: frame to fit the pipeline on and to transform.
        categorical_columns: columns to index and one-hot encode.
        numerical_columns: columns passed to the assembler as they are.
        scaler: scaler name forwarded to 'scaler_decider'; the default
            'None' skips scaling.
        label: currently unused; kept so existing callers keep working.

    Returns:
        `dataset` with the intermediate index/vector columns and the
        "features" column appended (plus the scaler's output column when a
        scaler was requested).

    NOTE(review): the pipeline is fit on whatever frame is passed in. If that
    frame is later split into train/test, the fitted encoders and scaler have
    already seen the test rows.
    '''
    stages = []

    # Prepare categorical features
    for cat in categorical_columns:
        indexer = StringIndexer(inputCol=cat, outputCol=cat + "_index")
        encoder = OneHotEncoder(inputCols=[indexer.getOutputCol()],
                                outputCols=[cat + "_class_vec"])
        stages += [indexer, encoder]

    # Assemble encoded categoricals and raw numericals into one vector
    assembler_inputs = [c + "_class_vec" for c in categorical_columns] + numerical_columns
    stages.append(VectorAssembler(inputCols=assembler_inputs, outputCol="features"))

    # Optional scaler applied to the assembled vector
    if scaler != 'None':
        feature_scaler = scaler_decider("features", scaler = scaler)
        stages += feature_scaler["stage"]

    fitted_pipeline = Pipeline().setStages(stages).fit(dataset)
    return fitted_pipeline.transform(dataset)

##### Create Metrics Evaluation Function

In [0]:
def mc_evaluate_with_spark_metrics(dataset: pyspark.sql.dataframe.DataFrame, \
                                metrics: [str], \
                                model_name: str, \
                                label_col: str = "label", \
                                predictionCol: str = "prediction" \
                               ) -> None:
    '''
    Print a bordered text table with one row per requested evaluation metric.

    Parameters:
        dataset: frame holding the label and prediction columns.
        metrics: metric names, each handed to
            MulticlassClassificationEvaluator as its `metricName`.
        model_name: title centred in the table's first row.
        label_col: name of the label column.
        predictionCol: name of the prediction column.

    Every score is rounded to six decimal places before it is printed.
    '''
    border = "+---------------------------------------------+"
    row_format = "|   %s  |  %s   |"

    # Title row and column headings.
    header_lines = [
        border,
        "|  " + model_name.center(41) + "  |",
        border,
        row_format % ("Metric".rjust(20), "Value".ljust(14)),
        border,
    ]
    print("\n".join(header_lines))

    # One evaluator per metric, one table row per score.
    for metric_name in metrics:
        score = MulticlassClassificationEvaluator(
            labelCol=label_col,
            predictionCol=predictionCol,
            metricName=metric_name,
        ).evaluate(dataset)
        print(row_format % (metric_name.rjust(20), str(round(score, 6)).ljust(14)))
        print(border)

#### Ingest & Preprocess Dataset

##### Ingest Dataset

In [0]:
# Column layout of the source files, in file order; every column is nullable.
orig_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in [
        ("age", IntegerType),
        ("job", StringType),
        ("marital", StringType),
        ("education", StringType),
        ("default", StringType),
        ("balance", IntegerType),
        ("housing", StringType),
        ("loan", StringType),
        ("contact", StringType),
        ("day", IntegerType),
        ("month", StringType),
        ("duration", IntegerType),
        ("campaign", IntegerType),
        ("pdays", IntegerType),
        ("previous", IntegerType),
        ("poutcome", StringType),
        ("label", StringType),
    ]
])

data_files = [
    "/FileStore/tables/banking_dataset/train.csv",
    "/FileStore/tables/banking_dataset/test.csv",
]

# Both files are semicolon-delimited and are stacked into one frame;
# "pdays" is not used in the rest of the notebook.
df = ingest_multiple_datasets(data_files, orig_schema, delimiter=';').drop("pdays")

display(df)

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,poutcome,label
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0,unknown,no
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0,unknown,no
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,0,unknown,no
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0,unknown,no
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0,unknown,no
35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,0,unknown,no
28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,0,unknown,no
42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,0,unknown,no
58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,0,unknown,no
43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,0,unknown,no


Output can only be rendered in Databricks

##### Return Some Summary Statistics about Dataset

In [0]:
# The summary is printed in two column groups so each table fits the output width.
first_half = ["age", "job", "marital", "education", "default", "balance", "housing", "loan"]
second_half = [column for column in df.columns if column not in first_half]

df.select(first_half).summary().show()
df.select(second_half).summary().show()

+-------+-----------------+-------+--------+---------+-------+------------------+-------+-----+
|summary|              age|    job| marital|education|default|           balance|housing| loan|
+-------+-----------------+-------+--------+---------+-------+------------------+-------+-----+
|  count|            49732|  49732|   49732|    49732|  49732|             49732|  49732|49732|
|   mean|40.95747205018901|   null|    null|     null|   null|1367.7615619721707|   null| null|
| stddev|10.61500781018469|   null|    null|     null|   null| 3041.608765766552|   null| null|
|    min|               18| admin.|divorced|  primary|     no|             -8019|     no|   no|
|    25%|               33|   null|    null|     null|   null|                72|   null| null|
|    50%|               39|   null|    null|     null|   null|               448|   null| null|
|    75%|               48|   null|    null|     null|   null|              1430|   null| null|
|    max|               95|unknown|  sin

##### Count of Null Values in Each Feature

In [0]:
# For every column, count the rows whose value is null or NaN.
df.select(
    [
        F.count(F.when(F.col(column).isNull() | F.isnan(column), column)).alias(column)
        for column in df.columns
    ]
).show()

+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+--------+--------+-----+
|age|job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|previous|poutcome|label|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+--------+--------+-----+
|  0|  0|      0|        0|      0|      0|      0|   0|      0|  0|    0|       0|       0|       0|       0|    0|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+--------+--------+-----+



##### Separate Features By Data Types for Pipeline

In [0]:
numerical_features = ['age', 'balance', 'duration']

# Everything that is neither a listed numerical feature nor the label is
# treated as categorical.
# NOTE(review): this puts the integer columns 'day', 'campaign' and 'previous'
# into the categorical list — confirm that one-hot encoding them is intended.
categorical_features = [
    name
    for name in df.columns
    if name != "label" and name not in numerical_features
]

print(numerical_features)
print(categorical_features)

['age', 'balance', 'duration']
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'day', 'month', 'campaign', 'previous', 'poutcome']


##### Remove Outliers

In [0]:
# Keep only rows strictly inside the fences of every numerical feature.
# The fences for each feature are computed on the frame already filtered by
# the features before it, so the order of `numerical_features` matters.
for feature in numerical_features:
    lower, upper = outlier_detector(df, feature)
    inside_fences = (F.col(feature) > lower) & (F.col(feature) < upper)
    df = df.filter(inside_fences)

df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- label: string (nullable = true)



###### Since there are no missing values, the imputation preprocessing step is unnecessary.

##### Handle Class Imbalance

In [0]:
# The balanced frame has exactly the columns of the frame being balanced, so
# reuse its schema instead of maintaining a second hand-written copy that
# could drift from `orig_schema`.
bal_schema = df.schema

unique_labels = ["no", "yes"]

# Target of 45,000 rows per class; sampling is fraction-based, so the
# resulting counts are approximate.
# NOTE(review): balancing happens before the train/test split further down, so
# rows duplicated by oversampling can end up in both splits.
df = balance_dataset(df,
                     unique_labels,
                     bal_schema,
                     samples=45000)

display(df)

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,poutcome,label
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0,unknown,no
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0,unknown,no
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0,unknown,no
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0,unknown,no
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,0,unknown,no
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0,unknown,no
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0,unknown,no
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0,unknown,no
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0,unknown,no
28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,0,unknown,no


Output can only be rendered in Databricks

##### Convert Label Values From String to Numeric (Double)

In [0]:
label_converter = {"no" : "0", "yes" : "1"}

# Step 1: swap the label strings for their numeric text.
# Step 2: cast that text to a double, the label type used for the model below.
df = df.replace(label_converter, subset=["label"])
df = df.withColumn("label", df["label"].cast(DoubleType()))

display(df)

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,poutcome,label
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0,unknown,0.0
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0,unknown,0.0
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0,unknown,0.0
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0,unknown,0.0
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,0,unknown,0.0
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0,unknown,0.0
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0,unknown,0.0
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0,unknown,0.0
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0,unknown,0.0
28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,0,unknown,0.0


##### Check Summary Statistics One Last Time Before Processing

In [0]:
# Same two-part summary as before, now on the filtered, balanced and relabelled frame.
first_half = ["age", "job", "marital", "education", "default", "balance", "housing", "loan"]
second_half = [column for column in df.columns if column not in first_half]

df.select(first_half).summary().show()
df.select(second_half).summary().show()

+-------+------------------+-------+--------+---------+-------+-----------------+-------+-----+
|summary|               age|    job| marital|education|default|          balance|housing| loan|
+-------+------------------+-------+--------+---------+-------+-----------------+-------+-----+
|  count|             90681|  90681|   90681|    90681|  90681|            90681|  90681|90681|
|   mean| 40.09987759288054|   null|    null|     null|   null|741.6789404616183|   null| null|
| stddev|10.912176179813132|   null|    null|     null|   null|880.7369032151236|   null| null|
|    min|                18| admin.|divorced|  primary|     no|            -1941|     no|   no|
|    25%|                32|   null|    null|     null|   null|               96|   null| null|
|    50%|                38|   null|    null|     null|   null|              424|   null| null|
|    75%|                48|   null|    null|     null|   null|             1160|   null| null|
|    max|                70|unknown|  si

##### Prepare Dataset for Model

In [0]:
# Index, one-hot encode and assemble the features (no scaler is requested).
prepped_ds = prepare_data(
    dataset=df,
    categorical_columns=categorical_features,
    numerical_columns=numerical_features,
)

display(prepped_ds)

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,poutcome,label,job_index,job_class_vec,marital_index,marital_class_vec,education_index,education_class_vec,default_index,default_class_vec,housing_index,housing_class_vec,loan_index,loan_class_vec,contact_index,contact_class_vec,day_index,day_class_vec,month_index,month_class_vec,campaign_index,campaign_class_vec,previous_index,previous_class_vec,poutcome_index,poutcome_class_vec,features
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0,unknown,0.0,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(0, 11, 14, 16, 18, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 58.0, 2143.0, 261.0))"
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0,unknown,0.0,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(0, 11, 14, 16, 18, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 58.0, 2143.0, 261.0))"
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0,unknown,0.0,2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(2, 12, 13, 16, 18, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 44.0, 29.0, 151.0))"
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0,unknown,0.0,2.0,"Map(vectorType -> sparse, length -> 11, indices -> List(2), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(2, 12, 13, 16, 18, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 44.0, 29.0, 151.0))"
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,0,unknown,0.0,9.0,"Map(vectorType -> sparse, length -> 11, indices -> List(9), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(9, 11, 13, 16, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 33.0, 2.0, 76.0))"
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0,unknown,0.0,1.0,"Map(vectorType -> sparse, length -> 11, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 3, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(1, 11, 16, 18, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 47.0, 1506.0, 92.0))"
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0,unknown,0.0,1.0,"Map(vectorType -> sparse, length -> 11, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 3, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(1, 11, 16, 18, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 47.0, 1506.0, 92.0))"
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0,unknown,0.0,11.0,"Map(vectorType -> sparse, length -> 11, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 3, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(12, 16, 17, 18, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 33.0, 1.0, 198.0))"
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0,unknown,0.0,11.0,"Map(vectorType -> sparse, length -> 11, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 3, indices -> List(), values -> List())",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(12, 16, 17, 18, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 33.0, 1.0, 198.0))"
28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,0,unknown,0.0,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())",1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 30, indices -> List(2), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 42, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 36, indices -> List(0), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 146, indices -> List(0, 12, 14, 16, 20, 23, 51, 62, 104, 140, 143, 144, 145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 28.0, 447.0, 217.0))"


##### Split Dataset into Training & Testing Datasets

In [0]:
# 80/20 split with a fixed seed; both parts are persisted because they are
# counted here and reused for training and prediction.
train_ds, test_ds = (
    split.persist()
    for split in prepped_ds.randomSplit(weights=[0.80, 0.20], seed=42)
)

print(f"There are {train_ds.count()} samples in the training dataset.")
print(f"There are {test_ds.count()} samples in the testing dataset.")

There are 72270 samples in the training dataset.
There are 18411 samples in the testing dataset.


##### Instantiate Instance of Gradient Boosted Tree Classifier

In [0]:
gbt = GBTClassifier(
    # columns read and written by the model
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    # boosting hyperparameters
    maxIter=50,
    maxDepth=6,
    stepSize=0.05,
)

##### Fit/Train Model Using Training Dataset

In [0]:
# Train the gradient-boosted trees on the persisted training split.
gbt_model = gbt.fit(train_ds)

##### Generate Predictions Using Testing Dataset

In [0]:
# Score the held-out split with the fitted model.
predictions = gbt_model.transform(test_ds)

##### Prepare Predictions for Metrics Evaluation Function

In [0]:
# Only the true label and the predicted label are needed by the evaluator.
preds = predictions.select("label", "prediction")

##### Calculate & Display Evaluations

In [0]:
metrics_to_eval = ["accuracy", "f1", "weightedPrecision", "weightedRecall"]

mc_evaluate_with_spark_metrics(
    dataset=preds,
    metrics=metrics_to_eval,
    model_name="Metrics for Banking Campaign",
)

+---------------------------------------------+
|         Metrics for Banking Campaign        |
+---------------------------------------------+
|                 Metric  |  Value            |
+---------------------------------------------+
|               accuracy  |  0.885286         |
+---------------------------------------------+
|                     f1  |  0.885097         |
+---------------------------------------------+
|      weightedPrecision  |  0.887281         |
+---------------------------------------------+
|         weightedRecall  |  0.885286         |
+---------------------------------------------+


### Notes & Other Takeaways From This Project
****
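- Pretty good results!
- Caveat: the class-balancing step samples rows with replacement (duplicate rows are visible in its output) and runs before the train/test split, as does the fitting of the feature encoders. Copies of the same row can therefore land in both splits, so the metrics above are likely optimistic. Splitting first and balancing only the training data would give a more trustworthy estimate.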
****