## Assignment: Train GBM with Cross Validation

### University of California, Santa Barbara  
### PSTAT 135/235 - Big Data Analytics 
### Last Updated: January 29, 2019

---  

### MODULES

In [None]:
import os
import pandas as pd

In [None]:
import pyspark.sql.functions as F
import pyspark.mllib.regression as reg
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.linalg import SparseVector, DenseVector
from pyspark.sql.types import *

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used throughout this notebook.
# NOTE(review): with master "local" the executor core settings below may not
# have any effect -- confirm against the Spark configuration docs.
session_builder = (
    SparkSession.builder
    .master("local")
    .appName("data preprocessing")
    .config("spark.executor.memory", '8g')
    .config('spark.executor.cores', '4')
    .config('spark.cores.max', '4')
    .config("spark.driver.memory", '8g')
)
spark = session_builder.getOrCreate()

# Handle to the SparkContext behind the session.
sc = spark.sparkContext

### PARAMETERS

In [None]:
# Absolute location of the Breast Cancer Wisconsin CSV file.
path_to_data = '/home/jovyan/UCSB_BigDataAnalytics/data/brca/breast_cancer_wisconsin.csv'

In [None]:
# Target column and its raw encoding in the CSV:
#   2 = benign (negative class), 4 = malignant (positive class)
target = 'class'
negative_label = 2
positive_label = 4

# Experiment settings
SEED = 314                # random seed used for the train/test split
training_fraction = 0.6   # share of rows assigned to the training set
FOLDS = 5                 # number of cross-validation folds
ITERS = 10                # maxIter passed to the GBT classifier

### READ IN DATA

In [None]:
# Load the CSV into a Spark DataFrame; the first row supplies the column
# names and the column types are inferred from the data.
brca = spark.read.csv(
    path_to_data,
    inferSchema=True,
    header=True,
)

In [None]:
# Print the column names and inferred types of the loaded DataFrame.
brca.printSchema()

In [None]:
# Report the dimensions of the raw dataset.
n_rows = brca.count()
n_cols = len(brca.columns)
print('rows={},columns={}'.format(n_rows, n_cols))

In [None]:
# Class balance: number of rows for each distinct value of the target column.
target_counts = brca.groupBy(target).count()
target_counts.show()

In [None]:
# Recode the target as a binary label: rows equal to positive_label become 1,
# every other row becomes 0.
# NOTE: not idempotent -- re-running this cell on the already recoded column
# sets every label to 0, because neither 0 nor 1 equals positive_label.
is_positive = brca[target] == positive_label
binary_label = F.when(is_positive, 1).otherwise(0)
brca = brca.withColumn(target, binary_label)

In [None]:
# Randomly partition the rows into training and test sets using the
# configured training fraction and a fixed seed.
split_weights = [training_fraction, 1 - training_fraction]
data_train, data_test = brca.randomSplit(split_weights, seed=SEED)

In [None]:
# Report how many records ended up in each partition.
n_total = brca.count()
n_train = data_train.count()
n_test = data_test.count()
print('records_total={},records_train={},records_test={}'.format(n_total, n_train, n_test))

In [None]:
# Predictor columns that are assembled into the model's feature vector.
vars_to_keep = [
    "clump_thickness",
    "uniformity_cell_size",
    "uniformity_cell_shape",
    "marginal_adhesion",
    "single_epithelial_cell_size",
]

In [None]:
# Combine the selected predictor columns into a single vector column named
# 'features'. Columns are taken in the order they appear in data_train,
# restricted to those listed in vars_to_keep.
feature_cols = [col for col in data_train.columns if col in vars_to_keep]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [None]:
# Add the feature vector to the training rows, then keep only the label
# column and the assembled features.
assembled_train = assembler.transform(data_train)
data_train = assembled_train.select(target, "features")

In [None]:
# Preview the first 3 rows of the assembled training set (label + features).
data_train.show(3)

In [None]:
# Gradient-boosted tree classifier trained on the assembled feature vector.
# The seed is fixed to the notebook-wide SEED so that repeated runs give
# reproducible models.
# NOTE(review): the cross-validation parameter grid also lists gbt.maxIter
# values; presumably those take the place of maxIter=ITERS during the
# search -- confirm.
gbt = GBTClassifier(
    labelCol=target,
    featuresCol="features",
    maxIter=ITERS,
    seed=SEED,
)

In [None]:
# Scores candidate models during cross validation against the target column.
# The metric is written out explicitly; it is the evaluator's default.
evaluator = BinaryClassificationEvaluator(
    labelCol=target,
    metricName="areaUnderROC",
)

In [None]:
# Hyper-parameter grid searched by cross validation:
# every combination of maxIter in {1, 5} and maxDepth in {1, 2}.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(gbt.maxIter, [1, 5])
grid_builder.addGrid(gbt.maxDepth, [1, 2])
paramGrid = grid_builder.build()

In [None]:
# k-fold cross validation over the parameter grid.
# The seed is fixed to the notebook-wide SEED so the random assignment of
# rows to folds (and therefore the selected model) is reproducible across
# sessions.
crossval = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=FOLDS,
    seed=SEED,
)

In [None]:
# Run the cross-validated grid search on the assembled training data.
# The fitted result exposes the selected model as `model.bestModel`.
model = crossval.fit(data_train)

In [None]:
# Score the held-out test set with the cross-validated model.
# The target column is kept next to the features so the predictions can be
# compared with the true labels (e.g. passed to `evaluator`); selecting only
# "features" would discard the labels.
test_assembled = assembler.transform(data_test).select(target, "features")
predictions = model.transform(test_assembled)

In [None]:
# Display the model selected by cross validation.
model.bestModel

In [None]:
# Feature importances of the selected model, wrapped in a DenseVector.
importances = model.bestModel.featureImportances
feat_imp = DenseVector(importances)
feat_imp

In [None]:
# Pack variable names and feature importances into a pandas DataFrame.
# Row labels come from the assembler's input columns rather than from
# vars_to_keep: the feature vector follows the assembler's column order, so
# this keeps every importance paired with the right variable name even when
# that order (or the set of available columns) differs from vars_to_keep.
df_feat_imp = pd.DataFrame(
    index=assembler.getInputCols(),
    columns=['feature_importance'],
    data=feat_imp[:],
)

# Most important features first.
df_feat_imp = df_feat_imp.sort_values(by='feature_importance', ascending=False)

In [None]:
# Display the feature-importance table (sorted by descending importance).
df_feat_imp