## Assignment: Train GBM with Cross Validation

### University of California, Santa Barbara  
### PSTAT 135/235  
### Last Updated: Oct 31, 2018

---  

### MODULES

In [1]:
import os
import pandas as pd

In [2]:
import pyspark.sql.functions as F
import pyspark.mllib.regression as reg
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.linalg import SparseVector, DenseVector
from pyspark.sql.types import *

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a Spark session running against the
# "local" master, with the memory/core settings used throughout this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("data preprocessing")
    .config("spark.executor.memory", '8g')
    .config('spark.executor.cores', '4')
    .config('spark.cores.max', '4')
    .config("spark.driver.memory", '8g')
    .getOrCreate()
)

# SparkContext handle belonging to the session above.
sc = spark.sparkContext

### PARAMETERS

In [4]:
# Location of the input CSV.
# The directory defaults to the path this notebook was originally written
# against, but can be overridden with the BRCA_DATA_DIR environment variable so
# the notebook can run elsewhere without editing this cell.
DATA_DIR = os.environ.get('BRCA_DATA_DIR', '/home/jovyan/work/data/brca')
path_to_data = os.path.join(DATA_DIR, 'breast_cancer_wisconsin.csv')

In [None]:
# Raw label encoding of the target column:
#   class == 2 -> benign    (negative class)
#   class == 4 -> malignant (positive class)
target = 'class'
negative_label = 2
positive_label = 4

# Experiment settings: random seed, share of rows assigned to the training
# split, boosting iterations for the base estimator, and number of CV folds.
SEED = 314
training_fraction = 0.6
ITERS = 10
FOLDS = 5

### READ IN DATA

In [6]:
# Load the CSV into a Spark DataFrame: the first row supplies column names and
# column types are inferred from the data.
brca = spark.read.csv(
    path=path_to_data,
    inferSchema=True,
    header=True,
)

In [7]:
# Print the inferred schema. Worth checking: bare_nuclei is inferred as string
# while every other column is inferred as integer.
brca.printSchema()

root
 |-- sample_code_number: integer (nullable = true)
 |-- clump_thickness: integer (nullable = true)
 |-- uniformity_cell_size: integer (nullable = true)
 |-- uniformity_cell_shape: integer (nullable = true)
 |-- marginal_adhesion: integer (nullable = true)
 |-- single_epithelial_cell_size: integer (nullable = true)
 |-- bare_nuclei: string (nullable = true)
 |-- bland_chromatin: integer (nullable = true)
 |-- normal_nucleoli: integer (nullable = true)
 |-- mitosis: integer (nullable = true)
 |-- class: integer (nullable = true)



In [8]:
# Report the dataset dimensions (row count triggers a Spark job).
n_rows = brca.count()
n_cols = len(brca.columns)
print('rows={},columns={}'.format(n_rows, n_cols))

rows=699,columns=11


In [9]:
# Class balance: number of rows per distinct value of the target column.
class_counts = brca.groupBy(target).count()
class_counts.show()

+-----+-----+
|class|count|
+-----+-----+
|    4|  241|
|    2|  458|
+-----+-----+



In [10]:
# Recode the target as a binary 0/1 label: rows whose value equals
# positive_label become 1, every other row becomes 0.
# NOTE: `brca` is overwritten here, so this cell is not idempotent. Re-running
# it on the already recoded column (values 0/1, positive_label = 4) would set
# every label to 0; re-run from the read cell instead.
is_positive = brca[target] == positive_label
brca = brca.withColumn(target, F.when(is_positive, 1).otherwise(0))

In [11]:
# Seeded random train/test partition; the test weight is the complement of
# training_fraction.
split_weights = [training_fraction, 1 - training_fraction]
data_train, data_test = brca.randomSplit(split_weights, seed=SEED)

In [12]:
# Sanity check: sizes of the full dataset and of each split.
n_total = brca.count()
n_train = data_train.count()
n_test = data_test.count()
print('records_total={},records_train={},records_test={}'.format(n_total, n_train, n_test))

records_total=699,records_train=420,records_test=279


In [13]:
# Predictor columns fed to the model. The remaining columns in the schema
# (sample_code_number, bare_nuclei, bland_chromatin, normal_nucleoli, mitosis)
# are not used as features.
vars_to_keep = [
    'clump_thickness',
    'uniformity_cell_size',
    'uniformity_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
]

In [14]:
# Assemble the selected predictors into a single vector column named 'features'.
#
# vars_to_keep is passed directly (instead of filtering data_train.columns) so that:
#   * the element order of the feature vector is exactly the order of
#     vars_to_keep, which is what the feature-importance table later assumes;
#   * a name in vars_to_keep that is absent from the data surfaces as an error
#     when the assembler is applied, rather than being silently dropped;
#   * this cell can be re-run after data_train has been reduced to the
#     (target, features) columns without ending up with an empty input list.
assembler = VectorAssembler(
    inputCols=list(vars_to_keep),
    outputCol='features',
)

In [15]:
# Add the assembled 'features' vector to the training split, then keep only the
# label and feature columns.
# NOTE: data_train is overwritten, so this cell cannot be re-run on its own
# output (the assembler's input columns are no longer present afterwards).
assembled_train = assembler.transform(data_train)
data_train = assembled_train.select(target, "features")

In [16]:
# Peek at the first three assembled training rows.
data_train.show(n=3)

+-----+--------------------+
|class|            features|
+-----+--------------------+
|    0|[5.0,4.0,3.0,1.0,...|
|    1|[9.0,1.0,2.0,6.0,...|
|    1|[10.0,4.0,7.0,2.0...|
+-----+--------------------+
only showing top 3 rows



In [17]:
# Gradient-boosted tree classifier trained on the assembled 'features' column.
# seed=SEED pins the estimator's random seed so repeated runs are comparable.
# Note: the parameter grid handed to CrossValidator also varies maxIter, so
# ITERS only applies when `gbt` is fit directly, outside cross-validation.
gbt = GBTClassifier(
    labelCol=target,
    featuresCol="features",
    maxIter=ITERS,
    seed=SEED,
)

In [18]:
# Scores binary predictions against the target column. areaUnderROC is the
# evaluator's default metric; it is spelled out here so the selection criterion
# used by cross-validation is visible.
evaluator = BinaryClassificationEvaluator(
    labelCol=target,
    metricName="areaUnderROC",
)

In [19]:
# Hyper-parameter grid searched by cross-validation:
# 2 values of maxIter x 2 values of maxDepth = 4 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [1, 5])
    .addGrid(gbt.maxDepth, [1, 2])
    .build()
)

In [20]:
# k-fold cross validation over the parameter grid, scored by `evaluator`.
# seed=SEED fixes the random assignment of rows to folds, so the selected
# hyper-parameters are reproducible from one run to the next.
crossval = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=FOLDS,
    seed=SEED,
)

In [21]:
# Run the cross-validated grid search on the training split; the fitted result
# exposes the winning model as `model.bestModel`.
model = crossval.fit(dataset=data_train)

In [81]:
# Score the held-out test split.
# The target column is kept next to the features (mirroring how the training
# split was prepared) so the predictions can be compared with the true labels,
# e.g. by passing `predictions` to `evaluator`.
test_features = assembler.transform(data_test).select(target, "features")
predictions = model.transform(test_features)

In [22]:
# Display the model selected by cross-validation.
model.bestModel

GBTClassificationModel (uid=GBTClassifier_46cdbda6c695cf72bce1) with 5 trees

In [173]:
# Feature importances of the best cross-validated model, converted to a dense
# vector with one entry per assembled input column.
best_gbt = model.bestModel
feat_imp = DenseVector(best_gbt.featureImportances)
feat_imp

DenseVector([0.0791, 0.1913, 0.3531, 0.1058, 0.2706])

In [174]:
# pack variables, feature importances into pandas dataframe
#
# Each importance is labelled with the column the assembler actually placed at
# that position of the feature vector (assembler.getInputCols()), rather than
# assuming vars_to_keep is listed in the same order. The frame is then sorted
# with the most important feature first; sort_values returns a new frame, so
# the cell gives the same result when re-run.
df_feat_imp = pd.DataFrame(
    index=assembler.getInputCols(),
    columns=['feature_importance'],
    data=feat_imp[:],  # slice of the dense vector: the importance values in vector order
).sort_values(by='feature_importance', ascending=False)

In [175]:
# Display the importance table, sorted with the most important feature first.
df_feat_imp

Unnamed: 0,feature_importance
uniformity_cell_shape,0.353109
single_epithelial_cell_size,0.270604
uniformity_cell_size,0.191308
marginal_adhesion,0.105838
clump_thickness,0.07914
