## Coding functions to support several supervised learning tasks.  

### MODULES

In [1]:
import os

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("data preprocessing") \
    .config("spark.executor.memory", '8g') \
    .config('spark.executor.cores', '4') \
    .config('spark.cores.max', '4') \
    .config("spark.driver.memory",'8g') \
    .getOrCreate()

sc = spark.sparkContext

### PARAMETERS

In [3]:
# class = 2 for benign (negative class), 4 for malignant (positive class)
target = 'class'
positive_label = 4
negative_label = 2

SEED = 314

In [4]:
brca = spark.read.csv('breast_cancer_wisconsin.csv', header=True, inferSchema=True)

In [6]:
brca.count()

699

In [7]:
# compute distribution of target variable
brca.groupBy(target).count().show()

+-----+-----+
|class|count|
+-----+-----+
|    4|  241|
|    2|  458|
+-----+-----+



In [8]:
poscount = brca.groupBy(target).count().filter(brca['class'] == 4)
negcount = brca.groupBy(target).count().filter(brca['class'] == 2)

In [9]:
negcount.collect()[0][1]

458

### Function to implement downsampling:

In [10]:
brca.filter(brca['class'] == 4).sample(.25, SEED)

DataFrame[id: int, clump_thickness: int, uniformity_cell_size: int, uniformity_cell_shape: int, marginal_adhesion: int, single_epithelial_cell_size: int, bare_nuclei: string, bland_chromatin: int, normal_nucleoli: int, mitoses: int, class: int]

In [11]:
def downsample(df, target, positive_label, negative_label):
    """
    df              spark dataframe
    target          str, target variable
    positive_label  int, value of positive label
    negative_label  int, value of negative label
    
    """

    seed=311
    
    # find larger value
    poscount = df.groupBy(target).count().filter(df[target] == 4).collect()[0][1]
    negcount = df.groupBy(target).count().filter(df[target] == 2).collect()[0][1]
    
    if poscount > negcount:
        larger_value = poscount
        lower_value = negcount
        sampler = 4
        not_sample = 2
    elif poscount < negcount:
        larger_value = negcount
        lower_value = poscount
        sampler = 2
        not_sample = 4
    
    percent_to_sample = lower_value/larger_value
    print(percent_to_sample)
    print(larger_value)
    
    
    new_sample = df.filter(df[target] == sampler).sample(percent_to_sample, seed)
    
    old_sample = df.filter(df[target] == not_sample)
    df_b = new_sample.union(old_sample)

    return df_b

In [12]:
# because Spark uses Bernouilli Sampling, each row is assigned a probability of being included
# that's why the fraction is not exactly how I call it as above
downsample(brca, target, positive_label, negative_label).groupBy(target).count().show()

0.5262008733624454
458
+-----+-----+
|class|count|
+-----+-----+
|    4|  241|
|    2|  253|
+-----+-----+



### Univariate AUC Measurement (of select features):

In [13]:
# load modules
import pandas as pd
import pyspark.sql.functions as F
import pyspark.mllib.regression as reg
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics


In [14]:
# parameters
training_fraction = .6
ITERS = 10

In [15]:
# narrow the list of features for modeling
vars_to_keep = ['clump_thickness', 'uniformity_cell_size', 
                'uniformity_cell_shape', 'marginal_adhesion', 'single_epithelial_cell_size']

In [16]:
brca_f = brca.select([target]+vars_to_keep)

In [17]:
# Show some rows to be sure things look ok.
brca_f.show(10)

+-----+---------------+--------------------+---------------------+-----------------+---------------------------+
|class|clump_thickness|uniformity_cell_size|uniformity_cell_shape|marginal_adhesion|single_epithelial_cell_size|
+-----+---------------+--------------------+---------------------+-----------------+---------------------------+
|    2|              5|                   1|                    1|                1|                          2|
|    2|              5|                   4|                    4|                5|                          7|
|    2|              3|                   1|                    1|                1|                          2|
|    2|              6|                   8|                    8|                1|                          3|
|    2|              4|                   1|                    1|                3|                          2|
|    4|              8|                  10|                   10|                8|            

In [18]:
# map target labels to 0/1
brca_f = brca_f.withColumn(target, F.when(brca_f[target] == positive_label, 1).otherwise(0))

In [19]:
brca_f.groupBy(target).count().show()


+-----+-----+
|class|count|
+-----+-----+
|    1|  241|
|    0|  458|
+-----+-----+



In [20]:
brca_f.dtypes

[('class', 'int'),
 ('clump_thickness', 'int'),
 ('uniformity_cell_size', 'int'),
 ('uniformity_cell_shape', 'int'),
 ('marginal_adhesion', 'int'),
 ('single_epithelial_cell_size', 'int')]

In [21]:
### TEST ###
data_train, data_test = brca_f.randomSplit([.6,.4], 311)
data_train.show(5)

+-----+---------------+--------------------+---------------------+-----------------+---------------------------+
|class|clump_thickness|uniformity_cell_size|uniformity_cell_shape|marginal_adhesion|single_epithelial_cell_size|
+-----+---------------+--------------------+---------------------+-----------------+---------------------------+
|    0|              1|                   1|                    1|                1|                          1|
|    0|              1|                   1|                    1|                1|                          1|
|    0|              1|                   1|                    1|                1|                          1|
|    0|              1|                   1|                    1|                1|                          1|
|    0|              1|                   1|                    1|                1|                          1|
+-----+---------------+--------------------+---------------------+-----------------+------------

In [22]:
data_train.select('class', 'clump_thickness').show(5)

+-----+---------------+
|class|clump_thickness|
+-----+---------------+
|    0|              1|
|    0|              1|
|    0|              1|
|    0|              1|
|    0|              1|
+-----+---------------+
only showing top 5 rows



In [23]:
def compute_univariate_aucs(df, target, training_fraction, iters, seed):

    # train/test data
    train_test = [training_fraction, 1-training_fraction]
    data_train, data_test = df.randomSplit(train_test, seed)
    
    # predictor variables
    vars = ['clump_thickness', 'uniformity_cell_size', 
            'uniformity_cell_shape', 'marginal_adhesion', 'single_epithelial_cell_size']
    
    # results storage
    df_auc = pd.DataFrame(index=vars, columns=['weight','auroc'])    

    for v in vars:    
        print('=== analysis of variable: {}'.format(v))

        # create train and test dataframes with columns: target, v
        datai_tr = data_train.select(target, v)
        datai_te = data_test.select(target, v)

        # cast to LabeledPoint
        # train
        datai_tr_lp = datai_tr \
                     .rdd \
                     .map(lambda row: reg.LabeledPoint(row[0], row[1:]))
        
        # test
        datai_te_lp = datai_te \
                     .rdd \
                     .map(lambda row: reg.LabeledPoint(row[0], row[1:]))

        
        # train logistic regression
        LR_Model = LogisticRegressionWithLBFGS.train(datai_tr_lp, iterations=iters, intercept=True)

        #zip labels with predicted labels and cast to float
        act_pred_test_set = datai_te_lp.map(lambda p: float(p.label)) \
                             .zip(LR_Model.predict(datai_te_lp \
                             .map(lambda p: p.features))) \
                             .map(lambda row: (row[0], row[1] * 1.0))
        
        metrics = BinaryClassificationMetrics(act_pred_test_set)
        
        df_auc['weight'].loc[v] = LR_Model.weights # store the weights
        df_auc['auroc'].loc[v] = metrics.areaUnderROC # extract AUROC
        print('=== completed analysis of variable: {}'.format(v))
        
    df_auc.sort_values(by='auroc', ascending=False, inplace=True)
    
    return df_auc

In [24]:
compute_univariate_aucs(brca_f, target, training_fraction, ITERS, SEED)

=== analysis of variable: clump_thickness
=== completed analysis of variable: clump_thickness
=== analysis of variable: uniformity_cell_size
=== completed analysis of variable: uniformity_cell_size
=== analysis of variable: uniformity_cell_shape
=== completed analysis of variable: uniformity_cell_shape
=== analysis of variable: marginal_adhesion
=== completed analysis of variable: marginal_adhesion
=== analysis of variable: single_epithelial_cell_size
=== completed analysis of variable: single_epithelial_cell_size


Unnamed: 0,weight,auroc
uniformity_cell_size,[1.5840136875706783],0.916057
uniformity_cell_shape,[1.4606620376397053],0.91489
clump_thickness,[0.9759376678175061],0.88742
marginal_adhesion,[1.0115025322990916],0.871945
single_epithelial_cell_size,[1.3059294130781745],0.866864


In [None]:
# !jupyter nbconvert --to pdf`pwd`/*.ipynb