# PySpark Cookbook

### Tomasz Drabas, Denny Lee
#### Version: 0.1
#### Date: 2/28/2018

# Loading the data

In [1]:
import pyspark.sql.functions as func
census_path = '../data/census_income.csv'

census = spark.read.csv(census_path, header=True, inferSchema=True)

for col, typ in census.dtypes:
    if typ == 'string':
        census = census.withColumn(col, func.ltrim(func.rtrim(census[col])))
census.count()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,,pyspark,idle,,,✔


SparkSession available as 'spark'.
32561

In [2]:
census.show()

+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+-----+
|age|       workclass|fnlwgt|   education|education-num|      marital-status|       occupation| relationship|              race|   sex|capital-gain|capital-loss|hours-per-week|native-country|label|
+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+-----+
| 39|       State-gov| 77516|   Bachelors|           13|       Never-married|     Adm-clerical|Not-in-family|             White|  Male|        2174|           0|            40| United-States|<=50K|
| 50|Self-emp-not-inc| 83311|   Bachelors|           13|  Married-civ-spouse|  Exec-managerial|      Husband|             White|  Male|           0|           0|            13| United-States|<=50K|
| 38|     

In [3]:
census.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- label: string (nullable = true)

## Loading into RDD

In [4]:
census_rdd = sc.textFile(census_path)

header = census_rdd.first().split(',')
census_split = (
    census_rdd
    .map(lambda row: row.split(','))
    .map(lambda row: [e.strip() for e in row])
    .filter(lambda row: row != header) # remove header
)

census_split.take(1)

[['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married', 'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40', 'United-States', '<=50K']]

# Exploring data

## Data prep

List of columns to keep

In [5]:
cols_to_keep = census.dtypes

cols_to_keep = (
    ['label','age', 'capital-gain', 'capital-loss','hours-per-week'] + 
    [e[0] for e in cols_to_keep[:-1] if e[1] == 'string']
)

cols_to_keep

['label', 'age', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

Get numeric and categorical columns

In [6]:
import pyspark.mllib.stat as st
import numpy as np

census_subset = census.select(cols_to_keep)

cols_num = [e[0] for e in census_subset.dtypes if e[1] == 'int']
cols_cat = [e[0] for e in census_subset.dtypes[1:] if e[1] == 'string']
cols_num, cols_cat

(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])

## Numerical data

In [7]:
rdd_num = census_subset.select(cols_num).rdd.map(lambda row: [e for e in row])
stats_num = st.Statistics.colStats(rdd_num)

for col, min_, mean_, max_, var_ in zip(
      cols_num
    , stats_num.min()
    , stats_num.mean()
    , stats_num.max()
    , stats_num.variance()
):
    print('{0}: min->{1:.1f}, mean->{2:.1f}, max->{3:.1f}, stdev->{4:.1f}'
          .format(col, min_, mean_, max_, np.sqrt(var_)))

age: min->17.0, mean->38.6, max->90.0, stdev->13.6
capital-gain: min->0.0, mean->1077.6, max->99999.0, stdev->7385.3
capital-loss: min->0.0, mean->87.3, max->4356.0, stdev->403.0
hours-per-week: min->1.0, mean->40.4, max->99.0, stdev->12.3

## Categorical data

In [57]:
rdd_cat = census_subset.select(cols_cat + ['label']).rdd.map(lambda row: [e for e in row])

results_cat = {}

for i, col in enumerate(cols_cat + ['label']):
    results_cat[col] = rdd_cat.groupBy(lambda row: row[i]).map(lambda el: (el[0], len(el[1]))).collect()

for k in results_cat:
    print(k, sorted(results_cat[k], key=lambda el: el[1], reverse=True), '\n')

sex [('Male', 21790), ('Female', 10771)] 

race [('White', 27816), ('Black', 3124), ('Asian-Pac-Islander', 1039), ('Amer-Indian-Eskimo', 311), ('Other', 271)] 

label [('<=50K', 24720), ('>50K', 7841)] 

native-country [('United-States', 29170), ('Mexico', 643), ('?', 583), ('Philippines', 198), ('Germany', 137), ('Canada', 121), ('Puerto-Rico', 114), ('El-Salvador', 106), ('India', 100), ('Cuba', 95), ('England', 90), ('Jamaica', 81), ('South', 80), ('China', 75), ('Italy', 73), ('Dominican-Republic', 70), ('Vietnam', 67), ('Guatemala', 64), ('Japan', 62), ('Poland', 60), ('Columbia', 59), ('Taiwan', 51), ('Haiti', 44), ('Iran', 43), ('Portugal', 37), ('Nicaragua', 34), ('Peru', 31), ('France', 29), ('Greece', 29), ('Ecuador', 28), ('Ireland', 24), ('Hong', 20), ('Trinadad&Tobago', 19), ('Cambodia', 19), ('Laos', 18), ('Thailand', 18), ('Yugoslavia', 16), ('Outlying-US(Guam-USVI-etc)', 14), ('Hungary', 13), ('Honduras', 13), ('Scotland', 12), ('Holand-Netherlands', 1)] 

marital-statu

## Correlations

In [9]:
correlations = st.Statistics.corr(rdd_num)

In [10]:
for i, el_i in enumerate(abs(correlations) > 0.05):
    print(cols_num[i])
    
    for j, el_j in enumerate(el_i):
        if el_j and j != i:
            print('    ', cols_num[j], correlations[i][j])
            
    print()

age
     capital-gain 0.077674498166
     capital-loss 0.057774539479
     hours-per-week 0.0687557075095

capital-gain
     age 0.077674498166
     hours-per-week 0.0784086153901

capital-loss
     age 0.057774539479
     hours-per-week 0.0542563622727

hours-per-week
     age 0.0687557075095
     capital-gain 0.0784086153901
     capital-loss 0.0542563622727

# Statistical testing

In [11]:
import pyspark.mllib.linalg as ln

census_occupation = census.groupby('label').pivot('occupation').count()

census_occupation_coll = (
    census_occupation
    .rdd
    .map(lambda row: (row[1:]))
    .flatMap(lambda row: row)
    .collect()
)

len_row = len(census_occupation.collect()[0]) - 1
dense_mat = ln.Matrices.dense(len_row, 2, census_occupation_coll)

chi_sq = st.Statistics.chiSqTest(dense_mat)

print(chi_sq.pValue)

0.0

# Transforming the data

Number of distinct values

In [12]:
len_ftrs = []

for col in cols_to_keep[5:]:
    len_ftrs.append((col, census.select(col).distinct().count()))
    
len_ftrs = dict(len_ftrs)

Using hashing trick

In [47]:
import pyspark.mllib.feature as feat

final_data = (
    census
    .select(cols_to_keep)
    .rdd
    .map(lambda row: [
        list(
            feat.HashingTF(int(len_ftrs[col] / 2.0))
            .transform(row[i])
            .toArray()
        ) if i > 4
        else [row[i]] 
        for i, col in enumerate(cols_to_keep)]
    )
)

final_data.take(3)

[[['<=50K'], [39], [2174], [0], [40], [1.0, 2.0, 1.0, 5.0], [3.0, 3.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0], [2.0, 3.0, 8.0], [0.0, 3.0, 3.0, 1.0, 4.0, 1.0, 0.0], [5.0, 5.0, 3.0], [3.0, 2.0], [4.0], [1.0, 0.0, 0.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0]], [['<=50K'], [50], [0], [0], [13], [4.0, 3.0, 1.0, 8.0], [3.0, 3.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0], [5.0, 5.0, 8.0], [0.0, 1.0, 2.0, 2.0, 8.0, 1.0, 1.0], [4.0, 2.0, 1.0], [3.0, 2.0], [4.0], [1.0, 0.0, 0.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0]], [['<=50K'], [38], [0], [0], [40], [2.0, 2.0, 0.0, 3.0], [2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0], [3.0, 2.0, 3.0], [2.0, 3.0, 1.0, 3.0, 7.0, 0.0, 1.0], [5.0, 5.0, 3.0], [3.0, 2.0], [4.0], [1.0, 0.0, 0.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0]]]

Encode label

In [48]:
def labelEncode(label):
    return [int(label[0] == '>50K')]

final_data = final_data.map(lambda row: labelEncode(row[0]) + [item for sublist in row[1:]
                  for item in sublist])

# Creating an RDD for training

In [49]:
import pyspark.mllib.feature as ft
import pyspark.mllib.linalg as ln
import pyspark.mllib.regression as reg

final_data_income = final_data.map(lambda row: reg.LabeledPoint(row[0], ln.Vectors.dense(row[1:])))
final_data_income.take(2)

[LabeledPoint(0.0, [39.0,2174.0,0.0,40.0,1.0,2.0,1.0,5.0,3.0,3.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,3.0,8.0,0.0,3.0,3.0,1.0,4.0,1.0,0.0,5.0,5.0,3.0,3.0,2.0,4.0,1.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0]), LabeledPoint(0.0, [50.0,0.0,0.0,13.0,4.0,3.0,1.0,8.0,3.0,3.0,0.0,0.0,1.0,0.0,1.0,1.0,5.0,5.0,8.0,0.0,1.0,2.0,2.0,8.0,1.0,1.0,4.0,2.0,1.0,3.0,2.0,4.0,1.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0])]

In [50]:
final_data_hours = final_data.map(lambda row: reg.LabeledPoint(row[4], ln.Vectors.dense(row[0:4] + row[5:])))
final_data_hours.take(2)

[LabeledPoint(40.0, [0.0,39.0,2174.0,0.0,1.0,2.0,1.0,5.0,3.0,3.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,3.0,8.0,0.0,3.0,3.0,1.0,4.0,1.0,0.0,5.0,5.0,3.0,3.0,2.0,4.0,1.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0]), LabeledPoint(13.0, [0.0,50.0,0.0,0.0,4.0,3.0,1.0,8.0,3.0,3.0,0.0,0.0,1.0,0.0,1.0,1.0,5.0,5.0,8.0,0.0,1.0,2.0,2.0,8.0,1.0,1.0,4.0,2.0,1.0,3.0,2.0,4.0,1.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0])]

# Predicting hours of work for census respondents

Get some testing data

In [77]:
test_data_reg=sc.parallelize(final_data_hours.take(10))

Linear regression (benchmark)

In [51]:
workhours_model_lm = reg.LinearRegressionWithSGD.train(
    final_data_hours
    , iterations = 10
)

In [81]:
for t,p in zip(test_data_reg.map(lambda row: row.label).collect()
    , workhours_model_lm.predict(test_data_reg.map(lambda row: row.features)).collect()):
    print(t,p)

40.0 -2.9887777944e+74
13.0 -6.87179051853e+69
40.0 -5.20607963039e+69
40.0 -7.11940017011e+69
40.0 -4.01270942523e+69
40.0 -5.18608400327e+69
16.0 -6.68844518484e+69
45.0 -7.1064025535e+69
50.0 -1.93621350264e+75
40.0 -7.11855484288e+74

# Forecasting income levels of census respondents

Get some testing data

In [74]:
test_data_class=sc.parallelize(final_data_income.take(10))

Logistic regression

In [184]:
import pyspark.mllib.classification as cl

income_model_lr = cl.LogisticRegressionWithSGD.train(
    final_data_income
    , iterations=10
)

In [185]:
for t,p in zip(test_data_class.map(lambda row: row.label).collect()
    , income_model_lr.predict(test_data_class.map(lambda row: row.features)).collect()):
    print(t,p)

0.0 1
0.0 1
0.0 1
0.0 1
0.0 1
0.0 1
0.0 1
1.0 1
1.0 1
1.0 1

In [186]:
income_model_lr.threshold

0.5

In [59]:
income_model_lr.weights

DenseVector([1.6394, 303.7119, -7.1005, 1.384, 0.014, -0.0307, 0.0211, -0.0784, 0.1424, -0.0448, -0.1531, -0.0394, 0.0919, 0.0499, -0.331, -0.0099, 0.6931, 0.6594, -0.0666, -0.0079, -0.0741, -0.1695, -0.1866, -0.2795, -0.0557, 0.1218, -0.4953, -0.871, -1.113, -0.2151, -0.2842, -0.7633, -0.0624, -0.0036, -0.0076, -0.1912, -0.0009, -0.0617, -0.0032, -0.0018, 0.0027, -0.0138, 0.0, -0.1718, -0.0297, -0.0045, -0.0158, -0.0708, -0.0704, -0.142, -0.0733, -0.0848, 0.0])

Support Vector Machines

In [201]:
income_model_svm = cl.SVMWithSGD.train(
    final_data_income
    , iterations=100
    , step=0.98
    , miniBatchFraction=1/3.0
)

In [202]:
for t,p in zip(test_data_class.map(lambda row: row.label).collect()
    , income_model_svm.predict(test_data_class.map(lambda row: row.features)).collect()):
    print(t,p)

0.0 1
0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
1.0 0
1.0 1
1.0 1

In [72]:
income_model_svm.weights

DenseVector([-3.7051, 75.158, 4.7048, -3.7744, -0.3049, -0.3383, -0.0043, -0.6313, -0.0741, -0.3845, -0.2785, -0.1044, 0.0738, 0.0609, -0.7295, -0.2406, 0.5397, 0.5315, -1.0844, -0.1711, -0.233, -0.4848, -0.6583, -1.1386, -0.2757, 0.1375, -1.3491, -1.7985, -2.1637, -0.7888, -0.814, -1.9143, -0.2324, -0.0074, -0.0149, -0.7041, -0.0034, -0.23, -0.0063, -0.0046, 0.0032, -0.0296, 0.0, -0.5619, -0.056, -0.0111, -0.0291, -0.252, -0.247, -0.4991, -0.2584, -0.2774, 0.0])

# Building clustering models

In [26]:
import pyspark.mllib.clustering as clu

model = clu.KMeans.train(
    final_data.map(lambda row: row[1:])
    , 2
    , maxIterations=10
    , initializationMode='random'
    , seed=666
    , initializationSteps=5
    , epsilon=1e-4
)

In [27]:
model.clusterCenters

[array([  3.85434850e+01,   5.92231436e+02,   8.77322388e+01,
         4.03915190e+01,   2.05295969e-01,   0.00000000e+00,
         3.56181717e-01,   2.18628480e-01,   1.50756126e+00,
         1.96318129e+00,   1.18974137e+00,   1.46497130e+00,
         9.49663601e-01,   9.96543423e-01,   1.64002222e-01,
         1.55576816e-01,   1.02771434e-02,   5.69933955e-01,
         5.26202086e-02,   3.41738164e-01,   1.56823653e+00,
         1.20140732e+00,   2.09039565e+00,   2.87327943e-02,
         2.42361583e-01,   0.00000000e+00,   8.57971730e-02,
         9.04234307e-01,   1.79309919e-02,   1.89408061e+00,
         1.04348497e+00,   2.07447071e+00,   2.72251713e+00,
         4.57169311e+00,   2.06564410e+00,   3.06153941e-02,
         2.02055429e-01,   1.05641627e+00,   1.11875810e-01,
         1.70643787e+00,   1.31069070e+00,   1.32584408e-01,
         7.11807913e-01,   2.76001481e-01,   1.60277143e+00,
         6.23480032e-01,   0.00000000e+00,   1.11906672e+00,
         0.00000000e+00

In [28]:
import sklearn.metrics as m

predicted = model.predict(final_data.map(lambda row: row[1:]))
predicted = predicted.collect()

true = final_data.map(lambda row: row[0]).collect()

print(m.homogeneity_score(true, predicted))
print(m.completeness_score(true, predicted))

0.0126632823359
0.226522171727

# Computing performance statistics

In [73]:
import pyspark.mllib.evaluation as ev

Regression metrics

In [129]:
true_reg = final_data_hours.map(lambda row: row.label).zipWithIndex().map(lambda row: (row[1], row[0]))
pred_reg = workhours_model_lm.predict(final_data_hours.map(lambda row: row.features)).zipWithIndex().map(lambda row: (row[1], float(row[0])))

true_pred_reg = pred_reg.join(true_reg).map(lambda el: el[1])

metrics_lm = ev.RegressionMetrics(true_pred)

In [130]:
print('R^2: ', metrics_lm.r2)
print('Explained Variance: ', metrics_lm.explainedVariance)
print('meanAbsoluteError: ', metrics_lm.meanAbsoluteError)

R^2:  -3.157194770951297
Explained Variance:  0.5521871925143494
meanAbsoluteError:  0.7600196554159885

Classification metrics

In [187]:
true_pred_class_lr = final_data_income.map(lambda row: (float(income_model_lr.predict(row.features)), row.label))

In [188]:
metrics_lr = ev.BinaryClassificationMetrics(true_pred_class_lr)

In [189]:
print('areaUnderPR: ', metrics_lr.areaUnderPR)
print('areaUnderROC: ', metrics_lr.areaUnderPR)

areaUnderPR:  0.5764236038031435
areaUnderROC:  0.5764236038031435

In [154]:
trainErr = true_pred_class_lr.filter(lambda lp: lp[0] != lp[1]).count() / float(true_pred_class_lr.count())
print("Training Error = " + str(trainErr))

Training Error = 0.7600196554159885

In [204]:
true_pred_class_svm = final_data_income.map(lambda row: (float(income_model_svm.predict(row.features)), row.label))

metrics_svm = ev.BinaryClassificationMetrics(true_pred_class_svm)

print('areaUnderPR: ', metrics_svm.areaUnderPR)
print('areaUnderROC: ', metrics_svm.areaUnderPR)

areaUnderPR:  0.5741069899450867
areaUnderROC:  0.5741069899450867

In [205]:
trainErr = true_pred_class_svm.filter(lambda lp: lp[0] != lp[1]).count() / float(true_pred_class_svm.count())
print("Training Error = " + str(trainErr))

Training Error = 0.22112342987008998

In [203]:
true_pred_class_svm.map(lambda el: ((el), 1)).reduceByKey(lambda x,y: x+y).take(4)

[((0.0, 1.0), 4120), ((1.0, 0.0), 3080), ((0.0, 0.0), 21640), ((1.0, 1.0), 3721)]

In [196]:
true_pred_class_lr.map(lambda el: ((el), 1)).reduceByKey(lambda x,y: x+y).take(4)

[((0.0, 1.0), 773), ((1.0, 0.0), 23974), ((0.0, 0.0), 746), ((1.0, 1.0), 7068)]