In [None]:
from pydataset import data
import pyspark
import pyspark.ml
from pyspark.sql.functions import *

from wrangle import wrangle_311

# Attach to the active Spark session, or start one if none exists yet.
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

# Load the 'tips' dataset with the `data` loader and hand the result to Spark
# so the rest of the notebook works on a Spark DataFrame.
tips_local = data('tips')
df = spark.createDataFrame(tips_local)

In [None]:
# Randomly partition the rows with weights 0.8 / 0.2; the fixed seed makes the
# split repeatable across runs.
split_weights = [0.8, 0.2]
train, test = df.randomSplit(split_weights, seed=123)
# Three-way alternative, kept for reference only (not executed):
# train, validate, test = df.randomSplit([0.6, 0.2, 0.2])

In [None]:
# Show the training split's dimensions as (row count, column count).
print('train shape')
train_rows = train.count()
train_cols = len(train.columns)
train_rows, train_cols

In [None]:
# Show the test split's dimensions as (row count, column count).
print('test shape')
test_rows = test.count()
test_cols = len(test.columns)
test_rows, test_cols

## Regression

- `tip ~ total_bill`: predict tip based on total bill
- `tip ~ total_bill + size`: predict tip based on total bill and size
- `tip ~ .`: predict tip based on all the other features in the dataset

In [None]:
# Note: Spark's RFormula takes care of encoding, so no manual feature
# preparation is done here.
tip_formula = pyspark.ml.feature.RFormula(formula="tip ~ total_bill + size")
# Fit the formula on the training split only.
rf = tip_formula.fit(train)

# Apply the fitted formula and keep just the two columns the model consumes.
train_encoded = rf.transform(train)
train_input = train_encoded.select('features', 'label')
train_input.show(3)

In [None]:
# Linear regression estimator, constructed with its default parameters.
lr = pyspark.ml.regression.LinearRegression()
# Uncomment to list every parameter the estimator exposes:
# print(lr.explainParams())
lr_fit = lr.fit(train_input)

# Preview what the fitted model produces for the first few training rows.
train_predictions = lr_fit.transform(train_input)
train_predictions.show(3)

In [None]:
# Training-set fit quality, displayed as (r2, rootMeanSquaredError) from the
# fitted model's summary.
regression_summary = lr_fit.summary
regression_summary.r2, regression_summary.rootMeanSquaredError

How well does the fitted model perform on the held-out test data?

In [None]:
# Run the test split through the already-fitted formula (no refitting), then
# preview the model's output on it.
test_input = rf.transform(test)
test_predictions = lr_fit.transform(test_input)
test_predictions.show(4)

In [None]:
# Score the test-set predictions. The evaluator's documented defaults are
# written out so the reported metric (RMSE of `prediction` vs `label`) is
# visible in the cell itself.
evaluator = pyspark.ml.evaluation.RegressionEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='rmse',
)
rmse = evaluator.evaluate(lr_fit.transform(test_input))
# Display the value with 4 significant digits.
f'{rmse:.4}'

## Classification

In [None]:
# Classification setup: model `time` from `total_bill` and `size`.
# The formula is fitted on the training split only.
time_formula = pyspark.ml.feature.RFormula(formula='time ~ total_bill + size')
rf = time_formula.fit(train)

# Apply the fitted formula and keep only the feature vector and the label.
train_encoded = rf.transform(train)
train_input = train_encoded.select('features', 'label')
train_input.show(4)

In [None]:
# Logistic regression estimator with default parameters, fitted on the
# encoded training data.
logistic_regression_cls = pyspark.ml.classification.LogisticRegression
lr = logistic_regression_cls()
lr_fit = lr.fit(train_input)

In [None]:
# Training-set AUC: the area under the curve of TPR (recall) plotted against
# FPR, where FPR = FP / (FP + TN).
classification_summary = lr_fit.summary
classification_summary.areaUnderROC

In [None]:
# Evaluate on the test split. The evaluator's documented default metric
# (area under the ROC curve) is written out explicitly.
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator(
    metricName='areaUnderROC',
)
# Encode the test rows with the fitted formula, score them with the fitted
# model, then compute the metric on the scored rows.
test_encoded = rf.transform(test)
test_scored = lr_fit.transform(test_encoded)
test_auc = evaluator.evaluate(test_scored)
test_auc