In [16]:
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pyspark.sql
import sys
from pyspark.sql.functions import lit
# spark.version

In [2]:
spark.version

'3.1.2'

In [3]:
spark = SparkSession.builder.appName("mvp-prediction").getOrCreate()
print('spark session created')

spark session created


1. Data loading

In [55]:
data_paths = ['gs://6893-data/triLabel/player_stats_2018_2019.csv',
              'gs://6893-data/triLabel/player_stats_2019_2020.csv', 
              'gs://6893-data/triLabel/player_stats_2020_2021.csv',
              'gs://6893-data/triLabel/player_stats_2021_2022.csv'
]
data_1 = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(data_paths[0])
data_2 = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(data_paths[1])
data_3 = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(data_paths[2])

data = data_1.union(data_2)
data = data.union(data_3)
ongoing_data = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(data_paths[3])
# print(data)

In [56]:
print('Overall shape of data', data.count(), len(data.columns))
print('2018-2019 dimension of data', data_1.count(), len(data_1.columns))
print('2019-2020 dimension of data', data_2.count(), len(data_2.columns))
print('2020-2021 dimension of data', data_3.count(), len(data_3.columns))

Overall shape of data 1599 11
2018-2019 dimension of data 530 11
2019-2020 dimension of data 529 11
2020-2021 dimension of data 540 11


In [68]:
mip_datapaths = [
             'gs://6893-data/mipData/player_improvement_2019_2020.csv',
             'gs://6893-data/mipData/player_improvement_2020_2021.csv',
             'gs://6893-data/mipData/player_improvement_2021_2022.csv'
]
mip_data_1 = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(mip_datapaths[0])
mip_data_2 = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(mip_datapaths[1])

mip_data = mip_data_1.union(mip_data_2)
ongoing_mip_data = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(mip_datapaths[2])

In [69]:
print('Overall shape of data', mip_data.count(), len(mip_data.columns))
print('2019~2020 dimension of data', mip_data_1.count(), len(mip_data_1.columns))
print('2020~2021 dimension of data', mip_data_2.count(), len(mip_data_2.columns))
print('2021~2022 dimension of data', ongoing_mip_data.count(), len(ongoing_mip_data.columns))
print(mip_data)

Overall shape of data 811 7
2019~2020 dimension of data 400 7
2020~2021 dimension of data 411 7
2021~2022 dimension of data 369 7
DataFrame[reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, label: int]


2. Data preprocessing

In [70]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [71]:
#stages in our Pipeline
stages = []

In [72]:
# Transform all features into a vector using VectorAssembler
numericCols = ["reb", "ast", "stl", "blk", "tov", "pts"]
assemblerInputs = numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [73]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(data)
preppedDataDF = pipelineModel.transform(data)
preppedOngoingDF = pipelineModel.transform(ongoing_data)

In [74]:
pipelineModelMip = pipeline.fit(mip_data)
preppedDataMip = pipelineModelMip.transform(mip_data)
preppedOngoingMip = pipelineModelMip.transform(ongoing_mip_data)
preppedDataMip.take(3)
# preppedOngoingMip.take(3)

[Row(reb=-0.021419168986074526, ast=0.49997688444709554, stl=-0.4428101732219239, blk=0.09920308478640266, tov=-0.15635161427385222, pts=-0.2329237762816053, label=0, features=DenseVector([-0.0214, 0.5, -0.4428, 0.0992, -0.1564, -0.2329])),
 Row(reb=0.4085906755386262, ast=1.285148252475082, stl=0.32403712897915865, blk=0.5999857562190266, tov=0.8320552891480542, pts=0.7518776711044055, label=1, features=DenseVector([0.4086, 1.2851, 0.324, 0.6, 0.8321, 0.7519])),
 Row(reb=-0.19106129091453897, ast=0.01954522067931028, stl=0.2879691836603197, blk=0.22014566525875912, tov=-0.25062815521712517, pts=-0.13319366185627537, label=0, features=DenseVector([-0.1911, 0.0195, 0.288, 0.2201, -0.2506, -0.1332]))]

In [75]:
preppedDataDF.take(3)
preppedOngoingDF.take(3)

[Row(id='84b7ddae-180a-464f-9962-2c3b3222810c', name='Precious Achiuwa', reb=8.125, ast=1.625, stl=0.5625, blk=0.625, tov=1.0, pts=8.25, src='https://cdn.nba.com/headshots/nba/latest/260x190/1630173.png', teamSrc='https://cdn.nba.com/logos/nba/1610612761/primary/L/logo.svg', team='Toronto Raptors', newsUrl=None, newsTitle=None, newsDate=None, newsIntro=None, features=DenseVector([8.125, 1.625, 0.5625, 0.625, 1.0, 8.25])),
 Row(id='090e9e99-10df-4aaf-b1ff-2645a8420aa5', name='Steven Adams', reb=8.111111111, ast=2.777777778, stl=0.833333333, blk=0.722222222, tov=1.5, pts=7.0, src='https://cdn.nba.com/headshots/nba/latest/260x190/203500.png', teamSrc='https://cdn.nba.com/logos/nba/1610612763/primary/L/logo.svg', team='Memphis Grizzlies', newsUrl=None, newsTitle=None, newsDate=None, newsIntro=None, features=DenseVector([8.1111, 2.7778, 0.8333, 0.7222, 1.5, 7.0])),
 Row(id='100b9996-96db-4e10-b29b-e449d2413d95', name='Bam Adebayo', reb=10.1875, ast=2.8125, stl=1.1875, blk=0.375, tov=2.9375,

In [76]:
# Keep relevant columns

cols = data.columns
selectedcols = ["features"] + cols
dataset = preppedDataDF.select(selectedcols)

ongoing_dataset = preppedOngoingDF.select(['features', 'id', 'name', 'reb', 'ast', 'stl', 'blk', 'tov', 'pts'])

display(dataset)
display(ongoing_dataset)

DataFrame[features: vector, id: string, name: string, mvpLabel: int, mipLabel: int, dpoyLabel: int, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double]

DataFrame[features: vector, id: string, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double]

In [77]:
print(selectedcols)
print(ongoing_dataset.count())

['features', 'id', 'name', 'mvpLabel', 'mipLabel', 'dpoyLabel', 'reb', 'ast', 'stl', 'blk', 'tov', 'pts']
466


In [128]:
dataset_mip = preppedDataMip.select(['features','label', 'reb', 'ast', 'stl', 'blk', 'tov', 'pts'])
# dataset_mip.show(300)
dataset_ongoing_mip = preppedOngoingMip.select(['features', 'name', 'reb', 'ast', 'stl', 'blk', 'tov', 'pts'])
display(dataset_mip)
display(dataset_ongoing_mip)

DataFrame[features: vector, label: int, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double]

DataFrame[features: vector, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double]

In [129]:
### Randomly split data into training and test sets. set seed for reproducibility
#=====your code here==========
trainingData, testData = dataset.randomSplit([.80, .20], seed=100)
#===============================
print("trainingData length = ", trainingData.count())
print("testData length = ", testData.count())

trainingData length =  1258
testData length =  341


In [132]:
trainingData_mip, testData_mip = dataset_mip.randomSplit([.80, .20], seed=100)
print("trainingData_mip length = ", trainingData_mip.count())
print("testData_mip length = ", testData_mip.count())
trainingData_mip.show(300)
testData_mip.show(300)

trainingData_mip length =  653
testData_mip length =  158
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            features|label|                 reb|                 ast|                 stl|                 blk|                 tov|                 pts|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|[-0.9999322943711...|    0|  -0.999932294371154| -0.9999398894154533|                 0.0|                 0.0|                 0.0| -0.9999189819893424|
|[-0.9999322943711...|    0|  -0.999932294371154| -0.7133078705373324| -0.9998634933910959|                 0.0| -0.3638854186585195| -0.7974937740183787|
|[-0.9999235331362...|    0| -0.9999235331362399| -0.1588437103362509| -0.9999669040661644| -0.9999310768536014| -0.9999802735881309|-0.38359667759217086|
|[-0.8366423

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            features|label|                 reb|                 ast|                 stl|                 blk|                 tov|                 pts|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|[-0.8674849064789...|    0| -0.8674849064789091| -0.8279842615374541|  0.2942101873341761| -0.9999612295610717| -0.7273636148150675| -0.8322230918304456|
|[-0.6172602697575...|    0| -0.6172602697575337| -0.9998797860570372|  1.6842040688571973| -0.9999483067494856| -0.3638780164933022| -0.6737933684179215|
|[-0.5603008975309...|    0| -0.5603008975309867| -0.4290290394305968|-0.08479450998365275|-0.22531100282436256| -0.4393136159134671| -0.6941089972270039|
|[-0.5431259423620...|    0| -0.5431259423620787| -0.6557908711325494|

3. Modeling

In [131]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [123]:
import datetime
from pytz import timezone
tz = timezone('EST')
now = datetime.datetime.now(tz)

# Fit model to prepped data

#LogisticRegression model, maxIter=10
#=====your code here============
lrModel_mvp = LogisticRegression(featuresCol="features", labelCol="mvpLabel", maxIter=10).fit(trainingData)
lrModel_dpoy = LogisticRegression(featuresCol="features", labelCol="dpoyLabel", maxIter=10).fit(trainingData)
lrModel_mip = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10).fit(trainingData_mip)
#===============================

mvp_predictions = lrModel_mvp.transform(testData)
dpoy_predictions = lrModel_dpoy.transform(testData)
mip_predictions = lrModel_mip.transform(testData_mip)

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="mvpLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mvp_predictions)
print("Test set accuracy for mvp prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="dpoyLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dpoy_predictions)
print("Test set accuracy for dpoy prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mip_predictions)
print("Test set accuracy for mip prediction = " + str(accuracy))

mvp_predictions = lrModel_mvp.transform(ongoing_dataset)
mvp_selected = mvp_predictions.select(['id', 'name', 'probability'])
mvp_selected = mvp_selected.withColumnRenamed('probability', 'mvp_prediction').withColumn('date', lit(str(now)[:10]))

# mvp_selected.show(mvp_selected.count(), False)

dpoy_predictions = lrModel_dpoy.transform(ongoing_dataset)
dpoy_selected = dpoy_predictions.select(['id', 'name', 'probability'])
dpoy_selected = dpoy_selected.withColumnRenamed('probability', 'dpoy_prediction').withColumn('date', lit(str(now)[:10]))

mip_predictions = lrModel_mip.transform(dataset_ongoing_mip)
print(mip_predictions)
print('mip dataset ongoing')
dataset_ongoing_mip.show(5)
print('training dataset mip')
trainingData_mip.show(20)
mip_selected = mip_predictions.select(['name', 'probability', 'prediction'])
# mip_selected.show(mip_selected.count())
mip_selected = mip_selected.withColumnRenamed('probability', 'mip_prediction').withColumn('date', lit(str(now)[:10]))

Test set accuracy for mvp prediction = 0.9853372434017595
Test set accuracy for dpoy prediction = 0.9765395894428153
Test set accuracy for mip prediction = 0.9746835443037974
DataFrame[features: vector, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, rawPrediction: vector, probability: vector, prediction: double]
mip dataset ongoing
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+
|            features|                name|                 reb|                 ast|                 stl|                 blk|                tov|                 pts|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+
|[1.50000283434582...|    Precious Achiuwa|  1.5000028343458263|  2.9410628247377515|  0.7057545851065954|  0.4310

In [135]:
print(now)
print(mvp_selected)
print(mvp_selected.count(), 'lines in mvp_prediction')
print(dpoy_selected)
print(dpoy_selected.count(), 'lines in dpoy_prediction')
print(mip_selected)
print(mip_selected.count(), 'lines in mip_prediction')

mvp_pd = mvp_selected.toPandas()
for idx, row in mvp_pd.iterrows():
    row.mvp_prediction = row.mvp_prediction[1]
    # print(row)
print(mvp_pd.head(3))
mvp_pd.to_csv('mvp_prediction_results.csv')

dpoy_pd = dpoy_selected.toPandas()
for idx, row in dpoy_pd.iterrows():
    row.dpoy_prediction = row.dpoy_prediction[1]
    # print(row)
print(dpoy_pd.head(3))
dpoy_pd.to_csv('dpoy_prediction_results.csv')

mip_pd = mip_selected.toPandas()
# print(mip_pd)
for idx, row in mip_pd.iterrows():
    row.mip_prediction = row.mip_prediction[1]
    # print(row)
print(mip_pd.head(3))
mip_pd.to_csv('mip_prediction_results.csv')

2021-12-16 22:50:17.449223-05:00
DataFrame[name: string, probability: vector, prediction: double, blk: double]
466 lines in mvp_prediction
DataFrame[name: string, probability: vector, prediction: double, blk: double]
466 lines in dpoy_prediction
DataFrame[name: string, probability: vector, prediction: double]
369 lines in mip_prediction


AttributeError: 'Series' object has no attribute 'mvp_prediction'

In [106]:
from pytz import timezone
tz = timezone('EST')
datetime.datetime.now(tz) 

datetime.datetime(2021, 12, 16, 22, 30, 35, 962226, tzinfo=<StaticTzInfo 'EST'>)

In [22]:
import os
import sys
# os.getcwd()

In [134]:
from pyspark.ml.classification import RandomForestClassifier
# Random Forest
#=====your code here==========
rfModel_mvp = RandomForestClassifier(featuresCol="features", labelCol="mvpLabel").fit(trainingData)
rfModel_dpoy = RandomForestClassifier(featuresCol="features", labelCol="dpoyLabel").fit(trainingData)
rfModel_mip = RandomForestClassifier(featuresCol="features", labelCol="label").fit(trainingData_mip)
#===============================


mvp_predictions = rfModel_mvp.transform(testData)
dpoy_predictions = rfModel_dpoy.transform(testData)
mip_predictions = rfModel_mip.transform(testData_mip)
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="mvpLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mvp_predictions)
print("Test set accuracy for mvp prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="dpoyLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dpoy_predictions)
print("Test set accuracy for dpoy prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mip_predictions)
print("Test set accuracy for mip prediction = " + str(accuracy))

mvp_predictions = rfModel_mvp.transform(ongoing_dataset)
mvp_selected = mvp_predictions.select(['name', 'probability', 'prediction', 'blk'])
# mvp_selected.show(mvp_selected.count(), False)

dpoy_predictions = rfModel_dpoy.transform(ongoing_dataset)
dpoy_selected = dpoy_predictions.select(['name', 'probability', 'prediction', 'blk'])
# dpoy_selected.show(dpoy_selected.count(), False)
print(dpoy_predictions)

mip_predictions = rfModel_mip.transform(dataset_ongoing_mip)
mip_selected = mip_predictions.select(['name', 'probability', 'prediction'])
# dpoy_selected.show(dpoy_selected.count(), False)
mip_selected.show(30)

Test set accuracy for mvp prediction = 0.9824046920821115
Test set accuracy for dpoy prediction = 0.9706744868035191
Test set accuracy for mip prediction = 0.9746835443037974
DataFrame[features: vector, id: string, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, rawPrediction: vector, probability: vector, prediction: double]
+--------------------+--------------------+----------+
|                name|         probability|prediction|
+--------------------+--------------------+----------+
|    Precious Achiuwa|[0.93555651664983...|       0.0|
|        Steven Adams|[0.99276932793234...|       0.0|
|         Bam Adebayo|[0.78091346870495...|       0.0|
|   LaMarcus Aldridge|[0.68944932849460...|       0.0|
|Nickeil Alexander...|[0.78589958167306...|       0.0|
|       Grayson Allen|[0.77375226218897...|       0.0|
|       Jarrett Allen|[0.78395586100681...|       0.0|
|       Kyle Anderson|[0.99895557013068...|       0.0|
|Giannis Antetokou...|[0

In [142]:
mip_selected = mip_selected.withColumnRenamed('probability', 'mip_prediction').withColumn('date', lit(str(now)[:10]))
mip_selected = mip_selected.select(['name', 'mip_prediction', 'date'])
mip_pd = mip_selected.toPandas()
# print(mip_pd)
for idx, row in mip_pd.iterrows():
    row.mip_prediction = row.mip_prediction[1]/0.35
    print(row)
print(mip_pd.head(3))
mip_pd.to_csv('mip_prediction_results.csv')

name              Precious Achiuwa
mip_prediction            0.184124
date                    2021-12-16
Name: 0, dtype: object
name              Steven Adams
mip_prediction        0.020659
date                2021-12-16
Name: 1, dtype: object
name              Bam Adebayo
mip_prediction       0.625962
date               2021-12-16
Name: 2, dtype: object
name              LaMarcus Aldridge
mip_prediction             0.887288
date                     2021-12-16
Name: 3, dtype: object
name              Nickeil Alexander-Walker
mip_prediction                    0.611715
date                            2021-12-16
Name: 4, dtype: object
name              Grayson Allen
mip_prediction         0.646422
date                 2021-12-16
Name: 5, dtype: object
name              Jarrett Allen
mip_prediction         0.617269
date                 2021-12-16
Name: 6, dtype: object
name              Kyle Anderson
mip_prediction         0.002984
date                 2021-12-16
Name: 7, dtype: object
nam

name              Josh Richardson
mip_prediction            0.16698
date                   2021-12-16
Name: 296, dtype: object
name              Austin Rivers
mip_prediction         0.003095
date                 2021-12-16
Name: 297, dtype: object
name              Duncan Robinson
mip_prediction           0.020292
date                   2021-12-16
Name: 298, dtype: object
name              Justin Robinson
mip_prediction           0.510672
date                   2021-12-16
Name: 299, dtype: object
name              Mitchell Robinson
mip_prediction             0.163516
date                     2021-12-16
Name: 300, dtype: object
name              Isaiah Roby
mip_prediction       0.003095
date               2021-12-16
Name: 301, dtype: object
name              Rajon Rondo
mip_prediction       0.002984
date               2021-12-16
Name: 302, dtype: object
name              Derrick Rose
mip_prediction         0.00865
date                2021-12-16
Name: 303, dtype: object
name             

In [117]:
#NaiveBayes
#=====your code here==========
from pyspark.ml.classification import NaiveBayes
nbModel_mvp = NaiveBayes(featuresCol="features", labelCol="mvpLabel").fit(trainingData)
nbModel_dpoy = NaiveBayes(featuresCol="features", labelCol="dpoyLabel").fit(trainingData)
#===============================

mvp_predictions = nbModel_mvp.transform(testData)
dpoy_predictions = nbModel_dpoy.transform(testData)

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="mvpLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mvp_predictions)
print("Test set accuracy for mvp prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="dpoyLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dpoy_predictions)
print("Test set accuracy for dpoy prediction = " + str(accuracy))

mvp_predictions = nbModel_mvp.transform(ongoing_dataset)
mvp_selected = mvp_predictions.select(['name', 'probability', 'prediction', 'blk'])
# mvp_selected.show(mvp_selected.count(), False)

dpoy_predictions = nbModel_dpoy.transform(ongoing_dataset)
dpoy_selected = dpoy_predictions.select(['name', 'probability', 'prediction'])
# dpoy_selected.show(dpoy_selected.count(), False)
print(dpoy_predictions)

Test set accuracy for mvp prediction = 0.9824046920821115
Test set accuracy for dpoy prediction = 0.9765395894428153
DataFrame[features: vector, id: string, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, rawPrediction: vector, probability: vector, prediction: double]


In [118]:
#Decision Tree
#=====your code here==========
from pyspark.ml.classification import DecisionTreeClassifier
dtModel_mvp = DecisionTreeClassifier(featuresCol="features", labelCol="mvpLabel").fit(trainingData)
dtModel_dpoy = DecisionTreeClassifier(featuresCol="features", labelCol="dpoyLabel").fit(trainingData)
#===============================

mvp_predictions = dtModel_mvp.transform(testData)
dpoy_predictions = dtModel_dpoy.transform(testData)

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="mvpLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mvp_predictions)
print("Test set accuracy for mvp prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="dpoyLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dpoy_predictions)
print("Test set accuracy for dpoy prediction = " + str(accuracy))

mvp_predictions = dtModel_mvp.transform(ongoing_dataset)
mvp_selected = mvp_predictions.select(['name', 'probability', 'prediction', 'blk'])
# mvp_selected.show(mvp_selected.count(), False)

dpoy_predictions = dtModel_dpoy.transform(ongoing_dataset)
dpoy_selected = dpoy_predictions.select(['name', 'probability', 'prediction', 'blk'])
# dpoy_selected.show(dpoy_selected.count(), False)
print(dpoy_predictions)

Test set accuracy for mvp prediction = 0.9794721407624634
Test set accuracy for dpoy prediction = 0.9765395894428153
DataFrame[features: vector, id: string, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, rawPrediction: vector, probability: vector, prediction: double]


In [119]:
#Gradient Boosting Trees
#=====your code here==========
from pyspark.ml.classification import GBTClassifier
gbtModel_mvp = GBTClassifier(featuresCol="features", labelCol="mvpLabel").fit(trainingData)
gbtModel_dpoy = GBTClassifier(featuresCol="features", labelCol="dpoyLabel").fit(trainingData)
#===============================

mvp_predictions = gbtModel_mvp.transform(testData)
dpoy_predictions = gbtModel_dpoy.transform(testData)

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="mvpLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mvp_predictions)
print("Test set accuracy for mvp prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="dpoyLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dpoy_predictions)
print("Test set accuracy for dpoy prediction = " + str(accuracy))

mvp_predictions = gbtModel_mvp.transform(ongoing_dataset)
mvp_selected = mvp_predictions.select(['name', 'probability', 'prediction', 'blk'])
# mvp_selected.show(mvp_selected.count(), False)

dpoy_predictions = gbtModel_dpoy.transform(ongoing_dataset)
dpoy_selected = dpoy_predictions.select(['name', 'probability', 'prediction', 'blk'])
# dpoy_selected.show(dpoy_selected.count(), False)
print(dpoy_predictions)

                                                                                

Test set accuracy for mvp prediction = 0.9736070381231672
Test set accuracy for dpoy prediction = 0.9765395894428153
DataFrame[features: vector, id: string, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, rawPrediction: vector, probability: vector, prediction: double]


In [120]:
# Multi-layer Perceptron
#=====your code here==========
from pyspark.ml.classification import MultilayerPerceptronClassifier
mlpModel_mvp = MultilayerPerceptronClassifier(layers=[6, 5, 5, 2], seed=123, featuresCol="features", labelCol="mvpLabel").fit(trainingData)
mlpModel_dpoy = MultilayerPerceptronClassifier(layers=[6, 5, 5, 2], seed=123, featuresCol="features", labelCol="dpoyLabel").fit(trainingData)
#===============================

mvp_predictions = mlpModel_mvp.transform(testData)
dpoy_predictions = mlpModel_dpoy.transform(testData)

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="mvpLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mvp_predictions)
print("Test set accuracy for mvp prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="dpoyLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dpoy_predictions)
print("Test set accuracy for dpoy prediction = " + str(accuracy))

mvp_predictions = mlpModel_mvp.transform(ongoing_dataset)
mvp_selected = mvp_predictions.select(['name', 'probability', 'prediction', 'blk'])
# mvp_selected.show(mvp_selected.count(), False)

dpoy_predictions = mlpModel_dpoy.transform(ongoing_dataset)
dpoy_selected = dpoy_predictions.select(['name', 'probability', 'prediction', 'blk'])
# dpoy_selected.show(dpoy_selected.count(), False)
print(dpoy_predictions)

Test set accuracy for mvp prediction = 0.9912023460410557
Test set accuracy for dpoy prediction = 0.9706744868035191
DataFrame[features: vector, id: string, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, rawPrediction: vector, probability: vector, prediction: double]


In [123]:
# Linear Support Vector Machine
#=====your code here==========
from pyspark.ml.classification import LinearSVC
svmModel_mvp = LinearSVC(maxIter=10, regParam=0.1, labelCol="mvpLabel").fit(trainingData)
svmModel_dpoy = LinearSVC(maxIter=10, regParam=0.1, labelCol="dpoyLabel").fit(trainingData)
#===============================

mvp_predictions = svmModel_mvp.transform(testData)
dpoy_predictions = svmModel_dpoy.transform(testData)

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="mvpLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mvp_predictions)
print("Test set accuracy for mvp prediction = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="dpoyLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dpoy_predictions)
print("Test set accuracy for dpoy prediction = " + str(accuracy))

mvp_predictions = svmModel_mvp.transform(ongoing_dataset)
mvp_selected = mvp_predictions.select(['name', 'prediction', 'blk'])
# mvp_selected.show(mvp_selected.count(), False)

dpoy_predictions = svmModel_dpoy.transform(ongoing_dataset)
dpoy_selected = dpoy_predictions.select(['name', 'prediction', 'blk'])
# dpoy_selected.show(dpoy_selected.count(), False)
print(dpoy_predictions)

Test set accuracy for mvp prediction = 0.9824046920821115
Test set accuracy for dpoy prediction = 0.9765395894428153
DataFrame[features: vector, id: string, name: string, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, rawPrediction: vector, prediction: double]


4. Comparison and analysis

In [34]:
# query to see if there is new request
from google.cloud import bigquery

In [58]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- label: long (nullable = true)
 |-- reb: double (nullable = true)
 |-- ast: double (nullable = true)
 |-- stl: double (nullable = true)
 |-- blk: double (nullable = true)
 |-- tov: double (nullable = true)
 |-- pts: double (nullable = true)
 |-- new_request: long (nullable = true)
 |-- percentage: double (nullable = true)



In [59]:
####################### data preprocessing #######################
stages = []
# Transform all features into a vector using VectorAssembler
numericCols = ["reb", "ast", "stl", "blk", "tov", "pts"]
assemblerInputs = numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)
preppedDataDF.take(3)

[Row(id='fa7a753f-e41b-4302-ad17-309a8cc18697', name='Isaiah Thomas', label=0, reb=1.7, ast=3.65, stl=0.325, blk=0.15, tov=1.925, pts=12.2, new_request=1, percentage=0.0, features=DenseVector([1.7, 3.65, 0.325, 0.15, 1.925, 12.2])),
 Row(id='7c477949-5f36-4580-b607-0de0fb9f9c2d', name='Pascal Siakam', label=1, reb=7.316666667, ast=3.45, stl=1.016666667, blk=0.883333333, tov=2.466666667, pts=22.85, new_request=1, percentage=0.0, features=DenseVector([7.3167, 3.45, 1.0167, 0.8833, 2.4667, 22.85])),
 Row(id='a7a0fa13-a4b6-43cf-b367-ae9099ccf92e', name='Elfrid Payton', label=0, reb=4.711111111, ast=7.177777778, stl=1.555555556, blk=0.377777778, tov=2.133333333, pts=9.977777778, new_request=1, percentage=0.0, features=DenseVector([4.7111, 7.1778, 1.5556, 0.3778, 2.1333, 9.9778]))]

In [61]:
models = [lrModel, rfModel, dtModel, nbModel, gbtModel, mlpModel, svmModel, ovrModel]
print(testData)
print(preppedDataDF)

DataFrame[features: vector, id: string, name: string, label: int, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double]
DataFrame[id: string, name: string, label: bigint, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, new_request: bigint, percentage: double, features: vector]


In [62]:
predictions = rfModel.transform(preppedDataDF)
print(predictions)
predictions.show(3)
selected = predictions.select(['name', 'probability', 'prediction', 'pts', 'new_request'])
selected.show(3)
# selected.show(selected.count(), False)

probability = predictions.select(['probability'])
probability.show(3)

DataFrame[id: string, name: string, label: bigint, reb: double, ast: double, stl: double, blk: double, tov: double, pts: double, new_request: bigint, percentage: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]
+--------------------+-------------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+----------+--------------------+--------------------+--------------------+----------+
|                  id|         name|label|        reb|        ast|        stl|        blk|        tov|        pts|new_request|percentage|            features|       rawPrediction|         probability|prediction|
+--------------------+-------------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+----------+--------------------+--------------------+--------------------+----------+
|fa7a753f-e41b-430...|Isaiah Thomas|    0|        1.7|       3.65|      0.325|       0.15|      1.925|       12.