In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession
spark = SparkSession.builder \
   .master("local") \
   .appName("DDAM Project") \
   .config("spark.executor.memory", "1gb") \
   .getOrCreate()
   
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/27 13:20:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df_boxscore = spark.read.csv("data/df_boxscore_2000-2010/part-00000-bfbd9569-0e5c-4121-89f3-9b2a4be1e464-c000.csv", header=True, inferSchema=True)
df_salaries = spark.read.csv("data/df_salaries_2000-2010/part-00000-be531411-0288-4387-9dea-5d34d5157858-c000.csv", header=True, inferSchema=True)

                                                                                

In [5]:
df_boxscore.show(5)

+-------+-------------------+-------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---------+---------------+---------+----------+
|game_id|           teamName|   playerName| FG|FGA| 3P|3PA| FT|FTA|ORB|DRB|TRB|AST|STL|BLK|TOV| PF|PTS|+/-|isStarter|seasonStartYear|isRegular|MP_seconds|
+-------+-------------------+-------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---------+---------------+---------+----------+
|   4577|Seattle SuperSonics|  Gary Payton| 10| 26|  2|  9|  5|  7|  4| 10| 14| 10|  1|  0|  6|  4| 27| -6|        1|           2000|        1|      2880|
|   4577|Seattle SuperSonics|    Vin Baker|  7| 16|  0|  0|  5|  6|  2|  5|  7|  1|  1|  4|  3|  3| 19|  0|        1|           2000|        1|      2248|
|   4577|Seattle SuperSonics|Patrick Ewing|  1|  6|  0|  0|  0|  0|  2| 10| 12|  2|  0|  2|  3|  4|  2|-15|        1|           2000|        1|      2196|
|   4577|Seattle SuperSonics|Rashard Lewis|  3| 10|  2|  5|  3|  4|  1

In [6]:
df_salaries.show(5)

+----------------+---------------+-----------+------------------+
|      playerName|seasonStartYear|     salary|inflationAdjSalary|
+----------------+---------------+-----------+------------------+
|   Kevin Garnett|           2000|$19,610,000|       $30,904,632|
|Shaquille O'Neal|           2000|$19,285,715|       $30,393,570|
| Alonzo Mourning|           2000|$16,880,000|       $26,602,253|
|    Juwan Howard|           2000|$16,875,000|       $26,594,373|
| Hakeem Olajuwon|           2000|$16,700,000|       $26,318,580|
+----------------+---------------+-----------+------------------+
only showing top 5 rows



In [7]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

col = ["FG", "FGA", "3P", "3PA", "FT", "FTA", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "+/-", "isStarter", "seasonStartYear", "isRegular", "MP_seconds"]

assembler = VectorAssembler(
    inputCols=col,
    outputCol="features")

output_dataset = assembler.transform(df_boxscore)

scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

output_dataset = scaler.fit(output_dataset).transform(output_dataset)

clusteringData = output_dataset.select("features_scaled", "playerName")

clusteringData.show(truncate=False)

23/11/27 13:29:59 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|features_scaled                                                                                                                                                                                                                                                                                                 |playerName         |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|[0.357142857142857

In [12]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Trains a k-means model.
kmeans = KMeans(featuresCol="features_scaled").setK(3)

model = kmeans.fit(clusteringData)

model

                                                                                

KMeansModel: uid=KMeans_b7789a2e2942, k=3, distanceMeasure=euclidean, numFeatures=20

In [13]:
# Make predictions
predictions = model.transform(clusteringData)

predictions.show()

+--------------------+-------------------+----------+
|     features_scaled|         playerName|prediction|
+--------------------+-------------------+----------+
|[0.35714285714285...|        Gary Payton|         0|
|[0.25,0.340425531...|          Vin Baker|         0|
|[0.03571428571428...|      Patrick Ewing|         0|
|[0.10714285714285...|      Rashard Lewis|         0|
|[0.10714285714285...|      Desmond Mason|         0|
|[0.21428571428571...|  Shammond Williams|         2|
|[0.07142857142857...|    Ruben Patterson|         2|
|(20,[1,6,7,8,11,1...|   Rubén Wolkowyski|         2|
|(20,[7,8,9,15,18,...|       Jelani McCoy|         2|
|[0.42857142857142...|Shareef Abdur-Rahim|         0|
|[0.25,0.361702127...|         Mike Bibby|         0|
|[0.32142857142857...|  Michael Dickerson|         0|
|[0.03571428571428...|      Bryant Reeves|         0|
|[0.07142857142857...| Othella Harrington|         0|
|[0.07142857142857...|         Grant Long|         2|
|(20,[0,1,10,12,13...|      

In [14]:
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator(featuresCol="features_scaled")
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

[Stage 81:>                                                         (0 + 1) / 1]

Silhouette with squared euclidean distance = 0.4424780310849441


                                                                                