In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("AppName") \
    .getOrCreate()

In [2]:
# load the dataset
data = spark.read.csv("GROUP.csv.gz", header=True, sep=',',inferSchema="true")
data.dtypes

[('groupId', 'string'),
 ('matchId', 'string'),
 ('matchDuration', 'int'),
 ('winPlaceClass', 'int'),
 ('isFirstPerson', 'boolean'),
 ('matchtype', 'int'),
 ('maxPlace', 'int'),
 ('numGroups', 'int'),
 ('assist_SUM', 'int'),
 ('assist_AVG', 'int'),
 ('heals_SUM', 'int'),
 ('heals_AVG', 'int'),
 ('kills_SUM', 'int'),
 ('kills_AVG', 'int'),
 ('headshotKills_SUM', 'int'),
 ('headshotKills_AVG', 'int'),
 ('killStreaks_MAX', 'int'),
 ('roadKills_SUM', 'int'),
 ('roadKills_AVG', 'int'),
 ('longestKill_MAX', 'double'),
 ('vehicleDestroys_MAX', 'int'),
 ('weaponsAcquired_SUM', 'int'),
 ('weaponsAcquired_AVG', 'int'),
 ('damageDealt_SUM', 'double'),
 ('damageDealt_AVG', 'double'),
 ('distance_SUM', 'double'),
 ('distance_AVG', 'double'),
 ('rideDistance_SUM', 'double'),
 ('rideDistance_AVG', 'double'),
 ('swimDistance_SUM', 'double'),
 ('swimDistance_AVG', 'double'),
 ('walkDistance_SUM', 'double'),
 ('walkDistance_AVG', 'double'),
 ('DBNOs_SUM', 'int'),
 ('DBNOs_AVG', 'int'),
 ('revives_SUM', 

In [82]:
data.columns

['groupId',
 'matchId',
 'matchDuration',
 'winPlaceClass',
 'isFirstPerson',
 'matchtype',
 'maxPlace',
 'numGroups',
 'assist_SUM',
 'assist_AVG',
 'heals_SUM',
 'heals_AVG',
 'kills_SUM',
 'kills_AVG',
 'headshotKills_SUM',
 'headshotKills_AVG',
 'killStreaks_MAX',
 'roadKills_SUM',
 'roadKills_AVG',
 'longestKill_MAX',
 'vehicleDestroys_MAX',
 'weaponsAcquired_SUM',
 'weaponsAcquired_AVG',
 'damageDealt_SUM',
 'damageDealt_AVG',
 'distance_SUM',
 'distance_AVG',
 'rideDistance_SUM',
 'rideDistance_AVG',
 'swimDistance_SUM',
 'swimDistance_AVG',
 'walkDistance_SUM',
 'walkDistance_AVG',
 'DBNOs_SUM',
 'DBNOs_AVG',
 'revives_SUM',
 'revives_AVG',
 'teamKills_SUM',
 'teamKills_AVG',
 'killPlace_MAX',
 'rankPoints_MAX',
 'killPoints_MAX',
 'winPoints_MAX']

In [18]:
inputCols =  ['matchDuration', 
 'isFirstPerson',
 'matchtype',
 'maxPlace',
 'numGroups',
 'assist_SUM',
 'assist_AVG',
 'heals_SUM',
 'heals_AVG',
 'kills_SUM',
 'kills_AVG',
 'headshotKills_SUM',
 'headshotKills_AVG',
 'killStreaks_MAX',
 'roadKills_SUM',
 'roadKills_AVG',
 'longestKill_MAX',
 'vehicleDestroys_MAX',
 'weaponsAcquired_SUM',
 'weaponsAcquired_AVG',
 'damageDealt_SUM',
 'damageDealt_AVG',
 'distance_SUM',
 'distance_AVG',
 'rideDistance_SUM',
 'rideDistance_AVG',
 'swimDistance_SUM',
 'swimDistance_AVG',
 'walkDistance_SUM',
 'walkDistance_AVG',
 'DBNOs_SUM',
 'DBNOs_AVG',
 'revives_SUM',
 'revives_AVG',
 'teamKills_SUM',
 'teamKills_AVG',
 'killPlace_MAX',
 'rankPoints_MAX',
 'killPoints_MAX',
 'winPoints_MAX']

In [25]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# winPlaceClass

assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="features")

output = assembler.transform(data)

In [26]:
seed = 42
withReplacement=False
output = output.sample(withReplacement, 0.1, seed).select("features", "winPlaceClass")

(training,testing) = output.randomSplit([0.7,0.3])
print(F'training: {training.count()}')
print(F'testing: {testing.count()}')

training: 141432
testing: 60000


In [30]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(maxDepth=5, labelCol="winPlaceClass",
                            featuresCol="features", impurity="gini",
                            seed=42)
model = dt.fit(training)

In [94]:
predictions = model.transform(testing)
predictions.show()

+--------------------+-------------+--------------------+--------------------+----------+
|            features|winPlaceClass|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|(40,[0,1,2,3,4,5,...|            2|[0.0,60.0,2672.0,...|[0.0,0.0089512158...|       3.0|
|(40,[0,1,2,3,4,5,...|            2|[0.0,3087.0,23511...|[0.0,0.0969961666...|       2.0|
|(40,[0,1,2,3,4,5,...|            1|[0.0,3087.0,23511...|[0.0,0.0969961666...|       2.0|
|(40,[0,1,2,3,4,5,...|            2|[0.0,3087.0,23511...|[0.0,0.0969961666...|       2.0|
|(40,[0,1,2,3,4,5,...|            3|[0.0,1.0,35.0,141...|[0.0,1.2680699974...|       4.0|
|(40,[0,1,2,3,4,5,...|            2|[0.0,60.0,2672.0,...|[0.0,0.0089512158...|       3.0|
|(40,[0,1,2,3,4,5,...|            2|[0.0,3087.0,23511...|[0.0,0.0969961666...|       2.0|
|(40,[0,1,2,3,4,5,...|            2|[0.0,3087.0,23511...|[0.0,0.0969961666...|       2.0|
|(40,[0,1,

## Evaluation

In [95]:
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels = predictions.rdd.map(lambda x: (x.prediction, float(x.winPlaceClass)))
metrics = MulticlassMetrics(predictionAndLabels)

In [115]:
labels = [1,2,3,4,5,6]
# Summary stats
print("Recall = %s" % metrics.weightedRecall)
print("Precision = %s" % metrics.weightedPrecision)
print("F1 measure = %s" % metrics.weightedFMeasure())
print("Accuracy = %s" % metrics.accuracy)

# Individual label stats
labels = [1.0, 2.0 ,3.0, 4.0, 5.0, 6.0]
for label in labels:
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label)))
    
print('Confusion Matrix')
print(metrics.confusionMatrix().toArray())

Recall = 0.7353666666666667
Precision = 0.7200817833760604
F1 measure = 0.7250574298966894
Accuracy = 0.7353666666666666
Class 1.0 precision = 0.0
Class 1.0 recall = 0.0
Class 1.0 F1 Measure = 0.0
Class 2.0 precision = 0.7100576874843241
Class 2.0 recall = 0.8111747851002865
Class 2.0 F1 Measure = 0.7572555837902903
Class 3.0 precision = 0.6663277278060227
Class 3.0 recall = 0.6009876543209877
Class 3.0 F1 Measure = 0.6319732937685459
Class 4.0 precision = 0.7028687302439315
Class 4.0 recall = 0.8039978168917997
Class 4.0 F1 Measure = 0.7500397772474144
Class 5.0 precision = 0.8711461867532084
Class 5.0 recall = 0.8195253955037469
Class 5.0 F1 Measure = 0.844547729710404
Class 6.0 precision = 0.7250530785562632
Class 6.0 recall = 0.5003663003663004
Class 6.0 F1 Measure = 0.5921109666233203
Confusion Matrix
[[0.0000e+00 1.3630e+03 5.9000e+01 3.0000e+00 5.0000e+00 0.0000e+00]
 [0.0000e+00 1.1324e+04 2.4710e+03 1.6300e+02 2.0000e+00 0.0000e+00]
 [0.0000e+00 3.1430e+03 8.5190e+03 2.4750e+0

In [57]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="winPlaceClass",
                                              predictionCol="prediction")
evaluator.setMetricName('f1')
f1 = evaluator.evaluate(predictions)

#print("Test Error = %g " % (1.0 - accuracy))
print("F1 = %g " % f1)

F1 = 0.725057 


In [38]:
evaluator.setMetricName('accuracy')
accuracy = evaluator.evaluate(predictions)
print("accuracy = %g " % accuracy)

accuracy = 0.735367 


In [62]:
metrics = ['f1', 'accuracy',  'weightedPrecision',  'weightedRecall',  'weightedTruePositiveRate', 
          'weightedFalsePositiveRate',  'weightedFMeasure', 'truePositiveRateByLabel',
          'falsePositiveRateByLabel', 'precisionByLabel',  'recallByLabel',  'fMeasureByLabel',  'logLoss',  'hammingLoss']

byLabel = ['truePositiveRateByLabel','falsePositiveRateByLabel', 'precisionByLabel',  'recallByLabel',  'fMeasureByLabel']
labels = [1,2,3,4,5,6]

for metric in metrics:
    print(metric)
    evaluator.setMetricName(metric)
    
    if metric in byLabel:
        for c in  labels:
            evaluator.setMetricLabel(c)
            print(F'{c} = {evaluator.evaluate(predictions)}')
    else:
        print(evaluator.evaluate(predictions))
    print('#########################################################')

f1
0.7250574298966894
#########################################################
accuracy
0.7353666666666666
#########################################################
weightedPrecision
0.7200817833760604
#########################################################
weightedRecall
0.7353666666666667
#########################################################
weightedTruePositiveRate
0.7353666666666667
#########################################################
weightedFalsePositiveRate
0.08150905942758307
#########################################################
weightedFMeasure
0.7250574298966894
#########################################################
truePositiveRateByLabel
1 = 0.0
2 = 0.8111747851002865
3 = 0.6009876543209877
4 = 0.8039978168917997
5 = 0.8195253955037469
6 = 0.5003663003663004
#########################################################
falsePositiveRateByLabel
1 = 0.0
2 = 0.10043440486533449
3 = 0.09309328968903437
4 = 0.10987605310749415
5 = 0.03832148811090638
6 = 0.0044171

KeyboardInterrupt: 

In [None]:
# (f1 | accuracy | weightedPrecision | weightedRecall | weightedTruePositiveRate | 
#  weightedFalsePositiveRate | weightedFMeasure | truePositiveRateByLabel | 
# falsePositiveRateByLabel | precisionByLabel | recallByLabel | fMeasureByLabel | 
# logLoss | hammingLoss

## Sklearn metrics

In [48]:
y_true = predictions.select(['winPlaceClass']).collect()
y_pred = predictions.select(['prediction']).collect()

In [49]:
from sklearn.metrics import classification_report, confusion_matrix

In [50]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00      1430
           2       0.71      0.81      0.76     13960
           3       0.67      0.60      0.63     14175
           4       0.70      0.80      0.75     14658
           5       0.87      0.82      0.84     14412
           6       0.73      0.50      0.59      1365

    accuracy                           0.74     60000
   macro avg       0.61      0.59      0.60     60000
weighted avg       0.72      0.74      0.73     60000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
print(confusion_matrix(y_true, y_pred))

[[    0  1363    59     3     5     0]
 [    0 11324  2471   163     2     0]
 [    0  3143  8519  2475    38     0]
 [    0   115  1725 11785  1033     0]
 [    0     1    11  2330 11811   259]
 [    0     2     0    11   669   683]]
