In [1]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.stat import Correlation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .master("local") \
    .appName("NaiveBayes") \
    .getOrCreate()

In [2]:
# load the dataset
data = spark.read.csv("GROUP2.csv.gz", header=True, sep=',',inferSchema="true")
#renaming winPlaceClass with label
data = data.withColumn("winPlaceClass", col("winPlaceClass") -1)
data = data.withColumnRenamed('winPlaceClass', 'label')
data.dtypes

[('groupId', 'string'),
 ('matchId', 'string'),
 ('matchDuration', 'int'),
 ('label', 'int'),
 ('isFirstPerson', 'boolean'),
 ('matchtype', 'int'),
 ('maxPlace', 'int'),
 ('numGroups', 'int'),
 ('hasDisconnected', 'int'),
 ('assist_SUM', 'int'),
 ('assist_MAX', 'int'),
 ('assist_AVG', 'int'),
 ('heals_SUM', 'int'),
 ('heals_MAX', 'int'),
 ('heals_AVG', 'int'),
 ('kills_SUM', 'int'),
 ('kills_MAX', 'int'),
 ('kills_AVG', 'int'),
 ('headshotKills_SUM', 'int'),
 ('headshotKills_MAX', 'int'),
 ('headshotKills_AVG', 'int'),
 ('killStreaks_MAX', 'int'),
 ('roadKills_SUM', 'int'),
 ('roadKills_MAX', 'int'),
 ('roadKills_AVG', 'int'),
 ('longestKill_MAX', 'double'),
 ('vehicleDestroys_MAX', 'int'),
 ('weaponsAcquired_SUM', 'int'),
 ('weaponsAcquired_MAX', 'int'),
 ('weaponsAcquired_AVG', 'int'),
 ('damageDealt_SUM', 'double'),
 ('damageDealt_MAX', 'double'),
 ('damageDealt_AVG', 'double'),
 ('distance_SUM', 'double'),
 ('distance_MAX', 'double'),
 ('distance_AVG', 'double'),
 ('rideDistance_SU

In [3]:
inputCols =  data.columns

toRemove = ['matchId', 'groupId', 'killPlace_MAX', 'rankPoints_MAX', 'killPoints_MAX', 'winPoints_MAX', 'label']
inputCols = list(set(inputCols) - set(toRemove))

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="features")

output = assembler.transform(data)

## Feature Selection (correlation) 

In [5]:
pearsonCorr = Correlation.corr(output, 'features', 'pearson').collect()[0][0]
pearsonCorr

DenseMatrix(51, 51, [1.0, 0.0025, -0.0104, -0.0151, 0.0537, 0.0228, 0.0862, 0.0041, ..., 0.5481, 0.1078, 0.2851, 0.0371, 0.0846, 0.1527, 0.4719, 1.0], False)

In [6]:
corr_matrix = pd.DataFrame(pearsonCorr.toArray(), inputCols, inputCols)
corr_matrix

Unnamed: 0,hasDisconnected,vehicleDestroys_MAX,swimDistance_SUM,teamKills_AVG,killStreaks_MAX,headshotKills_SUM,DBNOs_MAX,roadKills_MAX,DBNOs_AVG,damageDealt_AVG,...,walkDistance_SUM,damageDealt_MAX,walkDistance_AVG,revives_AVG,longestKill_MAX,roadKills_SUM,swimDistance_MAX,assist_AVG,distance_MAX,weaponsAcquired_AVG
hasDisconnected,1.0,0.002518,-0.010435,-0.015142,0.053696,0.022824,0.086226,0.004055,0.019487,-0.028483,...,0.014047,-0.006484,-0.097397,-0.019546,-0.035805,0.00424,-0.010753,-0.045099,-0.10154,0.035542
vehicleDestroys_MAX,0.002518,1.0,0.024193,0.074378,0.093838,0.092363,0.119398,0.039587,0.100984,0.099075,...,0.151613,0.124135,0.112089,0.052277,0.137596,0.040525,0.020156,0.100961,0.18092,0.083509
swimDistance_SUM,-0.010435,0.024193,1.0,-0.002773,0.058686,0.053817,0.061226,0.009951,0.042262,0.041128,...,0.198349,0.061319,0.186768,0.023243,0.089566,0.009774,0.913777,0.033421,0.157252,0.078904
teamKills_AVG,-0.015142,0.074378,-0.002773,1.0,-0.02713,-0.02126,-0.010323,0.010579,0.023327,-0.012585,...,-0.02408,-0.029314,0.008225,0.022643,-0.020032,0.01023,0.000117,-0.011194,0.036415,0.027019
killStreaks_MAX,0.053696,0.093838,0.058686,-0.02713,1.0,0.529053,0.644243,0.070194,0.555745,0.614954,...,0.409322,0.701051,0.336271,0.181865,0.497506,0.069784,0.048411,0.266539,0.276195,0.234547
headshotKills_SUM,0.022824,0.092363,0.053817,-0.02126,0.529053,1.0,0.56086,0.026847,0.529985,0.597628,...,0.389743,0.659826,0.32487,0.178785,0.504682,0.027263,0.045777,0.320007,0.262392,0.200763
DBNOs_MAX,0.086226,0.119398,0.061226,-0.010323,0.644243,0.56086,1.0,0.044246,0.833345,0.530731,...,0.499249,0.748322,0.294908,0.282188,0.477852,0.045374,0.043349,0.343222,0.279209,0.158517
roadKills_MAX,0.004055,0.039587,0.009951,0.010579,0.070194,0.026847,0.044246,1.0,0.041025,0.062318,...,0.03866,0.063312,0.035849,0.011997,0.04528,0.991507,0.010051,0.028044,0.114871,0.037335
DBNOs_AVG,0.019487,0.100984,0.042262,0.023327,0.555745,0.529985,0.833345,0.041025,1.0,0.619111,...,0.368877,0.694145,0.300072,0.310427,0.440763,0.042097,0.034219,0.385038,0.256316,0.189896
damageDealt_AVG,-0.028483,0.099075,0.041128,-0.012585,0.614954,0.597628,0.530731,0.062318,0.619111,1.0,...,0.3386,0.902205,0.439518,0.218835,0.554377,0.061928,0.042589,0.408732,0.337736,0.372559


In [7]:
# Seleziono il triangolo superiore della correlation matrix
upper = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Cerco le features con correlazione > 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

In [8]:
to_drop

['rideDistance_MAX',
 'assist_SUM',
 'rideDistance_AVG',
 'revives_SUM',
 'maxPlace',
 'kills_MAX',
 'kills_AVG',
 'damageDealt_SUM',
 'headshotKills_MAX',
 'teamKills_MAX',
 'matchtype',
 'DBNOs_SUM',
 'heals_MAX',
 'heals_AVG',
 'damageDealt_MAX',
 'walkDistance_AVG',
 'roadKills_SUM',
 'swimDistance_MAX',
 'distance_MAX']

In [9]:
naiveBayesFeat = [col for col in inputCols if col not in to_drop]
naiveBayesFeat

['hasDisconnected',
 'vehicleDestroys_MAX',
 'swimDistance_SUM',
 'teamKills_AVG',
 'killStreaks_MAX',
 'headshotKills_SUM',
 'DBNOs_MAX',
 'roadKills_MAX',
 'DBNOs_AVG',
 'damageDealt_AVG',
 'assist_MAX',
 'matchDuration',
 'roadKills_AVG',
 'isFirstPerson',
 'rideDistance_SUM',
 'revives_MAX',
 'kills_SUM',
 'swimDistance_AVG',
 'teamKills_SUM',
 'heals_SUM',
 'distance_AVG',
 'numGroups',
 'weaponsAcquired_MAX',
 'distance_SUM',
 'weaponsAcquired_SUM',
 'headshotKills_AVG',
 'walkDistance_MAX',
 'walkDistance_SUM',
 'revives_AVG',
 'longestKill_MAX',
 'assist_AVG',
 'weaponsAcquired_AVG']

In [10]:
assembler = VectorAssembler(
    inputCols=naiveBayesFeat,
    outputCol="features")

output = assembler.transform(data)

In [11]:
seed = 42
withReplacement=False
output = output.sample(withReplacement, 0.5, seed).select("features", "label")

(training,testing) = output.randomSplit([0.7,0.3])
print(F'training: {training.count()}')
print(F'testing: {testing.count()}')

training: 706291
testing: 302236


In [12]:
training.groupBy('label').count().toPandas()

Unnamed: 0,label,count
0,1,165330
1,3,174326
2,5,16619
3,4,168000
4,2,165522
5,0,16494


## Grid Search

paramGrid = ParamGridBuilder() .addGrid(nb.smoothing, [0.0, 1.0, 5.0])  \
    .addGrid(nb.modelType, ['multinomial', 'gaussian']) \
    .build()

In [13]:
nb = NaiveBayes()

paramGrid = ParamGridBuilder() .addGrid(nb.smoothing, [0.0, 0.001, 0.01, 1.0])  \
    .addGrid(nb.modelType, ['multinomial', 'gaussian']) \
    .build()

crossval = CrossValidator(estimator=nb,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=3) 

cvModel = crossval.fit(training)

In [14]:
# Best parameters
print(cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)])

{Param(parent='NaiveBayes_81fdf52c1ab4', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 0.0, Param(parent='NaiveBayes_81fdf52c1ab4', name='modelType', doc='The model type which is a string (case-sensitive). Supported options: multinomial (default), bernoulli and gaussian.'): 'gaussian'}


Individuati i parametri, si fa training sull'intero dataset

In [17]:
assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="features")

output = assembler.transform(data)

seed = 42
withReplacement=False
output = output.sample(withReplacement, 1.0, seed).select("features", "label")

(training,testing) = output.randomSplit([0.7,0.3])
print(F'training: {training.count()}')
print(F'testing: {testing.count()}')

training: 1409918
testing: 604172


In [18]:
nb = NaiveBayes(modelType='gaussian', smoothing=0.0)
model = nb.fit(training)

In [19]:
predictions = model.transform(testing)

## Evaluation

In [20]:
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels = predictions.rdd.map(lambda x: (x.prediction, float(x.label)))
metrics = MulticlassMetrics(predictionAndLabels)

In [21]:
# Summary stats
print("Recall = %s" % metrics.weightedRecall)
print("Precision = %s" % metrics.weightedPrecision)
print("F1 measure = %s" % metrics.weightedFMeasure())
print("Accuracy = %s" % metrics.accuracy)

# Individual label stats
labels = [0.0, 1.0 ,2.0, 3.0, 4.0, 5.0]
for label in labels:
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label)))
    
print('Confusion Matrix')
print(metrics.confusionMatrix().toArray())

Recall = 0.33921797104135903
Precision = 0.4166619053998646
F1 measure = 0.35815636844335635
Accuracy = 0.3392179710413591
Class 0.0 precision = 0.24013948696272422
Class 0.0 recall = 0.6462851048702453
Class 0.0 F1 Measure = 0.35016757194036746
Class 1.0 precision = 0.6264447638256501
Class 1.0 recall = 0.3687990159481809
Class 1.0 F1 Measure = 0.46427260765657313
Class 2.0 precision = 0.4037536004666934
Class 2.0 recall = 0.31250661417656145
Class 2.0 F1 Measure = 0.3523179650744286
Class 3.0 precision = 0.343872692675172
Class 3.0 recall = 0.28659087411253786
Class 3.0 F1 Measure = 0.31262956582673945
Class 4.0 precision = 0.34812008022259394
Class 4.0 recall = 0.3049313965114987
Class 4.0 F1 Measure = 0.325097625525142
Class 5.0 precision = 0.105170817291291
Class 5.0 recall = 0.9077397644419518
Class 5.0 F1 Measure = 0.18850179795891628
Confusion Matrix
[[9.0900e+03 4.0450e+03 7.6000e+02 9.1000e+01 1.0000e+00 7.8000e+01]
 [2.1863e+04 5.1869e+04 4.7992e+04 1.8061e+04 7.9900e+02 5.9

## Sklearn Evalutation

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

In [23]:
y_true = predictions.select(['label']).collect()
y_pred = predictions.select(['prediction']).collect()

In [24]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.24      0.65      0.35     14065
           1       0.63      0.37      0.46    140643
           2       0.40      0.31      0.35    141741
           3       0.34      0.29      0.31    149443
           4       0.35      0.30      0.33    144016
           5       0.11      0.91      0.19     14264

    accuracy                           0.34    604172
   macro avg       0.34      0.47      0.33    604172
weighted avg       0.42      0.34      0.36    604172



In [25]:
print(confusion_matrix(y_true, y_pred))

[[ 9090  4045   760    91     1    78]
 [21863 51869 47992 18061   799    59]
 [ 5715 21736 44295 52644 16316  1035]
 [ 1059  4673 14818 42829 64053 22011]
 [  102   471  1818 10727 43915 86983]
 [   24     5    25   197  1065 12948]]


In [25]:
spark.stop()