# <font color='blue'>Machine Learning with PySpark - Predicting the Outcome of Soccer Matches</font>

## <font color='blue'>Multiclass Classification</font>

We will use Multiclass Classification with Decision Trees to build a model capable of predicting the outcome of a soccer match with 3 possible outcomes: victory, defeat or draw.

In [1]:
# Import findspark and initialize
import findspark
findspark.init()

In [2]:
# Imports 
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Loading the Data

In [3]:
# Creating the Spark Context
sc = SparkContext(appName = "Soccer-Outcome")

In [4]:
sc.setLogLevel("ERROR")

In [5]:
# Spark Session - used when working with Dataframes in Spark
spSession = SparkSession.builder.master("local").getOrCreate()

In [6]:
# Loading the data and generating an RDD
data_soccer_team = sc.textFile("data/dataset2.csv")

In [7]:
# Caching the RDD. This process optimizes performance.
data_soccer_team.cache()

data/dataset2.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [8]:
data_soccer_team.count()

151

In [9]:
data_soccer_team.take(5)

['average_faults_suffered,average_faults_received,average_cards_received,average_kicks_a_goal,result',
 '4.8,3,1.4,0.3,victory',
 '5.1,3.8,1.6,0.2,victory',
 '4.6,3.2,1.4,0.2,victory',
 '5.3,3.7,1.5,0.2,victory']

In [10]:
# Removendo a primeira linha do arquivo (cabeçalho)
data_soccer_team_2 = data_soccer_team.filter(lambda x: "average_faults_received" not in x)
data_soccer_team_2.count()

150

## Data Cleaning and Transformation

In [11]:
# Spliting the columns
data_soccer_team_3 = data_soccer_team_2.map(lambda l: l.split(","))

In [12]:
# Mapping the columns
data_soccer_team_4 = data_soccer_team_3.map(lambda p: Row(average_faults_suffered = float(p[0]), 
                                                              average_faults_received = float(p[1]), 
                                                              average_cards_received = float(p[2]), 
                                                              average_kicks_a_goal = float(p[3]), 
                                                              result = p[4] ))

In [13]:
# Converte o RDD para DataFrame do Spark
df_team = spSession.createDataFrame(data_soccer_team_4)
df_team.cache()

DataFrame[average_faults_suffered: double, average_faults_received: double, average_cards_received: double, average_kicks_a_goal: double, result: string]

In [14]:
df_team.take(5)

[Row(average_faults_suffered=4.8, average_faults_received=3.0, average_cards_received=1.4, average_kicks_a_goal=0.3, result='victory'),
 Row(average_faults_suffered=5.1, average_faults_received=3.8, average_cards_received=1.6, average_kicks_a_goal=0.2, result='victory'),
 Row(average_faults_suffered=4.6, average_faults_received=3.2, average_cards_received=1.4, average_kicks_a_goal=0.2, result='victory'),
 Row(average_faults_suffered=5.3, average_faults_received=3.7, average_cards_received=1.5, average_kicks_a_goal=0.2, result='victory'),
 Row(average_faults_suffered=5.1, average_faults_received=3.5, average_cards_received=1.4, average_kicks_a_goal=0.2, result='victory')]

In [15]:
# Creating a numeric index for the target label column
stringIndexer = StringIndexer(inputCol = "result", outputCol = "idx_result")

In [16]:
# Training the string indexer
si_model = stringIndexer.fit(df_team)

In [17]:
# Applying the string indexer
df_team_final = si_model.transform(df_team)

In [18]:
df_team_final.select("result", "idx_result").distinct().collect()

[Row(result='victory', idx_result=2.0),
 Row(result='defeat', idx_result=0.0),
 Row(result='draw', idx_result=1.0)]

## Exploratory Data Analysis

In [19]:
# Descriptive statistics
df_team_final.describe().show()

+-------+-----------------------+-----------------------+----------------------+--------------------+-------+------------------+
|summary|average_faults_suffered|average_faults_received|average_cards_received|average_kicks_a_goal| result|        idx_result|
+-------+-----------------------+-----------------------+----------------------+--------------------+-------+------------------+
|  count|                    150|                    150|                   150|                 150|    150|               150|
|   mean|      5.843333333333332|     3.0573333333333337|    3.7580000000000005|  1.1993333333333336|   null|               1.0|
| stddev|     0.8280661279778632|    0.43586628493669793|     1.765298233259467|  0.7622376689603465|   null|0.8192319205190404|
|    min|                    4.3|                    2.0|                   1.0|                 0.1| defeat|               0.0|
|    max|                    7.9|                    4.4|                   6.9|                 

In [20]:
# Correlação entre as variáveis
for i in df_team_final.columns:
    if not(isinstance(df_team_final.select(i).take(1)[0][0], str)) :
        print("Correlation of the variable idx_result with:", i, df_team_final.stat.corr('idx_result', i)) 

Correlation of the variable idx_result with: average_faults_suffered -0.4600391565002375
Correlation of the variable idx_result with: average_faults_received 0.6183715308237435
Correlation of the variable idx_result with: average_cards_received -0.6492418307641739
Correlation of the variable idx_result with: average_kicks_a_goal -0.5803770334306263
Correlation of the variable idx_result with: idx_result 1.0


## Data Pre-Processing

In [21]:
# Creating a LabeledPoint (target, Vector[features])
# Remove columns not relevant to the model or with low correlation
def transformVar(row) :
    obj = (row["result"], row["idx_result"], Vectors.dense([row["average_faults_suffered"], 
                                                                  row["average_faults_suffered"],
                                                                  row["average_faults_suffered"], 
                                                                  row["average_kicks_a_goal"]]))
    return obj

In [22]:
# Apply the function
df_team_final_RDD = df_team_final.rdd.map(transformVar)

In [23]:
df_team_final_RDD.take(5)

[('victory', 2.0, DenseVector([4.8, 4.8, 4.8, 0.3])),
 ('victory', 2.0, DenseVector([5.1, 5.1, 5.1, 0.2])),
 ('victory', 2.0, DenseVector([4.6, 4.6, 4.6, 0.2])),
 ('victory', 2.0, DenseVector([5.3, 5.3, 5.3, 0.2])),
 ('victory', 2.0, DenseVector([5.1, 5.1, 5.1, 0.2]))]

In [24]:
# Convert RDD to DataFrame
df_spark = spSession.createDataFrame(df_team_final_RDD, ["result", "label", "features"])

In [25]:
df_spark.cache()

DataFrame[result: string, label: double, features: vector]

In [26]:
df_spark.select("result", "label", "features").show(10)

+-------+-----+-----------------+
| result|label|         features|
+-------+-----+-----------------+
|victory|  2.0|[4.8,4.8,4.8,0.3]|
|victory|  2.0|[5.1,5.1,5.1,0.2]|
|victory|  2.0|[4.6,4.6,4.6,0.2]|
|victory|  2.0|[5.3,5.3,5.3,0.2]|
|victory|  2.0|[5.1,5.1,5.1,0.2]|
|victory|  2.0|[4.9,4.9,4.9,0.2]|
|victory|  2.0|[4.7,4.7,4.7,0.2]|
|victory|  2.0|[4.6,4.6,4.6,0.2]|
|victory|  2.0|[5.0,5.0,5.0,0.2]|
|victory|  2.0|[5.4,5.4,5.4,0.4]|
+-------+-----+-----------------+
only showing top 10 rows



In [27]:
# Training and Test Data
(training_data, test_data) = df_spark.randomSplit([0.7, 0.3])

In [28]:
training_data.count()

95

In [29]:
test_data.count()

55

## Machine Learning

In [30]:
# Create the object
dtClassifer = DecisionTreeClassifier(maxDepth = 2, labelCol = "label", featuresCol = "features")

In [31]:
# Train the object with data to create the model
model = dtClassifer.fit(training_data)

In [32]:
# Hyperparameter set by default
model.numNodes

5

In [33]:
# Hyperparameter defined by us
model.depth

2

In [34]:
# Predictions with test data
forecasts = model.transform(test_data)

In [35]:
forecasts

DataFrame[result: string, label: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [36]:
forecasts.select("result", "label", "prediction", "probability").collect()

[Row(result='defeat', label=0.0, prediction=0.0, probability=DenseVector([0.9118, 0.0882, 0.0])),
 Row(result='defeat', label=0.0, prediction=0.0, probability=DenseVector([0.9118, 0.0882, 0.0])),
 Row(result='defeat', label=0.0, prediction=0.0, probability=DenseVector([0.9118, 0.0882, 0.0])),
 Row(result='defeat', label=0.0, prediction=0.0, probability=DenseVector([0.9118, 0.0882, 0.0])),
 Row(result='defeat', label=0.0, prediction=0.0, probability=DenseVector([0.9118, 0.0882, 0.0])),
 Row(result='defeat', label=0.0, prediction=0.0, probability=DenseVector([0.9118, 0.0882, 0.0])),
 Row(result='defeat', label=0.0, prediction=0.0, probability=DenseVector([0.9118, 0.0882, 0.0])),
 Row(result='victory', label=2.0, prediction=2.0, probability=DenseVector([0.0, 0.0, 1.0])),
 Row(result='victory', label=2.0, prediction=2.0, probability=DenseVector([0.0, 0.0, 1.0])),
 Row(result='victory', label=2.0, prediction=2.0, probability=DenseVector([0.0, 0.0, 1.0])),
 Row(result='victory', label=2.0, p

In [37]:
# Evaluating the accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", 
                                              labelCol = "label", 
                                              metricName = "accuracy")

In [38]:
evaluator.evaluate(forecasts)      

0.9636363636363636

In [39]:
# Summarizing predictions - Confusion Matrix
forecasts.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       2.0|   19|
|  0.0|       0.0|   18|
|  1.0|       1.0|   16|
|  1.0|       0.0|    2|
+-----+----------+-----+



## Disclaimer: 
A good part of this project was largely done in the Data Science Academy, Big Data Real-Time Analytics with Python and Spark course (part of the Data Scientist training)

# End