## **Install PySpark**

In [None]:
!pip install pyspark
!pip install findspark
# Alternatively, if you want to install a specific version of pyspark:
#!pip install pyspark==3.2.1 



## **Import all the useful packages**

In [None]:
from tqdm import tqdm

import requests
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as f
from pyspark.sql.functions import split, regexp_replace, year, month, dayofmonth, to_timestamp

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, Bucketizer
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator #, BinaryClassificationEvaluator 

# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## **Configuration of PySpark and check**

In [None]:
# Spark configuration: UI port plus executor/driver memory limits.
conf = (
    SparkConf()
    .set("spark.ui.port", "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
)

# Create the context, or reuse the one already running in this kernel.
# Calling SparkContext(conf=conf) directly makes the cell fail on re-run with
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Create (or fetch) the session; it attaches to the context created above.
spark = SparkSession.builder.getOrCreate()

In [None]:
spark

## **Data Acquisition**

### Mount Google Drive point

In [None]:
# Colab helper used to expose Google Drive inside the runtime's filesystem
from google.colab import drive

# Mount point, Drive home directory and the folder holding the project data
GDRIVE_DIR = "/content/gdrive"
GDRIVE_HOME_DIR = GDRIVE_DIR + "/My Drive"
GDRIVE_DATA_DIR = GDRIVE_HOME_DIR +  "/Sapienza/Primo Anno/Big Data Computing/Project"

# (Re)mount Google Drive at the mount point
drive.mount(GDRIVE_DIR, force_remount=True)

# The dataset is stored on Drive under the file name found at the end of its URL
DATASET_URL = "https://raw.githubusercontent.com/AndreaBe99/big-data-project/main/data/dataframe.csv"
GDRIVE_DATASET_FILE = GDRIVE_DATA_DIR + "/" + DATASET_URL.rsplit("/", 1)[-1]

Mounted at /content/gdrive


### Retrieve the dataset and store it

In [None]:
import requests

"""
This function downloads a file from a specific URL directly to Google Drive.
"""
def get_data(dataset_url, dest, chunk_size=1024):
  """Download the file at `dataset_url` and stream it into the file `dest`.

  Args:
    dataset_url: URL of the file to download.
    dest: path of the local file to create or overwrite.
    chunk_size: number of bytes requested per streamed block.

  Raises:
    requests.HTTPError: if the server answers with an error status. The
      function used to return silently in that case, so callers reported a
      successful download although nothing had been written.
  """
  # The context manager releases the streamed connection even when the
  # download fails half-way.
  with requests.get(dataset_url, stream=True) as response:
    response.raise_for_status()
    with open(dest, "wb") as file:
      for block in response.iter_content(chunk_size=chunk_size):
        if block:  # empty blocks are not written
          file.write(block)

In [None]:
print("Retrieving dataset from URL: {} ...".format(DATASET_URL))
get_data(DATASET_URL, GDRIVE_DATASET_FILE)
print("Dataset successfully retrieved and stored at: {}".format(GDRIVE_DATASET_FILE))

Retrieving dataset from URL: https://raw.githubusercontent.com/AndreaBe99/big-data-project/main/data/dataframe.csv ...
Dataset successfully retrieved and stored at: /content/gdrive/My Drive/Sapienza/Primo Anno/Big Data Computing/Project/dataframe.csv


### Read DataSet file into PySpark dataframe

In [None]:
# Semicolon-separated CSV with a header row; column types are inferred by Spark
spotify_tracks = (
    spark.read
    .format("csv")
    .option("sep", ";")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(GDRIVE_DATASET_FILE)
)

In [None]:
spotify_tracks.cache()

DataFrame[id: string, track_name: string, track_explicit: boolean, track_popularity: int, album_name: string, album_release_date: string, album_release_date_precision: string, artist_name: string, audio_avg_pitches: string, audio_avg_timbre: string, audio_acousticness: double, audio_danceability: double, audio_duration_ms: int, audio_energy: double, audio_instrumentalness: double, audio_key_1: int, audio_liveness: double, audio_loudness: double, audio_mode_1: int, audio_speechiness: double, audio_tempo: double, audio_time_signature: int, audio_valence: double, track_uri: string, track_genre: string]

### Check the shape of the loaded dataset, i.e., number of rows and columns

In [None]:
print("The shape of the dataset is {:d} rows by {:d} columns".format(spotify_tracks.count(), len(spotify_tracks.columns)))

The shape of the dataset is 79339 rows by 25 columns


### Print out the schema of the loaded dataset

In [None]:
spotify_tracks.printSchema()

root
 |-- id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_explicit: boolean (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- audio_avg_pitches: string (nullable = true)
 |-- audio_avg_timbre: string (nullable = true)
 |-- audio_acousticness: double (nullable = true)
 |-- audio_danceability: double (nullable = true)
 |-- audio_duration_ms: integer (nullable = true)
 |-- audio_energy: double (nullable = true)
 |-- audio_instrumentalness: double (nullable = true)
 |-- audio_key_1: integer (nullable = true)
 |-- audio_liveness: double (nullable = true)
 |-- audio_loudness: double (nullable = true)
 |-- audio_mode_1: integer (nullable = true)
 |-- audio_speechiness: double (nullable = true)
 |-- audio_tempo: double (nullable = true)
 |-- audio

# **Dataset Shape**

## **Print the first 5 rows of the loaded dataset**

In [None]:
spotify_tracks.show(5)

+--------------------+-----------------+--------------+----------------+--------------------+------------------+----------------------------+---------------+--------------------+--------------------+------------------+------------------+-----------------+------------+----------------------+-----------+--------------+--------------+------------+-----------------+-----------+--------------------+-------------+--------------------+-----------+
|                  id|       track_name|track_explicit|track_popularity|          album_name|album_release_date|album_release_date_precision|    artist_name|   audio_avg_pitches|    audio_avg_timbre|audio_acousticness|audio_danceability|audio_duration_ms|audio_energy|audio_instrumentalness|audio_key_1|audio_liveness|audio_loudness|audio_mode_1|audio_speechiness|audio_tempo|audio_time_signature|audio_valence|           track_uri|track_genre|
+--------------------+-----------------+--------------+----------------+--------------------+-----------------

## **Preprocessing operations**

In this section of the notebook we perform some preprocessing operations on our dataset (dropping useless features, encoding, ...).

### Drop duplicates and split the columns using PySpark

In [None]:
spotify_tracks = spotify_tracks.dropDuplicates()

In [None]:
# audio_avg_pitches is a string: strip the "[" and "]" characters first,
# keeping the result in a helper column next to all the existing ones
pitches_without_brackets = f.translate(f.col("audio_avg_pitches"), "[]", "").alias("audio_avg_pitches_list")
spotify_tracks = spotify_tracks.select(spotify_tracks.columns + [pitches_without_brackets])

# Then split on ", " and expose the 12 components as pitch_1 ... pitch_12
pitch_components = split(col("audio_avg_pitches_list"), ", ")
for position in range(12):
    spotify_tracks = spotify_tracks.withColumn("pitch_{}".format(position + 1),
                                               pitch_components.getItem(position))

spotify_tracks.printSchema()

root
 |-- id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_explicit: boolean (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- audio_avg_pitches: string (nullable = true)
 |-- audio_avg_timbre: string (nullable = true)
 |-- audio_acousticness: double (nullable = true)
 |-- audio_danceability: double (nullable = true)
 |-- audio_duration_ms: integer (nullable = true)
 |-- audio_energy: double (nullable = true)
 |-- audio_instrumentalness: double (nullable = true)
 |-- audio_key_1: integer (nullable = true)
 |-- audio_liveness: double (nullable = true)
 |-- audio_loudness: double (nullable = true)
 |-- audio_mode_1: integer (nullable = true)
 |-- audio_speechiness: double (nullable = true)
 |-- audio_tempo: double (nullable = true)
 |-- audio

In [None]:
# audio_avg_timbre is a string: strip the "[" and "]" characters first,
# keeping the result in a helper column next to all the existing ones
timbre_without_brackets = f.translate(f.col("audio_avg_timbre"), "[]", "").alias("audio_avg_timbre_list")
spotify_tracks = spotify_tracks.select(spotify_tracks.columns + [timbre_without_brackets])

# Then split on ", " and expose the 12 components as timbre_1 ... timbre_12
timbre_components = split(col("audio_avg_timbre_list"), ", ")
for position in range(12):
    spotify_tracks = spotify_tracks.withColumn("timbre_{}".format(position + 1),
                                               timbre_components.getItem(position))

spotify_tracks.printSchema()

root
 |-- id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_explicit: boolean (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- audio_avg_pitches: string (nullable = true)
 |-- audio_avg_timbre: string (nullable = true)
 |-- audio_acousticness: double (nullable = true)
 |-- audio_danceability: double (nullable = true)
 |-- audio_duration_ms: integer (nullable = true)
 |-- audio_energy: double (nullable = true)
 |-- audio_instrumentalness: double (nullable = true)
 |-- audio_key_1: integer (nullable = true)
 |-- audio_liveness: double (nullable = true)
 |-- audio_loudness: double (nullable = true)
 |-- audio_mode_1: integer (nullable = true)
 |-- audio_speechiness: double (nullable = true)
 |-- audio_tempo: double (nullable = true)
 |-- audio

In [None]:
# String to Date but return null value 
# spotify_tracks_dt = spotify_tracks.withColumn("album_release_date", to_date(col("album_release_date"),"yyyy-MM-dd"))
# spotify_tracks_dt.select("album_release_date").show()

def to_date_(col, formats=("yyyy-MM-dd", "y")):
    """Parse a date column by trying several formats in order.

    Args:
        col: the column (or column name) holding the date strings.
        formats: date patterns handed to `to_date`, tried from first to last.

    Returns:
        A column built with `coalesce` over one `to_date` expression per
        format, i.e. the first non-null parse result for each row.
    """
    # Spark 2.2 or later syntax, for < 2.2 use unix_timestamp and cast
    parse_attempts = []
    for date_format in formats:
        parse_attempts.append(to_date(col, date_format))
    return coalesce(*parse_attempts)

spotify_tracks = spotify_tracks.withColumn("album_release_date_td", to_date_("album_release_date"))
spotify_tracks.select("album_release_date_td").show()

+---------------------+
|album_release_date_td|
+---------------------+
|           1994-01-01|
|           2003-03-24|
|           2008-05-12|
|           2013-04-26|
|           2011-01-01|
|           2008-04-17|
|           2013-04-18|
|           1994-01-01|
|           1994-03-28|
|           2007-03-09|
|           2009-07-10|
|           2010-02-07|
|           2011-03-21|
|           2009-05-03|
|           2001-01-01|
|           2008-06-02|
|           2010-03-29|
|           2012-06-28|
|           2007-04-02|
|           2011-08-08|
+---------------------+
only showing top 20 rows



In [None]:
# Derive day, month and year columns from the parsed release date
release_date = col('album_release_date_td')
spotify_tracks = (
    spotify_tracks
    .withColumn('day', dayofmonth(release_date))
    .withColumn('month', month(release_date))
    .withColumn('year', year(release_date))
)

In [None]:
spotify_tracks.printSchema()

root
 |-- id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_explicit: boolean (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- audio_avg_pitches: string (nullable = true)
 |-- audio_avg_timbre: string (nullable = true)
 |-- audio_acousticness: double (nullable = true)
 |-- audio_danceability: double (nullable = true)
 |-- audio_duration_ms: integer (nullable = true)
 |-- audio_energy: double (nullable = true)
 |-- audio_instrumentalness: double (nullable = true)
 |-- audio_key_1: integer (nullable = true)
 |-- audio_liveness: double (nullable = true)
 |-- audio_loudness: double (nullable = true)
 |-- audio_mode_1: integer (nullable = true)
 |-- audio_speechiness: double (nullable = true)
 |-- audio_tempo: double (nullable = true)
 |-- audio

In [None]:
# Columns no longer needed: the raw/parsed release date, the stringified
# pitch/timbre vectors (and their bracket-free copies), the track URI and id
cols = (
    "album_release_date",
    "album_release_date_td",
    "audio_avg_pitches",
    "audio_avg_timbre",
    "audio_avg_pitches_list",
    "audio_avg_timbre_list",
    "track_uri",
    "id",
)
spotify_tracks = spotify_tracks.select(
    [kept for kept in spotify_tracks.columns if kept not in cols]
)

spotify_tracks.printSchema()

root
 |-- track_name: string (nullable = true)
 |-- track_explicit: boolean (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- audio_acousticness: double (nullable = true)
 |-- audio_danceability: double (nullable = true)
 |-- audio_duration_ms: integer (nullable = true)
 |-- audio_energy: double (nullable = true)
 |-- audio_instrumentalness: double (nullable = true)
 |-- audio_key_1: integer (nullable = true)
 |-- audio_liveness: double (nullable = true)
 |-- audio_loudness: double (nullable = true)
 |-- audio_mode_1: integer (nullable = true)
 |-- audio_speechiness: double (nullable = true)
 |-- audio_tempo: double (nullable = true)
 |-- audio_time_signature: integer (nullable = true)
 |-- audio_valence: double (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- pitch_1: string (nullable = true)
 |-- pitch_

In [None]:
# The pitch_* / timbre_* columns were produced by split() and are strings:
# cast each of them to double
double_columns = (["pitch_{}".format(n) for n in range(1, 13)]
                  + ["timbre_{}".format(n) for n in range(1, 13)])
for column_name in double_columns:
    spotify_tracks = spotify_tracks.withColumn(column_name,
                                               spotify_tracks[column_name].cast('double'))

# Date components are cast to int
for column_name in ("year", "day", "month"):
    spotify_tracks = spotify_tracks.withColumn(column_name,
                                               spotify_tracks[column_name].cast('int'))

spotify_tracks.printSchema()


root
 |-- track_name: string (nullable = true)
 |-- track_explicit: boolean (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- audio_acousticness: double (nullable = true)
 |-- audio_danceability: double (nullable = true)
 |-- audio_duration_ms: integer (nullable = true)
 |-- audio_energy: double (nullable = true)
 |-- audio_instrumentalness: double (nullable = true)
 |-- audio_key_1: integer (nullable = true)
 |-- audio_liveness: double (nullable = true)
 |-- audio_loudness: double (nullable = true)
 |-- audio_mode_1: integer (nullable = true)
 |-- audio_speechiness: double (nullable = true)
 |-- audio_tempo: double (nullable = true)
 |-- audio_time_signature: integer (nullable = true)
 |-- audio_valence: double (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- pitch_1: double (nullable = true)
 |-- pitch_

In [None]:
# Switch the session to the legacy date/time parser before scanning the data.
# NOTE(review): presumably needed by the lazily evaluated to_date() patterns
# applied to album_release_date - confirm.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

# Number of null values in each column: when() yields a (non-null) literal only
# for the rows where the column is null, and count() ignores the other rows
null_counters = []
for column_name in spotify_tracks.columns:
    null_marker = f.when(f.isnull(column_name), column_name)
    null_counters.append(f.count(null_marker).alias(column_name))

spotify_tracks.agg(*null_counters).show()

+----------+--------------+----------------+----------+----------------------------+-----------+------------------+------------------+-----------------+------------+----------------------+-----------+--------------+--------------+------------+-----------------+-----------+--------------------+-------------+-----------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---+-----+----+
|track_name|track_explicit|track_popularity|album_name|album_release_date_precision|artist_name|audio_acousticness|audio_danceability|audio_duration_ms|audio_energy|audio_instrumentalness|audio_key_1|audio_liveness|audio_loudness|audio_mode_1|audio_speechiness|audio_tempo|audio_time_signature|audio_valence|track_genre|pitch_1|pitch_2|pitch_3|pitch_4|pitch_5|pitch_6|pitch_7|pitch_8|pitch_9|pitch_10|pitch_11|pitch_12|timbre_1|timbre_2|timbre_3|timbre_4

In [None]:
spotify_tracks = spotify_tracks.dropna()

In [None]:
# Reverse json 

import json

# Load the genre mapping from the JSON file.
# The handle is deliberately NOT called `f`: that name is the alias of
# pyspark.sql.functions in this notebook, and rebinding it breaks every cell
# using f.col / f.translate / f.count when it is run again.
# `with` also closes the file if json.load raises.
with open('map_genre_small.json') as genre_file:
    map_genre = json.load(genre_file)

def invert_dict(d):
    """Invert a mapping whose values are collections of items.

    Args:
        d: dict mapping each key to an iterable of items.

    Returns:
        A dict mapping each item to the key under which it is listed. When an
        item is listed under several keys, the first key encountered wins.
    """
    inverse = {}
    for key, items in d.items():
        for item in items:
            # setdefault keeps the key already stored for a repeated item
            inverse.setdefault(item, key)
    return inverse

map_genre_inv = invert_dict(map_genre)

In [None]:
def _lookup_mapped_genre(genre):
    """Return the entry of `map_genre_inv` for `genre` (KeyError if absent)."""
    return map_genre_inv[genre]

# Replace every track_genre value with its entry in the inverted genre map
udf_foo = udf(_lookup_mapped_genre, StringType())
spotify_tracks = spotify_tracks.withColumn("track_genre", udf_foo(spotify_tracks["track_genre"]))

In [None]:
spotify_tracks.groupBy('track_genre').count().toPandas().transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
track_genre,pop,alternative,ambient,electronic,jazz,country,musical,metal,unclassified,rock,classical,latin
count,5995,6000,2722,13579,2896,5315,3189,3600,8950,5283,2393,5028


In [None]:
spotify_tracks = spotify_tracks.filter(spotify_tracks.track_genre != 'unclassified')
spotify_tracks.groupBy('track_genre').count().toPandas().transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
track_genre,pop,alternative,ambient,electronic,jazz,country,musical,metal,rock,classical,latin
count,5995,6000,2722,13579,2896,5315,3189,3600,5283,2393,5028


In [None]:
spotify_tracks.cache()

DataFrame[track_name: string, track_explicit: boolean, track_popularity: int, album_name: string, album_release_date_precision: string, artist_name: string, audio_acousticness: double, audio_danceability: double, audio_duration_ms: int, audio_energy: double, audio_instrumentalness: double, audio_key_1: int, audio_liveness: double, audio_loudness: double, audio_mode_1: int, audio_speechiness: double, audio_tempo: double, audio_time_signature: int, audio_valence: double, track_genre: string, pitch_1: double, pitch_2: double, pitch_3: double, pitch_4: double, pitch_5: double, pitch_6: double, pitch_7: double, pitch_8: double, pitch_9: double, pitch_10: double, pitch_11: double, pitch_12: double, timbre_1: double, timbre_2: double, timbre_3: double, timbre_4: double, timbre_5: double, timbre_6: double, timbre_7: double, timbre_8: double, timbre_9: double, timbre_10: double, timbre_11: double, timbre_12: double, day: int, month: int, year: int]

### Genre Prediction Dataset

In [None]:
# Balance and create the dataset for genre prediction:
# sample every genre down to roughly the same number of tracks.
TARGET_TRACKS_PER_GENRE = 2000

genre_counts = spotify_tracks.groupBy('track_genre').count().collect()

# sampleBy() only accepts fractions in [0, 1]. Capping at 1.0 keeps a genre
# with fewer tracks than the target entirely instead of making the call fail.
fraction = {
    row['track_genre']: min(1.0, TARGET_TRACKS_PER_GENRE / row['count'])
    for row in genre_counts
}

spotify_tracks_genre = spotify_tracks.sampleBy("track_genre", fractions=fraction, seed=0)
spotify_tracks_genre.groupBy('track_genre').count().toPandas().transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
track_genre,pop,alternative,ambient,electronic,jazz,country,musical,metal,rock,classical,latin
count,2080,2011,1990,2067,2016,1993,2020,2001,1989,1963,1989


### Popularity Prediction Dataset

#### **Discretize the target class**

In this task our target class is the popularity of the track, i.e. the "track_popularity" feature in our dataset. We want to discretize this class into 10 classes (0-9). A popularity equal to 0 means that the track is unknown, while a popularity equal to 9 means that the track is known by everyone.

#### Bucketizer to discretize the track_popularity

In [None]:
# Bucket boundaries: one every 10 popularity points, from 0 to 100 included
splits = list(range(0, 101, 10))
splits

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [None]:
# Assign every track to the bucket its track_popularity falls into
buck = Bucketizer(
    splits=splits,
    inputCol="track_popularity",
    outputCol="track_popularity_bucket",
)

# The raw track_popularity column is dropped once the bucket is available
spotify_tracks_buck = buck.transform(spotify_tracks).drop("track_popularity")

spotify_tracks_buck.groupBy('track_popularity_bucket').count().show()

+-----------------------+-----+
|track_popularity_bucket|count|
+-----------------------+-----+
|                    0.0|31380|
|                    7.0|  849|
|                    1.0| 4925|
|                    4.0| 3835|
|                    3.0| 4833|
|                    2.0| 4978|
|                    6.0| 2116|
|                    5.0| 3003|
|                    8.0|   81|
+-----------------------+-----+



In [None]:
# Give a name to each numeric bucket:
#   bucket 0         -> "unknow"
#   buckets 1 and 2  -> "low"
#   buckets 3 to 5   -> "medium"
#   buckets 6 to 9   -> "high"
#
# A udf replaces the numeric bucket with its name in the same column.

bucket_names = (
    ("unknow", (0.0,)),
    ("low", (1.0, 2.0)),
    ("medium", (3.0, 4.0, 5.0)),
    ("high", (6.0, 7.0, 8.0, 9.0)),
)
t = {bucket: name for name, buckets in bucket_names for bucket in buckets}
udf_foo = udf(lambda x: t[x], StringType())
spotify_tracks_buck = spotify_tracks_buck.withColumn("track_popularity_bucket", udf_foo("track_popularity_bucket"))

In [None]:
spotify_tracks_buck.groupBy('track_popularity_bucket').count().toPandas().transpose()

Unnamed: 0,0,1,2,3
track_popularity_bucket,low,high,medium,unknow
count,9903,3046,11671,31380


Balance the dataset


In [None]:
# Sample every popularity class down to roughly the same number of tracks.
TARGET_TRACKS_PER_POPULARITY_CLASS = 3000

popularity_counts = spotify_tracks_buck.groupBy('track_popularity_bucket').count().collect()

# sampleBy() only accepts fractions in [0, 1]. Capping at 1.0 keeps a class
# with fewer tracks than the target entirely instead of making the call fail.
fraction = {
    row['track_popularity_bucket']: min(1.0, TARGET_TRACKS_PER_POPULARITY_CLASS / row['count'])
    for row in popularity_counts
}

spotify_tracks_popularity = spotify_tracks_buck.sampleBy("track_popularity_bucket", fractions=fraction, seed=0)
spotify_tracks_popularity.groupBy('track_popularity_bucket').count().toPandas().transpose()

Unnamed: 0,0,1,2,3
track_popularity_bucket,low,high,medium,unknow
count,2970,3012,3067,3138


# **Data Exploration && Encoding**

## **Summary of Descriptive Statistics**

In [None]:
# Set some default plotting configuration using seaborn properties
sns.set_style("darkgrid")
sns.set_context("notebook", rc={"lines.linewidth": 2, 
                                "xtick.labelsize":14, 
                                "ytick.labelsize":14,
                                "axes.labelsize": 18,
                                "axes.titlesize": 20,
                                })

### **Analysis of Data Distributions for the Popularity Model**

#### 1. Distributions of individual numerical features:

In [None]:
# Let's define some constants which we will use throughout this notebook
NUMERICAL_FEATURES_POPULARITY = []
CATEGORICAL_FEATURES_POPULARITY = []
TARGET_VARIABLE_POPULARITY = "track_popularity_bucket"

# Get all column names and their types: string columns are treated as
# categorical, every other column as numerical.
# The loop variables must not be called `col`: that would rebind the `col`
# function imported from pyspark.sql.functions and break the cells using it.
for column_name, column_type in spotify_tracks_popularity.dtypes:
    if column_type == "string":
        CATEGORICAL_FEATURES_POPULARITY.append(column_name)
    else:
        NUMERICAL_FEATURES_POPULARITY.append(column_name)

# The target is a string column too, but it is not a feature
CATEGORICAL_FEATURES_POPULARITY.remove(TARGET_VARIABLE_POPULARITY)

print("Categorical Popularity Dataset: ", CATEGORICAL_FEATURES_POPULARITY)
print("Numerical Popularity Dataset: ", NUMERICAL_FEATURES_POPULARITY)

Categorical Popularity Dataset:  ['track_name', 'album_name', 'album_release_date_precision', 'artist_name', 'track_genre']
Numerical Popularity Dataset:  ['track_explicit', 'audio_acousticness', 'audio_danceability', 'audio_duration_ms', 'audio_energy', 'audio_instrumentalness', 'audio_key_1', 'audio_liveness', 'audio_loudness', 'audio_mode_1', 'audio_speechiness', 'audio_tempo', 'audio_time_signature', 'audio_valence', 'pitch_1', 'pitch_2', 'pitch_3', 'pitch_4', 'pitch_5', 'pitch_6', 'pitch_7', 'pitch_8', 'pitch_9', 'pitch_10', 'pitch_11', 'pitch_12', 'timbre_1', 'timbre_2', 'timbre_3', 'timbre_4', 'timbre_5', 'timbre_6', 'timbre_7', 'timbre_8', 'timbre_9', 'timbre_10', 'timbre_11', 'timbre_12', 'day', 'month', 'year']


In [None]:
# Plot the distribution of values of each numerical column.
# seaborn needs pandas/numpy data, not Spark columns: bring the numerical
# columns of the (already down-sampled) dataset to the driver. Everything is
# cast to float so that boolean columns can be binned like the others.
numerical_popularity_pd = (
    spotify_tracks_popularity
    .select(NUMERICAL_FEATURES_POPULARITY)
    .toPandas()
    .astype(float)
)

# Size the grid from the number of features (a fixed 4x2 grid overflows as
# soon as there are more than 8 of them).
n_cols = 2
n_rows = max(1, -(-len(NUMERICAL_FEATURES_POPULARITY) // n_cols))  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)

# `feature` rather than `f`: `f` is the alias of pyspark.sql.functions
for i, feature in enumerate(NUMERICAL_FEATURES_POPULARITY):
    _ = sns.distplot(numerical_popularity_pd[feature],
                    kde_kws={"color": "#ca0020", "lw": 1},
                    hist_kws={"histtype": "bar", "edgecolor": "k", "linewidth": 1,"alpha": 0.8, "color": "#92c5de"},
                    ax=axes[i//n_cols, i%n_cols]
                    )

# Remove the cells of the grid that did not receive a plot
for unused_ax in axes.ravel()[len(NUMERICAL_FEATURES_POPULARITY):]:
    fig.delaxes(unused_ax)

fig.tight_layout(pad=1.5)

#### 2. Pairwise regression plots

In [None]:
# Let's now plot the pairwise relationship between our numerical features.
# seaborn needs a pandas DataFrame, so the numerical features and the target
# are brought to the driver first; numerical columns are cast to float so that
# boolean columns can be regressed like the others.
popularity_pairplot_pd = (
    spotify_tracks_popularity
    .select(NUMERICAL_FEATURES_POPULARITY + [TARGET_VARIABLE_POPULARITY])
    .toPandas()
)
popularity_pairplot_pd[NUMERICAL_FEATURES_POPULARITY] = (
    popularity_pairplot_pd[NUMERICAL_FEATURES_POPULARITY].astype(float)
)

# NOTE(review): with every numerical feature in `vars` the grid is very large
# and slow to draw - consider restricting it to the features of interest.
# No `markers` list is passed: it must contain one marker per hue level, and
# the target has more than the two levels the former ["o", "s"] list covered.
_ = sns.pairplot(data=popularity_pairplot_pd,
                 vars=sorted(NUMERICAL_FEATURES_POPULARITY),
                 hue=TARGET_VARIABLE_POPULARITY,
                 kind="reg",
                 diag_kind='hist',
                 diag_kws = {'alpha':0.55, 'bins':20}
                )

### **Analysis of Data Distributions for the Genre Model**

In [None]:
# Let's define some constants which we will use throughout this notebook
NUMERICAL_FEATURES_GENRE = []
CATEGORICAL_FEATURES_GENRE = []
TARGET_VARIABLE_GENRE = "track_genre"

# Get all column names and their types: string columns are treated as
# categorical, every other column as numerical.
# The loop variables must not be called `col`: that would rebind the `col`
# function imported from pyspark.sql.functions and break the cells using it.
for column_name, column_type in spotify_tracks_genre.dtypes:
    if column_type == "string":
        CATEGORICAL_FEATURES_GENRE.append(column_name)
    else:
        NUMERICAL_FEATURES_GENRE.append(column_name)

# The target is a string column too, but it is not a feature
CATEGORICAL_FEATURES_GENRE.remove(TARGET_VARIABLE_GENRE)

print("Categorical Genre Dataset: ", CATEGORICAL_FEATURES_GENRE)
print("Numerical Genre Dataset: ", NUMERICAL_FEATURES_GENRE)

Categorical Genre Dataset:  ['track_name', 'album_name', 'album_release_date_precision', 'artist_name']
Numerical Genre Dataset:  ['track_explicit', 'track_popularity', 'audio_acousticness', 'audio_danceability', 'audio_duration_ms', 'audio_energy', 'audio_instrumentalness', 'audio_key_1', 'audio_liveness', 'audio_loudness', 'audio_mode_1', 'audio_speechiness', 'audio_tempo', 'audio_time_signature', 'audio_valence', 'pitch_1', 'pitch_2', 'pitch_3', 'pitch_4', 'pitch_5', 'pitch_6', 'pitch_7', 'pitch_8', 'pitch_9', 'pitch_10', 'pitch_11', 'pitch_12', 'timbre_1', 'timbre_2', 'timbre_3', 'timbre_4', 'timbre_5', 'timbre_6', 'timbre_7', 'timbre_8', 'timbre_9', 'timbre_10', 'timbre_11', 'timbre_12', 'day', 'month', 'year']


### **Analysis of Data Distributions: Categorical Features**

#### 1. Histograms of individual categorical features

In [None]:
# For categorical variables, 'countplot' is the way to go.
# seaborn needs pandas data, not Spark columns: bring the categorical columns
# of the (already down-sampled) genre dataset to the driver.
categorical_genre_pd = spotify_tracks_genre.select(CATEGORICAL_FEATURES_GENRE).toPandas()

# Free-text columns can have a very large number of distinct values: only the
# most frequent ones are drawn so that the bars stay readable.
TOP_CATEGORIES = 20

# Size the grid from the number of features instead of hard-coding 3x3
n_cols = 3
n_rows = max(1, -(-len(CATEGORICAL_FEATURES_GENRE) // n_cols))  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)

# `feature` rather than `f`: `f` is the alias of pyspark.sql.functions
for i, feature in enumerate(sorted(CATEGORICAL_FEATURES_GENRE)):
    most_frequent = categorical_genre_pd[feature].value_counts().index[:TOP_CATEGORIES]
    ax = sns.countplot(x=categorical_genre_pd[feature], order=most_frequent, ax=axes[i//n_cols, i%n_cols])
    _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

# Remove the cells of the grid that did not receive a plot
for unused_ax in axes.ravel()[len(CATEGORICAL_FEATURES_GENRE):]:
    fig.delaxes(unused_ax)

fig.tight_layout(pad=1.5)

#### 2. Relationship between _categorical_ features and the _target variable_ (`track_genre`)

In [None]:
# pandas.crosstab works on pandas data: bring the categorical features and the
# target of the (already down-sampled) genre dataset to the driver.
genre_crosstab_pd = (
    spotify_tracks_genre
    .select(CATEGORICAL_FEATURES_GENRE + [TARGET_VARIABLE_GENRE])
    .toPandas()
)

# Free-text columns can have a very large number of distinct values: only the
# most frequent ones are drawn so that the bars stay readable.
TOP_CATEGORIES = 20

# Size the grid from the number of features instead of hard-coding 3x3
n_cols = 3
n_rows = max(1, -(-len(CATEGORICAL_FEATURES_GENRE) // n_cols))  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)

for i, c in enumerate(sorted(CATEGORICAL_FEATURES_GENRE)):
    most_frequent = genre_crosstab_pd[c].value_counts().index[:TOP_CATEGORIES]
    most_frequent_rows = genre_crosstab_pd[genre_crosstab_pd[c].isin(most_frequent)]
    # pandas.crosstab returns an mxn table where m is the number of values of
    # the first argument (the feature) and n the number of values of the second
    # one, which is always the target variable (`track_genre`).
    # normalize='index' divides every row by its total, so each row shows how
    # one value of the feature is distributed across the genres.
    tmp_data = pd.crosstab(most_frequent_rows[c],
                           most_frequent_rows[TARGET_VARIABLE_GENRE],
                           normalize='index')
    # No explicit colour list: the target has more than two classes, so the
    # default colour cycle is used to give each genre its own colour.
    ax = tmp_data.plot.bar(stacked=True, grid=False, ax=axes[i//n_cols, i % n_cols], legend=True)
    _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

# Remove the cells of the grid that did not receive a plot
for unused_ax in axes.ravel()[len(CATEGORICAL_FEATURES_GENRE):]:
    fig.delaxes(unused_ax)

fig.tight_layout(pad=1.5)

## **Encoding phase**

### Transform Categorical features into Numerical using One-Hot Encoding

Note that this step is not always mandatory (e.g., decision trees are able to work nicely with categorical features without the need of transforming them to numerical). Still, other methods (like logistic regression) are designed to operate with numerical inputs only.

To transform _categorical_ features into _numerical_ ones we proceed as follows.
We setup a pipeline which is composed of the following steps:
- [`StringIndexer`](https://spark.apache.org/docs/latest/ml-features#stringindexer): encodes a string column of labels to a column of label indices. The indices are in `[0, numLabels)`, and 4 ordering options are supported (default `frequencyDesc`, which assigns the most frequent label the index `0`, and so on and so forth).
- [`OneHotEncoder`](https://spark.apache.org/docs/latest/ml-features#onehotencoder) (called `OneHotEncoderEstimator` before Spark 3.0): maps a categorical feature, represented as a label index, to a binary vector with at most a single one-value indicating the presence of a specific feature value from among the set of all feature values. An important parameter is `handleInvalid`, which indicates how to deal with previously unseen labels. By default this raises an error, but it can be set to `keep` to assign previously unseen labels a fallback value.
- [`VectorAssembler`](https://spark.apache.org/docs/latest/ml-features#vectorassembler): is a transformer that combines a given list of columns into a single vector column.

In [None]:
def to_numerical(df, numerical_features, categorical_features, target_variable):
    """Index, assemble and standardize the columns of `df` for model training.

    A single pipeline is fitted on `df` and then applied to that same `df`.
    Its stages, in order, are:

    1. a `StringIndexer` turning `target_variable` into the `label` column;
    2. one `StringIndexer` per categorical feature, writing `<name>_index`
       (configured with `handleInvalid="skip"`);
    3. a `VectorAssembler` packing the indexed categorical columns followed by
       `numerical_features` into the `features` vector;
    4. a `StandardScaler` (centering and scaling both enabled) writing
       `std_features`.

    Categorical features are only label-indexed here: no one-hot encoding
    stage is part of the pipeline.

    Args:
        df: input DataFrame holding the feature and target columns.
        numerical_features: list of numerical column names.
        categorical_features: iterable of categorical column names.
        target_variable: name of the target column.

    Returns:
        The transformed DataFrame, i.e. `df` plus the columns listed above.
    """
    # Target column -> `label`
    label_indexer = StringIndexer(inputCol=target_variable, outputCol='label')

    # One indexer per categorical column; remember the produced column names
    # so the assembler can pick them up.
    categorical_indexers = []
    indexed_columns = []
    for column in categorical_features:
        indexer = StringIndexer(inputCol=column,
                                outputCol="{0}_index".format(column),
                                handleInvalid="skip")
        categorical_indexers.append(indexer)
        indexed_columns.append(indexer.getOutputCol())

    # Indexed categorical columns first, numerical columns after
    assembler = VectorAssembler(inputCols=indexed_columns + numerical_features,
                                outputCol='features')

    # Standardize the assembled vector into `std_features`
    standardizer = StandardScaler(inputCol=assembler.getOutputCol(),
                                  outputCol="std_" + assembler.getOutputCol(),
                                  withStd=True,
                                  withMean=True)

    all_stages = [label_indexer]
    all_stages.extend(categorical_indexers)
    all_stages.append(assembler)
    all_stages.append(standardizer)

    # Fit on `df` and transform the very same `df`
    fitted_pipeline = Pipeline(stages=all_stages).fit(df)
    return fitted_pipeline.transform(df)

In [None]:
# Mark the full tracks DataFrame for caching; as the last expression of the
# cell, the returned DataFrame (and hence its schema) is echoed as output.
spotify_tracks.cache()

DataFrame[track_name: string, track_explicit: boolean, track_popularity: int, album_name: string, album_release_date_precision: string, artist_name: string, audio_acousticness: double, audio_danceability: double, audio_duration_ms: int, audio_energy: double, audio_instrumentalness: double, audio_key_1: int, audio_liveness: double, audio_loudness: double, audio_mode_1: int, audio_speechiness: double, audio_tempo: double, audio_time_signature: int, audio_valence: double, track_genre: string, pitch_1: double, pitch_2: double, pitch_3: double, pitch_4: double, pitch_5: double, pitch_6: double, pitch_7: double, pitch_8: double, pitch_9: double, pitch_10: double, pitch_11: double, pitch_12: double, timbre_1: double, timbre_2: double, timbre_3: double, timbre_4: double, timbre_5: double, timbre_6: double, timbre_7: double, timbre_8: double, timbre_9: double, timbre_10: double, timbre_11: double, timbre_12: double, day: int, month: int, year: int]

## **Split the Dataset into train and test set**
### **Dataset Splitting: Training vs. Test Set**

Before moving along with any preprocessing involving data transformations, we will split our dataset into **2** portions:
- _training set_ (e.g., accounting for **80%** of the total number of instances);
- _test set_ (e.g., accounting for the remaining **20%** of instances)

In [None]:
# GENRE DATASET

# Encode and standardize the WHOLE genre dataset (this happens before the split).
# NOTE(review): the indexers and the scaler are therefore fitted on rows that
# later land in the test set; fit them on the training split only if leakage
# of test-set statistics must be avoided.
spotify_tracks_genre = to_numerical(spotify_tracks_genre, NUMERICAL_FEATURES_GENRE, CATEGORICAL_FEATURES_GENRE, TARGET_VARIABLE_GENRE)

# Keep only the standardized feature vector and the indexed target (`label`),
# and cache just this narrow projection. Caching the wide intermediate frame as
# well would pin every original column in memory for no benefit.
spotify_tracks_genre = spotify_tracks_genre.select(["std_features", "label"])
spotify_tracks_genre.cache()

RANDOM_SEED = 42
# Randomly split the genre dataset 80/20 into training and test set, respectively
train_set_genre, test_set_genre = spotify_tracks_genre.randomSplit([0.8, 0.2], seed=RANDOM_SEED)

train_set_genre.show()

+--------------------+-----+
|        std_features|label|
+--------------------+-----+
|[-1.5432572913072...|  8.0|
|[-1.5304963122137...|  4.0|
|[-1.5281036286336...|  2.0|
|[-1.5231587492349...|  1.0|
|[-1.4638201964498...|  6.0|
|[-1.4571206824257...|  0.0|
|[-1.4504211684015...|  0.0|
|[-1.4256967714078...|  6.0|
|[-1.4068743272447...|  4.0|
|[-1.4027270090393...|  2.0|
|[-1.3757694407042...|  2.0|
|[-1.3596587045985...|  2.0|
|[-1.3392411380488...| 10.0|
|[-1.3162713756804...| 10.0|
|[-1.2944181989827...| 10.0|
|[-1.2097172002491...|  1.0|
|[-1.1977537823489...|  7.0|
|[-1.1105005877967...|  2.0|
|[-1.0604137448544...|  1.0|
|[-0.8914902841033...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [None]:
# POPULARITY DATASET

# Encode and standardize the WHOLE popularity dataset (this happens before the split).
# NOTE(review): the indexers and the scaler are therefore fitted on rows that
# later land in the test set; fit them on the training split only if leakage
# of test-set statistics must be avoided.
spotify_tracks_popularity = to_numerical(spotify_tracks_popularity, NUMERICAL_FEATURES_POPULARITY, CATEGORICAL_FEATURES_POPULARITY, TARGET_VARIABLE_POPULARITY)

# Keep only the standardized feature vector and the indexed target (`label`),
# and cache just this narrow projection. Caching the wide intermediate frame as
# well would pin every original column in memory for no benefit.
spotify_tracks_popularity = spotify_tracks_popularity.select(["std_features", "label"])
spotify_tracks_popularity.cache()

RANDOM_SEED = 42
# Randomly split the popularity dataset 80/20 into training and test set, respectively
train_set_popularity, test_set_popularity = spotify_tracks_popularity.randomSplit([0.8, 0.2], seed=RANDOM_SEED)

train_set_popularity.show()

+--------------------+-----+
|        std_features|label|
+--------------------+-----+
|[-1.5702805019363...|  0.0|
|[-1.5619422607606...|  1.0|
|[-1.5268641427108...|  2.0|
|[-1.5127753903793...|  2.0|
|[-1.5087500325703...|  0.0|
|[-1.4072535106722...|  0.0|
|[-1.2111610802629...|  1.0|
|[-1.1599815309772...|  3.0|
|[-1.1594064798616...|  3.0|
|[-0.9745275462059...|  0.0|
|[-0.9075340912420...|  3.0|
|[-0.8692931920566...|  1.0|
|[-0.7948240725903...|  1.0|
|[-0.6358224391352...|  1.0|
|[-0.6016068977588...|  3.0|
|[-0.4670449367154...|  1.0|
|[-0.4480682499016...|  2.0|
|[-0.3641107870284...|  1.0|
|[-0.3204069022451...|  1.0|
|[-0.2097095624979...|  2.0|
+--------------------+-----+
only showing top 20 rows



# Training

## Decision Tree

In [None]:
def decision_tree_pipeline(train, seed=42):
  """Tune a decision tree classifier with 5-fold cross validation.

  The grid explores `maxDepth` in [3, 5, 8] and `impurity` in
  ["gini", "entropy"]; candidates are scored with a default
  `MulticlassClassificationEvaluator`.

  Args:
      train: training DataFrame with `std_features` and `label` columns.
      seed: seed for the classifier and for the fold assignment, so that
          repeated runs give the same folds. Pass `None` to fall back to the
          library defaults.

  Returns:
      The fitted `CrossValidatorModel` (sub-models are collected).
  """
  decision_tree = DecisionTreeClassifier(featuresCol='std_features', labelCol='label', seed=seed)

  # Single-stage pipeline, so callers can address the model as `bestModel.stages[-1]`
  pipeline = Pipeline(stages=[decision_tree])

  param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [3, 5, 8]) \
    .addGrid(decision_tree.impurity, ["gini", "entropy"]) \
    .build()

  cross_validator = CrossValidator(estimator=pipeline,
                                   estimatorParamMaps=param_grid,
                                   evaluator=MulticlassClassificationEvaluator(),
                                   numFolds=5,
                                   seed=seed,
                                   collectSubModels=True
                                   )

  return cross_validator.fit(train)

In [None]:
# Cross-validate the decision tree on the genre training split
cv_model_genre_dt = decision_tree_pipeline(train=train_set_genre)

# Apply the cross-validated model to the genre test split
test_predictions_genre_dt = cv_model_genre_dt.transform(dataset=test_set_genre)

# Preview a few predictions next to the true labels
test_predictions_genre_dt.select(["std_features", "prediction", "label"]).show(n=5)

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 46818)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/py4j/clientserver.py", line 480, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.7/dist-packages/py4j/clientserver.py", line 504, in send_command
    "Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recent call last):
  File "/usr/lib/python3.7/socketserver.py", line 316, in _handle_request

Py4JError: ignored

In [None]:
# Cross-validate the decision tree on the popularity training split
cv_model_popularity_dt = decision_tree_pipeline(train=train_set_popularity)

# Apply the cross-validated model to the popularity test split
test_predictions_popularity_dt = cv_model_popularity_dt.transform(dataset=test_set_popularity)

# Preview a few predictions next to the true labels
test_predictions_popularity_dt.select(["std_features", "prediction", "label"]).show(n=5)

+--------------------+----------+-----+
|        std_features|prediction|label|
+--------------------+----------+-----+
|[-1.5702805019363...|       0.0|  0.0|
|[-1.5619422607606...|       1.0|  1.0|
|[-1.5268641427108...|       2.0|  2.0|
|[-1.5127753903793...|       2.0|  2.0|
|[-1.5087500325703...|       2.0|  0.0|
+--------------------+----------+-----+
only showing top 5 rows



## **Logistic Regression**

We first train a logistic regression model, using the training set above. To do so, we use the `LogisticRegression` object provided by the [PySpark API](https://spark.apache.org/docs/latest/ml-classification-regression.html#logistic-regression) within the package `pyspark.ml.classification`.

The API is similar to the one we have seen for Linear Regression (i.e., implementing the **Elastic Net** regularization framework), except for the loss function which now is **cross-entropy** rather than **mean squared error**:
$$
\boldsymbol{\theta}^* = \text{argmin}_{\boldsymbol{\theta}\in \mathbb{R}^n} \frac{1}{m} \sum_{i=1}^m \log_e(1 + e^{-y_i\boldsymbol{\theta}^T\mathbf{x}_i}) + \lambda\Big(\alpha \|\boldsymbol{\theta}\|_1 + \frac{1-\alpha}{2}\|\boldsymbol{\theta}\|_2^2\Big)
$$
In particular, we can specify the following parameters:

- `regParam` is the regularization parameter (or $\lambda$);
- `elasticNetParam` is the tradeoff parameter for regularization penalties (or $\alpha$);
  - `regParam = 0` and `elasticNetParam = 0` means there is no regularization;
  - `regParam > 0` and `elasticNetParam = 0` means there is only L2-regularization; 
  - `regParam > 0` and `elasticNetParam = 1` means there is only L1-regularization;
  - `regParam > 0` and `0 < elasticNetParam < 1` means there is both L1- and L2-regularization (Elastic Net);

As is always the case, the optimal values of these **hyperparameters** should be tuned using a dedicated portion of the dataset (i.e., a **validation set**) or by performing $k$**-fold cross validation**.

In [None]:
def logistic_regression_pipeline(train, seed=42):
    """Tune a logistic regression classifier with 5-fold cross validation.

    The grid explores `regParam` in [0.01, 0.1, 1.0] and `maxIter` in
    [10, 20, 50]; candidates are scored with a default
    `MulticlassClassificationEvaluator`. `elasticNetParam` is not part of the
    grid, so every candidate uses the estimator's default value; add e.g.
    `.addGrid(logistic_regression.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])`
    to tune it as well.

    Args:
        train: training DataFrame with `std_features` and `label` columns.
        seed: seed for the fold assignment, so that repeated runs give the
            same folds. Pass `None` to fall back to the library default.

    Returns:
        The fitted `CrossValidatorModel` (sub-models are collected).
    """
    logistic_regression = LogisticRegression(featuresCol='std_features', labelCol='label')

    # Single-stage pipeline, so callers can address the model as `bestModel.stages[-1]`
    pipeline = Pipeline(stages=[logistic_regression])

    param_grid = ParamGridBuilder() \
        .addGrid(logistic_regression.regParam, [0.01, 0.1, 1.0]) \
        .addGrid(logistic_regression.maxIter, [10, 20, 50]) \
        .build()

    cross_validator = CrossValidator(estimator=pipeline,
                                     estimatorParamMaps=param_grid,
                                     evaluator=MulticlassClassificationEvaluator(),
                                     numFolds=5,
                                     seed=seed,
                                     collectSubModels=True
                                     )

    return cross_validator.fit(train)

In [None]:
# Cross-validate the logistic regression on the genre training split
cv_model_genre_lg = logistic_regression_pipeline(train=train_set_genre)

# Apply the cross-validated model to the genre test split
test_predictions_genre_lg = cv_model_genre_lg.transform(dataset=test_set_genre)

# Preview a few predictions next to the true labels
test_predictions_genre_lg.select(["std_features", "prediction", "label"]).show(n=5)

In [None]:
# Cross-validate the logistic regression on the popularity training split
cv_model_popularity_lg = logistic_regression_pipeline(train=train_set_popularity)

# Apply the cross-validated model to the popularity test split
test_predictions_popularity_lg = cv_model_popularity_lg.transform(dataset=test_set_popularity)

# Preview a few predictions next to the true labels
test_predictions_popularity_lg.select(["std_features", "prediction", "label"]).show(n=5)

+--------------------+----------+-----+
|        std_features|prediction|label|
+--------------------+----------+-----+
|[-1.5300269238464...|       0.0|  2.0|
|[-1.4664837755759...|       0.0|  2.0|
|[-1.3951774372453...|       3.0|  3.0|
|[-0.9391619025983...|       0.0|  0.0|
|[-0.4676199878310...|       3.0|  2.0|
+--------------------+----------+-----+
only showing top 5 rows



## Random Forest

In [None]:
def random_forest_pipeline(train, seed=42):
  """Tune a random forest classifier with 5-fold cross validation.

  The grid explores `maxDepth` in [3, 5, 8] and `numTrees` in [10, 50, 100];
  candidates are scored with a default `MulticlassClassificationEvaluator`.

  Args:
      train: training DataFrame with `std_features` and `label` columns.
      seed: seed for the forest and for the fold assignment, so that repeated
          runs give the same models and folds. Pass `None` to fall back to the
          library defaults.

  Returns:
      The fitted `CrossValidatorModel` (sub-models are collected).
  """
  random_forest = RandomForestClassifier(featuresCol="std_features", labelCol="label", seed=seed)

  # Single-stage pipeline, so callers can address the model as `bestModel.stages[-1]`
  pipeline = Pipeline(stages=[random_forest])

  param_grid = ParamGridBuilder() \
    .addGrid(random_forest.maxDepth, [3, 5, 8]) \
    .addGrid(random_forest.numTrees, [10, 50, 100]) \
    .build()

  cross_validator = CrossValidator(estimator=pipeline,
                                   estimatorParamMaps=param_grid,
                                   evaluator=MulticlassClassificationEvaluator(),
                                   numFolds=5,
                                   seed=seed,
                                   collectSubModels=True
                                   )

  return cross_validator.fit(train)

In [None]:
# Cross-validate the random forest on the genre training split
cv_model_genre_rf = random_forest_pipeline(train=train_set_genre)

# Apply the cross-validated model to the genre test split
test_predictions_genre_rf = cv_model_genre_rf.transform(dataset=test_set_genre)

# Preview a few predictions next to the true labels
test_predictions_genre_rf.select(["std_features", "prediction", "label"]).show(n=5)

In [None]:
# Cross-validate the random forest on the popularity training split
cv_model_popularity_rf = random_forest_pipeline(train=train_set_popularity)

# Apply the cross-validated model to the popularity test split
test_predictions_popularity_rf = cv_model_popularity_rf.transform(dataset=test_set_popularity)

# Preview a few predictions next to the true labels
test_predictions_popularity_rf.select(["std_features", "prediction", "label"]).show(n=5)

# Evaluation


In [None]:
def evaluate(cv_model, test_prediction):
  """Print cross-validation and test-set metrics for a fitted cross validator.

  Args:
      cv_model: fitted cross-validation model exposing `avgMetrics`,
          `bestModel.stages` and `getEvaluator()`.
      test_prediction: predictions DataFrame for the test set, in the format
          expected by the model's evaluator.

  Returns:
      None. Everything is written to standard output.
  """
  evaluator = cv_model.getEvaluator()

  # `avgMetrics` holds the metric the evaluator was configured with during
  # cross validation, so label the values with that metric's actual name.
  cv_metric_name = evaluator.getMetricName()
  for setting_number, avg_metric in enumerate(cv_model.avgMetrics, start=1):
      print("Avg. {} computed across k-fold cross validation for model setting #{:d}: {:.3f}".format(cv_metric_name, setting_number, avg_metric))

  # Last pipeline stage of the best model, i.e. the fitted classifier
  print(cv_model.bestModel.stages[-1])

  print("##### Test Set #####")
  # Re-use the same evaluator, overriding the metric per call
  print('Test F1-Score ',   evaluator.evaluate(test_prediction, {evaluator.metricName: 'f1'}))
  print('Test Precision ',  evaluator.evaluate(test_prediction, {evaluator.metricName: 'weightedPrecision'}))
  print('Test Recall ',     evaluator.evaluate(test_prediction, {evaluator.metricName: 'weightedRecall'}))
  print('Test Accuracy ',   evaluator.evaluate(test_prediction, {evaluator.metricName: 'accuracy'}))
  print("####################")

## Decision Tree

In [None]:
# Report the depth setting of the decision tree selected by cross validation
# (read through the public Param getter, not the private `_java_obj` handle),
# then print the cross-validation and test-set metrics.
print("Best model according to k-fold cross validation: maxDepth=[{:d}]".
      format(cv_model_genre_dt.bestModel.stages[-1].getMaxDepth()
             )
      )
evaluate(cv_model_genre_dt, test_predictions_genre_dt)

In [None]:
# Report the depth setting of the decision tree selected by cross validation
# (read through the public Param getter, not the private `_java_obj` handle),
# then print the cross-validation and test-set metrics.
print("Best model according to k-fold cross validation: maxDepth=[{:d}]".
      format(cv_model_popularity_dt.bestModel.stages[-1].getMaxDepth()
             )
      )
evaluate(cv_model_popularity_dt, test_predictions_popularity_dt)

## Logistic Regression

In [None]:
# Report the regularization settings of the logistic regression selected by
# cross validation (read through the public Param getters, not the private
# `_java_obj` handle), then print the cross-validation and test-set metrics.
print("Best model according to k-fold cross validation: lambda=[{:.3f}]; alpha=[{:.3f}]".
      format(cv_model_genre_lg.bestModel.stages[-1].getRegParam(),
             cv_model_genre_lg.bestModel.stages[-1].getElasticNetParam()
             )
      )

evaluate(cv_model_genre_lg, test_predictions_genre_lg)

In [None]:
# Report the regularization settings of the logistic regression selected by
# cross validation (read through the public Param getters, not the private
# `_java_obj` handle), then print the cross-validation and test-set metrics.
print("Best model according to k-fold cross validation: lambda=[{:.3f}]; alpha=[{:.3f}]".
      format(cv_model_popularity_lg.bestModel.stages[-1].getRegParam(),
             cv_model_popularity_lg.bestModel.stages[-1].getElasticNetParam()
             )
      )

evaluate(cv_model_popularity_lg, test_predictions_popularity_lg)

## Random Forest

In [None]:
# Report the depth setting of the random forest selected by cross validation
# (read through the public Param getter, not the private `_java_obj` handle),
# then print the cross-validation and test-set metrics.
print("Best model according to k-fold cross validation: maxDepth=[{:d}]".
      format(cv_model_genre_rf.bestModel.stages[-1].getMaxDepth()
             )
      )
evaluate(cv_model_genre_rf, test_predictions_genre_rf)

In [None]:
# Report the depth setting of the random forest selected by cross validation
# (read through the public Param getter, not the private `_java_obj` handle),
# then print the cross-validation and test-set metrics.
print("Best model according to k-fold cross validation: maxDepth=[{:d}]".
      format(cv_model_popularity_rf.bestModel.stages[-1].getMaxDepth()
             )
      )
evaluate(cv_model_popularity_rf, test_predictions_popularity_rf)