In [1]:
!pip install pyspark
# Alternatively, if you want to install a specific version of pyspark:
#!pip install pyspark==3.2.1 

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 29 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 42.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=7c679cbee5abd07f38f7710e0eaed1828cb51c2b291d75729a15e9d98d0244a1
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
from tqdm import tqdm

import requests
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator #, BinaryClassificationEvaluator 

# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Pre-processing phase
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Features Importance
from sklearn.inspection import permutation_importance

# Model
from sklearn import tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Hyper-Parameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import cross_val_score

# Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
# Create the session
conf = SparkConf().set("spark.ui.port", "4050").set('spark.executor.memory', '4G').set('spark.driver.memory', '45G').set('spark.driver.maxResultSize', '10G')

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [4]:
# Display the active SparkSession as the cell output
spark

In [5]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# Drive layout: mount point -> home directory -> project data folder
GDRIVE_DIR = "/content/gdrive"
GDRIVE_HOME_DIR = f"{GDRIVE_DIR}/My Drive"
GDRIVE_DATA_DIR = f"{GDRIVE_HOME_DIR}/Sapienza/Primo Anno/Big Data Computing/Project"

drive.mount(GDRIVE_DIR, force_remount=True)

# Remote CSV and the path of its copy on Drive (named after the last URL segment)
DATASET_URL = "https://raw.githubusercontent.com/AndreaBe99/big-data-project/main/data/dataframe.csv"
GDRIVE_DATASET_FILE = f"{GDRIVE_DATA_DIR}/{DATASET_URL.rsplit('/', 1)[-1]}"

Mounted at /content/gdrive


In [6]:
import requests

"""
This function downloads a file from a specific URL directly to Google Drive.
"""
def get_data(dataset_url, dest, chunk_size=1024, timeout=60):
  """Stream the file at `dataset_url` to the local path `dest`.

  Args:
    dataset_url: URL of the file to download.
    dest: Destination path; the file is opened in binary write mode.
    chunk_size: Number of bytes requested per streamed block.
    timeout: Seconds passed to `requests.get` as its timeout, so a stalled
      server cannot hang the notebook indefinitely.

  Raises:
    requests.HTTPError: If the server answers with an error status. The
      destination file is not created in that case, so the caller can no
      longer report a successful download after a failed request.
  """
  # The context manager closes the streamed connection even if writing fails.
  with requests.get(dataset_url, stream=True, timeout=timeout) as response:
    # Fail loudly instead of silently skipping the download.
    response.raise_for_status()
    with open(dest, "wb") as file:
      for block in response.iter_content(chunk_size=chunk_size):
        if block:  # skip empty blocks
          file.write(block)

In [7]:
# Download the CSV to Drive, logging before and after the transfer
print(f"Retrieving dataset from URL: {DATASET_URL} ...")
get_data(dataset_url=DATASET_URL, dest=GDRIVE_DATASET_FILE)
print(f"Dataset successfully retrieved and stored at: {GDRIVE_DATASET_FILE}")

Retrieving dataset from URL: https://raw.githubusercontent.com/AndreaBe99/big-data-project/main/data/dataframe.csv ...
Dataset successfully retrieved and stored at: /content/gdrive/My Drive/Sapienza/Primo Anno/Big Data Computing/Project/dataframe.csv


In [8]:
# Read the semicolon-separated CSV from Drive: the first row is the header and
# Spark infers the column types.
spotify_tracks = (
    spark.read
    .format("csv")
    .option("sep", ";")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(GDRIVE_DATASET_FILE)
)

### **Check the shape of the loaded dataset, i.e., number of rows and columns**

In [9]:
# Row count triggers a Spark job; the column count comes from the schema
n_rows = spotify_tracks.count()
n_cols = len(spotify_tracks.columns)
print(f"The shape of the dataset is {n_rows:d} rows by {n_cols:d} columns")

The shape of the dataset is 79339 rows by 25 columns


### **Print out the schema of the loaded dataset**

In [10]:
# Column names and the types Spark inferred while reading the CSV
spotify_tracks.printSchema()

root
 |-- id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_explicit: boolean (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- audio_avg_pitches: string (nullable = true)
 |-- audio_avg_timbre: string (nullable = true)
 |-- audio_acousticness: double (nullable = true)
 |-- audio_danceability: double (nullable = true)
 |-- audio_duration_ms: integer (nullable = true)
 |-- audio_energy: double (nullable = true)
 |-- audio_instrumentalness: double (nullable = true)
 |-- audio_key_1: integer (nullable = true)
 |-- audio_liveness: double (nullable = true)
 |-- audio_loudness: double (nullable = true)
 |-- audio_mode_1: integer (nullable = true)
 |-- audio_speechiness: double (nullable = true)
 |-- audio_tempo: double (nullable = true)
 |-- audio

In [11]:
# Preview the first 5 rows of the loaded dataset
spotify_tracks.show(5)

+--------------------+-----------------+--------------+----------------+--------------------+------------------+----------------------------+---------------+--------------------+--------------------+------------------+------------------+-----------------+------------+----------------------+-----------+--------------+--------------+------------+-----------------+-----------+--------------------+-------------+--------------------+-----------+
|                  id|       track_name|track_explicit|track_popularity|          album_name|album_release_date|album_release_date_precision|    artist_name|   audio_avg_pitches|    audio_avg_timbre|audio_acousticness|audio_danceability|audio_duration_ms|audio_energy|audio_instrumentalness|audio_key_1|audio_liveness|audio_loudness|audio_mode_1|audio_speechiness|audio_tempo|audio_time_signature|audio_valence|           track_uri|track_genre|
+--------------------+-----------------+--------------+----------------+--------------------+-----------------

In [12]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame
spotify_tracks_pd = spotify_tracks.toPandas()

# The two audio-average columns are read as strings; strip the surrounding
# brackets and split on ", " so each row holds a list of string tokens.
for list_column in ('audio_avg_pitches', 'audio_avg_timbre'):
  spotify_tracks_pd[list_column] = spotify_tracks_pd[list_column].str.strip('][').str.split(', ')

In [13]:
# Expand each per-row list into 12 float columns: pitch0..pitch11 and timbre0..timbre11
split_pitch = pd.DataFrame(
    spotify_tracks_pd['audio_avg_pitches'].tolist(),
    columns=[f"pitch{i}" for i in range(12)],
).astype(float)

split_timbre = pd.DataFrame(
    spotify_tracks_pd['audio_avg_timbre'].tolist(),
    columns=[f"timbre{i}" for i in range(12)],
).astype(float)

In [14]:
# Replace the two list columns with their expanded versions: the remaining
# original columns come first, then the pitch columns, then the timbre columns.
spotify_tracks_pd = pd.concat(
    [
        spotify_tracks_pd.drop(columns=['audio_avg_pitches', 'audio_avg_timbre']),
        split_pitch,
        split_timbre,
    ],
    axis=1,
)

spotify_tracks_pd

Unnamed: 0,id,track_name,track_explicit,track_popularity,album_name,album_release_date,album_release_date_precision,artist_name,audio_acousticness,audio_danceability,...,timbre2,timbre3,timbre4,timbre5,timbre6,timbre7,timbre8,timbre9,timbre10,timbre11
0,2dLLR6qlu5UJ5gk0dKz0h3,Royals,False,76,Pure Heroine,2013-09-27,day,Lorde,121.00,674.00,...,-39.662296,-14.931671,11.747787,-11.336286,-4.114898,-3.412426,3.759076,3.758684,-17.167142,-9.604632
1,7AYoXqCtME90flUOpBJM7i,Society,False,0,Music For The Motion Picture Into The Wild,2007-12-17,day,Eddie Vedder,896.00,0.64,...,-35.216448,-16.005859,6.064246,-13.867221,-9.606810,-11.765366,-13.161886,-5.430175,-7.123225,4.989060
2,2bbhyUWJ5VjdfI3P4PRLu2,Samson,False,47,Begin to Hope (Special Edition),2006-06-13,day,Regina Spektor,862.00,0.49,...,3.536783,-4.610718,42.023948,-7.751785,-14.671814,-2.853136,-1.024664,-9.525564,-13.040564,1.721167
3,7zkLpY72g6lKQbiHDqri1S,Sunrise,False,69,Feels Like Home,2004-01-01,day,Norah Jones,941.00,526.00,...,-58.488764,-21.646289,31.252726,-28.576009,-9.370972,-4.624142,0.113758,1.522586,-8.494612,3.642050
4,7z7fquRFQFXt4Dj7ouWETq,Hey There Delilah,False,0,Hey There Delilah,2007-01-01,day,Plain White T's,0.84,661.00,...,-38.608739,-31.423893,29.068654,-21.289885,-6.442195,-4.644703,-7.456635,-2.027032,-11.075258,5.339601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79334,3r9K3hDcw6LCQoRkw64toA,Ha Po Zamani,False,0,Pata Pata,1967,year,Miriam Makeba,633.00,538.00,...,18.841716,-7.507701,31.243190,-11.973495,-4.551287,4.127084,-3.795970,0.416921,-9.907186,3.189979
79335,3GoRMfxfqG7M4AD2GPgFe9,Buena suerte,False,0,Revolución,2005-05-03,day,Raul Paz,115.00,784.00,...,-10.090846,0.289405,38.133045,-18.138713,-9.160653,4.633498,-4.102503,7.200279,-12.955741,-6.845653
79336,0C6rRCFz4F0b2W67KzjlXj,Farha,False,0,Electric Sufi,2001-01-01,day,Dhafer Youssef,785.00,544.00,...,-32.158389,15.119602,34.528068,-27.151201,-0.161955,-7.179403,-4.860860,3.993591,-10.775201,0.268727
79337,1wkZeRlDIBAY7AGluKpvyA,Menina,False,0,Modern World Music,2013-10-01,day,Waldemar Bastos,563.00,679.00,...,-45.666430,-11.716759,3.461417,-22.864194,-12.699123,0.802199,-6.022625,-0.135679,-8.163775,7.866830


In [15]:
# Parse the release date once, then swap the raw column for its
# year / month / day components (appended in that order).
release_dates = pd.to_datetime(spotify_tracks_pd['album_release_date'])

spotify_tracks_pd = (
    spotify_tracks_pd
    .drop(columns='album_release_date')
    .assign(
        year=release_dates.dt.year,
        month=release_dates.dt.month,
        day=release_dates.dt.day,
    )
)

spotify_tracks_pd

Unnamed: 0,id,track_name,track_explicit,track_popularity,album_name,album_release_date_precision,artist_name,audio_acousticness,audio_danceability,audio_duration_ms,...,timbre5,timbre6,timbre7,timbre8,timbre9,timbre10,timbre11,year,month,day
0,2dLLR6qlu5UJ5gk0dKz0h3,Royals,False,76,Pure Heroine,day,Lorde,121.00,674.00,190185,...,-11.336286,-4.114898,-3.412426,3.759076,3.758684,-17.167142,-9.604632,2013,9,27
1,7AYoXqCtME90flUOpBJM7i,Society,False,0,Music For The Motion Picture Into The Wild,day,Eddie Vedder,896.00,0.64,236307,...,-13.867221,-9.606810,-11.765366,-13.161886,-5.430175,-7.123225,4.989060,2007,12,17
2,2bbhyUWJ5VjdfI3P4PRLu2,Samson,False,47,Begin to Hope (Special Edition),day,Regina Spektor,862.00,0.49,189507,...,-7.751785,-14.671814,-2.853136,-1.024664,-9.525564,-13.040564,1.721167,2006,6,13
3,7zkLpY72g6lKQbiHDqri1S,Sunrise,False,69,Feels Like Home,day,Norah Jones,941.00,526.00,200627,...,-28.576009,-9.370972,-4.624142,0.113758,1.522586,-8.494612,3.642050,2004,1,1
4,7z7fquRFQFXt4Dj7ouWETq,Hey There Delilah,False,0,Hey There Delilah,day,Plain White T's,0.84,661.00,233627,...,-21.289885,-6.442195,-4.644703,-7.456635,-2.027032,-11.075258,5.339601,2007,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79334,3r9K3hDcw6LCQoRkw64toA,Ha Po Zamani,False,0,Pata Pata,year,Miriam Makeba,633.00,538.00,177707,...,-11.973495,-4.551287,4.127084,-3.795970,0.416921,-9.907186,3.189979,1967,1,1
79335,3GoRMfxfqG7M4AD2GPgFe9,Buena suerte,False,0,Revolución,day,Raul Paz,115.00,784.00,219333,...,-18.138713,-9.160653,4.633498,-4.102503,7.200279,-12.955741,-6.845653,2005,5,3
79336,0C6rRCFz4F0b2W67KzjlXj,Farha,False,0,Electric Sufi,day,Dhafer Youssef,785.00,544.00,334042,...,-27.151201,-0.161955,-7.179403,-4.860860,3.993591,-10.775201,0.268727,2001,1,1
79337,1wkZeRlDIBAY7AGluKpvyA,Menina,False,0,Modern World Music,day,Waldemar Bastos,563.00,679.00,238267,...,-22.864194,-12.699123,0.802199,-6.022625,-0.135679,-8.163775,7.866830,2013,10,1


In [16]:
# Cast the boolean `track_explicit` flag to an integer (False -> 0, True -> 1)
spotify_tracks_pd = spotify_tracks_pd.astype({"track_explicit": int})
spotify_tracks_pd

Unnamed: 0,id,track_name,track_explicit,track_popularity,album_name,album_release_date_precision,artist_name,audio_acousticness,audio_danceability,audio_duration_ms,...,timbre5,timbre6,timbre7,timbre8,timbre9,timbre10,timbre11,year,month,day
0,2dLLR6qlu5UJ5gk0dKz0h3,Royals,0,76,Pure Heroine,day,Lorde,121.00,674.00,190185,...,-11.336286,-4.114898,-3.412426,3.759076,3.758684,-17.167142,-9.604632,2013,9,27
1,7AYoXqCtME90flUOpBJM7i,Society,0,0,Music For The Motion Picture Into The Wild,day,Eddie Vedder,896.00,0.64,236307,...,-13.867221,-9.606810,-11.765366,-13.161886,-5.430175,-7.123225,4.989060,2007,12,17
2,2bbhyUWJ5VjdfI3P4PRLu2,Samson,0,47,Begin to Hope (Special Edition),day,Regina Spektor,862.00,0.49,189507,...,-7.751785,-14.671814,-2.853136,-1.024664,-9.525564,-13.040564,1.721167,2006,6,13
3,7zkLpY72g6lKQbiHDqri1S,Sunrise,0,69,Feels Like Home,day,Norah Jones,941.00,526.00,200627,...,-28.576009,-9.370972,-4.624142,0.113758,1.522586,-8.494612,3.642050,2004,1,1
4,7z7fquRFQFXt4Dj7ouWETq,Hey There Delilah,0,0,Hey There Delilah,day,Plain White T's,0.84,661.00,233627,...,-21.289885,-6.442195,-4.644703,-7.456635,-2.027032,-11.075258,5.339601,2007,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79334,3r9K3hDcw6LCQoRkw64toA,Ha Po Zamani,0,0,Pata Pata,year,Miriam Makeba,633.00,538.00,177707,...,-11.973495,-4.551287,4.127084,-3.795970,0.416921,-9.907186,3.189979,1967,1,1
79335,3GoRMfxfqG7M4AD2GPgFe9,Buena suerte,0,0,Revolución,day,Raul Paz,115.00,784.00,219333,...,-18.138713,-9.160653,4.633498,-4.102503,7.200279,-12.955741,-6.845653,2005,5,3
79336,0C6rRCFz4F0b2W67KzjlXj,Farha,0,0,Electric Sufi,day,Dhafer Youssef,785.00,544.00,334042,...,-27.151201,-0.161955,-7.179403,-4.860860,3.993591,-10.775201,0.268727,2001,1,1
79337,1wkZeRlDIBAY7AGluKpvyA,Menina,0,0,Modern World Music,day,Waldemar Bastos,563.00,679.00,238267,...,-22.864194,-12.699123,0.802199,-6.022625,-0.135679,-8.163775,7.866830,2013,10,1


In [17]:
# Drop the two identifier columns in a single call
spotify_tracks_pd = spotify_tracks_pd.drop(columns=['track_uri', 'id'])

In [18]:
# Column dtypes and non-null counts after the pandas preprocessing above
spotify_tracks_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79339 entries, 0 to 79338
Data columns (total 47 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   track_name                    79339 non-null  object 
 1   track_explicit                79339 non-null  int64  
 2   track_popularity              79339 non-null  int32  
 3   album_name                    79339 non-null  object 
 4   album_release_date_precision  79339 non-null  object 
 5   artist_name                   79339 non-null  object 
 6   audio_acousticness            79339 non-null  float64
 7   audio_danceability            79339 non-null  float64
 8   audio_duration_ms             79339 non-null  int32  
 9   audio_energy                  79339 non-null  float64
 10  audio_instrumentalness        79339 non-null  float64
 11  audio_key_1                   79339 non-null  int32  
 12  audio_liveness                79339 non-null  float64
 13  a

In [19]:
# Let's define some constants which we will use throughout this notebook
TARGET_VARIABLE = "track_genre"

assert TARGET_VARIABLE in spotify_tracks_pd.columns, "target column is missing from the dataset"

# `object` columns are treated as categorical, every other dtype as numerical.
# The target is excluded from both lists. Comprehensions are used rather than a
# `for col in ...` loop so that no module-level variable named `col` is left
# behind to overwrite the `col` function brought in by
# `from pyspark.sql.functions import *`.
CATEGORICAL_FEATURES = [
    name for name in spotify_tracks_pd.columns
    if name != TARGET_VARIABLE and spotify_tracks_pd.dtypes[name] == "object"
]
NUMERICAL_FEATURES = [
    name for name in spotify_tracks_pd.columns
    if name != TARGET_VARIABLE and spotify_tracks_pd.dtypes[name] != "object"
]

print("Categorical: ", CATEGORICAL_FEATURES, "\nNumerical: ", NUMERICAL_FEATURES)

Categorical:  ['track_name', 'album_name', 'album_release_date_precision', 'artist_name'] 
Numerical:  ['track_explicit', 'track_popularity', 'audio_acousticness', 'audio_danceability', 'audio_duration_ms', 'audio_energy', 'audio_instrumentalness', 'audio_key_1', 'audio_liveness', 'audio_loudness', 'audio_mode_1', 'audio_speechiness', 'audio_tempo', 'audio_time_signature', 'audio_valence', 'pitch0', 'pitch1', 'pitch2', 'pitch3', 'pitch4', 'pitch5', 'pitch6', 'pitch7', 'pitch8', 'pitch9', 'pitch10', 'pitch11', 'timbre0', 'timbre1', 'timbre2', 'timbre3', 'timbre4', 'timbre5', 'timbre6', 'timbre7', 'timbre8', 'timbre9', 'timbre10', 'timbre11', 'year', 'month', 'day']


In [20]:
# Convert the preprocessed pandas DataFrame back to Spark. This rebinds
# `spotify_tracks`, replacing the DataFrame that was loaded from the CSV.
spotify_tracks = spark.createDataFrame(spotify_tracks_pd)

In [21]:
# Schema of the Spark DataFrame rebuilt from the pandas DataFrame
spotify_tracks.printSchema()

root
 |-- track_name: string (nullable = true)
 |-- track_explicit: long (nullable = true)
 |-- track_popularity: long (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- audio_acousticness: double (nullable = true)
 |-- audio_danceability: double (nullable = true)
 |-- audio_duration_ms: long (nullable = true)
 |-- audio_energy: double (nullable = true)
 |-- audio_instrumentalness: double (nullable = true)
 |-- audio_key_1: long (nullable = true)
 |-- audio_liveness: double (nullable = true)
 |-- audio_loudness: double (nullable = true)
 |-- audio_mode_1: long (nullable = true)
 |-- audio_speechiness: double (nullable = true)
 |-- audio_tempo: double (nullable = true)
 |-- audio_time_signature: long (nullable = true)
 |-- audio_valence: double (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- pitch0: double (nullable = true)
 |-- pitch1: double (nullable 

In [22]:
# Value counts of each categorical column, one table per column
for column_name in ('album_release_date_precision', 'track_name', 'album_name', 'artist_name'):
  spotify_tracks.groupBy(column_name).count().show()

+----------------------------+-----+
|album_release_date_precision|count|
+----------------------------+-----+
|                         day|68587|
|                       month|  101|
|                        year|10651|
+----------------------------+-----+

+--------------------+-----+
|          track_name|count|
+--------------------+-----+
|           On My Way|    5|
|            The City|    3|
|           Yori Yori|    2|
|     El Son te Llama|    1|
|Ginger (feat. Wiz...|    1|
|              Heaven|   13|
|         Sabu Yerkoy|    1|
|    Sina (Soumbouya)|    1|
|               Monie|    1|
|I Don't Know You ...|    1|
|Chloe Dancer/Crow...|    3|
|             Banquet|    1|
| Gold on the Ceiling|    1|
|        Miracle Mile|    1|
|Always Where I Ne...|    1|
|Everything You Do...|    1|
| Power of Persuasion|    1|
|              Cirrus|    1|
|            One Half|    1|
|             blossom|    1|
+--------------------+-----+
only showing top 20 rows

+-----------------

In [35]:
# Shared preprocessing stages, reused by the model pipelines below.

# 1. Index the target column into `label`
stage_1 = StringIndexer(inputCol=TARGET_VARIABLE, outputCol='label')

# 2. Index every categorical feature into `<name>_index` (handleInvalid="keep")
stage_2 = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index", handleInvalid="keep")
    for name in CATEGORICAL_FEATURES
]

# 3. (disabled) one-hot encoding of `album_release_date_precision_index`

# 4. Assemble the indexed categorical columns and the numerical columns into one vector
indexed_columns = [indexer.getOutputCol() for indexer in stage_2]
stage_4 = VectorAssembler(inputCols=indexed_columns + NUMERICAL_FEATURES, outputCol='features')

# 4.a Standardize the assembled vector into `std_features`
scaler = StandardScaler(
    inputCol=stage_4.getOutputCol(),
    outputCol="std_" + stage_4.getOutputCol(),
    withStd=True,
    withMean=True,
)

stages = [stage_1, *stage_2, stage_4, scaler]

In [36]:
#### LOGISTIC REGRESSION

# define stage 5: logistic regression model trained on the standardized features
stage_5_lg = LogisticRegression(featuresCol='std_features', labelCol='label')

# setup the pipeline: shared preprocessing stages + classifier
logistic_regression_pipeline = Pipeline(stages=stages + [stage_5_lg])

# 3 x 3 grid over regularization strength and iteration budget
# other param: .addGrid(stage_5_lg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
param_grid = (
    ParamGridBuilder()
    .addGrid(stage_5_lg.regParam, [0.01, 0.1, 1.0])
    .addGrid(stage_5_lg.maxIter, [10, 20, 50])
    .build()
)

# collectSubModels is left at its default (False): with 9 parameter maps x 5 folds
# it would keep 45 fitted pipelines on the driver, none of which is used below.
cross_val_lg = CrossValidator(estimator=logistic_regression_pipeline,
                              estimatorParamMaps=param_grid,
                              evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),  # multiclass accuracy
                              numFolds=5
                              )
cv_model_lg = cross_val_lg.fit(spotify_tracks)

# transform the data with the best model found by cross-validation
final_spotify_tracks_lg = cv_model_lg.transform(spotify_tracks)

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 33950)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/py4j/clientserver.py", line 480, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.7/dist-packages/py4j/clientserver.py", line 504, in send_command
    "Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recent call last):
  File "/usr/lib/python3.7/socketserver.py", line 316, in _handle_request

Py4JError: ignored

In [None]:
#### DECISION TREE

# define stage 5: decision tree model trained on the standardized features
stage_5_dr = DecisionTreeClassifier(featuresCol='std_features', labelCol='label')

# setup the pipeline: shared preprocessing stages + classifier
decision_tree_pipeline = Pipeline(stages=stages + [stage_5_dr])

# 3 x 2 grid over tree depth and impurity measure
param_grid = (
    ParamGridBuilder()
    .addGrid(stage_5_dr.maxDepth, [3, 5, 8])
    .addGrid(stage_5_dr.impurity, ["gini", "entropy"])
    .build()
)

# collectSubModels is left at its default (False): with 6 parameter maps x 5 folds
# it would keep 30 fitted pipelines on the driver, none of which is used below.
cross_val_dt = CrossValidator(estimator=decision_tree_pipeline,
                              estimatorParamMaps=param_grid,
                              evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),  # multiclass accuracy
                              numFolds=5
                              )
cv_model_dt = cross_val_dt.fit(spotify_tracks)

# transform the data with the best model found by cross-validation
final_spotify_tracks_dt = cv_model_dt.transform(spotify_tracks)

In [None]:
#### RANDOM FOREST

# The imports cell brings in sklearn's RandomForestClassifier after Spark's, so the
# bare name refers to the sklearn class there. Import the Spark ML estimator under
# an explicit alias so this cell always uses the right one.
from pyspark.ml.classification import RandomForestClassifier as SparkRandomForestClassifier

# define stage 5: random forest model trained on the standardized features
stage_5_rf = SparkRandomForestClassifier(featuresCol="std_features", labelCol="label")

# setup the pipeline: shared preprocessing stages + classifier
random_forest_pipeline = Pipeline(stages=stages + [stage_5_rf])

# 3 x 3 grid over tree depth and forest size
param_grid = (
    ParamGridBuilder()
    .addGrid(stage_5_rf.maxDepth, [3, 5, 8])
    .addGrid(stage_5_rf.numTrees, [10, 50, 100])
    .build()
)

# collectSubModels is left at its default (False): with 9 parameter maps x 5 folds
# it would keep 45 fitted pipelines on the driver, none of which is used below.
cross_val_rf = CrossValidator(estimator=random_forest_pipeline,
                              estimatorParamMaps=param_grid,
                              evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),  # multiclass accuracy
                              numFolds=5
                              )
cv_model_rf = cross_val_rf.fit(spotify_tracks)

# transform the data with the best model found by cross-validation
final_spotify_tracks_rf = cv_model_rf.transform(spotify_tracks)

In [None]:
# The target indexer writes its output to `label`; no stage creates `track_genre_index`
final_spotify_tracks_lg.select("features", "prediction", "label").show(5)

In [None]:
# The target indexer writes its output to `label`; no stage creates `track_genre_index`
final_spotify_tracks_dt.select("features", "prediction", "label").show(5)

In [None]:
# The target indexer writes its output to `label`; no stage creates `track_genre_index`
final_spotify_tracks_rf.select("features", "prediction", "label").show(5)

# Encode and Train

In [None]:
# Encodes the target and categorical columns and assembles/scales the feature vector
def to_numerical(df, numerical_features, categorical_features, target_variable):
  """Fit the preprocessing pipeline on `df` and return `df` with the encoded columns added.

  Pipeline stages, in order:
    1. StringIndexer: `target_variable` -> `label`
    2. One StringIndexer per categorical feature: `<name>` -> `<name>_index`
       (handleInvalid="keep")
    3. VectorAssembler: indexed categorical columns + `numerical_features` -> `features`
    4. StandardScaler (withMean and withStd): `features` -> `std_features`

  Only the transformed DataFrame is returned; the fitted pipeline model is not.
  """
  target_indexer = StringIndexer(inputCol=target_variable, outputCol='label')

  feature_indexers = [
      StringIndexer(inputCol=name, outputCol=f"{name}_index", handleInvalid="keep")
      for name in categorical_features
  ]

  assembler = VectorAssembler(
      inputCols=[indexer.getOutputCol() for indexer in feature_indexers] + numerical_features,
      outputCol='features',
  )

  standardizer = StandardScaler(
      inputCol=assembler.getOutputCol(),
      outputCol="std_" + assembler.getOutputCol(),
      withStd=True,
      withMean=True,
  )

  pipeline = Pipeline(stages=[target_indexer, *feature_indexers, assembler, standardizer])

  # Fit and apply on the same DataFrame
  return pipeline.fit(df).transform(df)

In [None]:
# Encode the dataset and get back the transformed DataFrame
spotify_tracks_encoded = to_numerical(spotify_tracks, NUMERICAL_FEATURES, CATEGORICAL_FEATURES, TARGET_VARIABLE)

# Keep only the model inputs. The standardized vector built by `to_numerical`
# (the same `std_features` input the earlier pipelines in this notebook train on)
# is exposed under the name `features`, so the model functions below can keep
# using featuresCol='features' instead of silently ignoring the scaler's output.
spotify_tracks = spotify_tracks_encoded.select(
    spotify_tracks_encoded["std_features"].alias("features"),
    "label",
)

RANDOM_SEED = 42
# Randomly split the encoded dataset 80/20 into training and test sets
train_set, test_set = spotify_tracks.randomSplit([0.8, 0.2], seed=RANDOM_SEED)

# train.show(5, truncate=False)

In [None]:
# This function cross-validates a logistic regression model on the given training set
def logistic_regression_pipeline(train):
  """Grid-search a LogisticRegression with 5-fold cross-validation on `train`.

  Args:
    train: Spark DataFrame with a `features` vector column and a `label` column.

  Returns:
    The fitted CrossValidatorModel, selected by multiclass accuracy over a
    3 x 3 grid of `regParam` and `maxIter`.
  """
  classifier = LogisticRegression(featuresCol='features', labelCol='label')

  pipeline = Pipeline(stages=[classifier])

  # other param: .addGrid(classifier.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
  param_grid = (
      ParamGridBuilder()
      .addGrid(classifier.regParam, [0.01, 0.1, 1.0])
      .addGrid(classifier.maxIter, [10, 20, 50])
      .build()
  )

  cross_validator = CrossValidator(estimator=pipeline,
                                   estimatorParamMaps=param_grid,
                                   evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),
                                   numFolds=5,
                                   collectSubModels=True
                                   )

  # Fit on the DataFrame passed in, not on the notebook-level `spotify_tracks`:
  # rows the caller held out for testing must not take part in model selection.
  return cross_validator.fit(train)

In [None]:
# Cross-validate on the training split; `cv_model` holds the best model found
cv_model = logistic_regression_pipeline(train=train_set)

# Score the held-out test split with that model
test_predictions = cv_model.transform(test_set)

# Preview a few predictions next to the true labels
test_predictions.select(["features", "prediction", "label"]).show(n=5)

In [None]:
# This function cross-validates a decision tree model on the given training set
def decision_tree_pipeline(train):
  """Grid-search a DecisionTreeClassifier with 5-fold cross-validation on `train`.

  Args:
    train: Spark DataFrame with a `features` vector column and a `label` column.

  Returns:
    The fitted CrossValidatorModel, selected by multiclass accuracy over a
    3 x 2 grid of `maxDepth` and `impurity`.
  """
  # Must be a decision tree: LogisticRegression has no `maxDepth` / `impurity`
  # params, so building the grid below would fail with it.
  classifier = DecisionTreeClassifier(featuresCol='features', labelCol='label')

  pipeline = Pipeline(stages=[classifier])

  param_grid = (
      ParamGridBuilder()
      .addGrid(classifier.maxDepth, [3, 5, 8])
      .addGrid(classifier.impurity, ["gini", "entropy"])
      .build()
  )

  cross_validator = CrossValidator(estimator=pipeline,
                                   estimatorParamMaps=param_grid,
                                   evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),
                                   numFolds=5,
                                   collectSubModels=True
                                   )

  # Fit on the DataFrame passed in, not on the notebook-level `spotify_tracks`:
  # rows the caller held out for testing must not take part in model selection.
  return cross_validator.fit(train)

In [None]:
# Cross-validate on the training split; `cv_model` holds the best model found
cv_model = decision_tree_pipeline(train=train_set)

# Score the held-out test split with that model
test_predictions = cv_model.transform(test_set)

# Preview a few predictions next to the true labels
test_predictions.select(["features", "prediction", "label"]).show(n=5)

In [None]:
# This function cross-validates a random forest model on the given training set
def random_forest_pipeline(train):
  """Grid-search a Spark RandomForestClassifier with 5-fold cross-validation on `train`.

  Args:
    train: Spark DataFrame with a `features` vector column and a `label` column.

  Returns:
    The fitted CrossValidatorModel, selected by multiclass accuracy over a
    3 x 3 grid of `maxDepth` and `numTrees`.
  """
  # The notebook's imports cell brings in sklearn's RandomForestClassifier after
  # Spark's, so the bare name refers to the sklearn class. Import the Spark ML
  # estimator explicitly under an alias.
  from pyspark.ml.classification import RandomForestClassifier as SparkRandomForestClassifier

  classifier = SparkRandomForestClassifier(featuresCol="features", labelCol="label")

  pipeline = Pipeline(stages=[classifier])

  param_grid = (
      ParamGridBuilder()
      .addGrid(classifier.maxDepth, [3, 5, 8])
      .addGrid(classifier.numTrees, [10, 50, 100])
      .build()
  )

  cross_validator = CrossValidator(estimator=pipeline,
                                   estimatorParamMaps=param_grid,
                                   evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),
                                   numFolds=5,
                                   collectSubModels=True
                                   )

  # Fit on the DataFrame passed in, not on the notebook-level `spotify_tracks`:
  # rows the caller held out for testing must not take part in model selection.
  return cross_validator.fit(train)

In [None]:
# Cross-validate on the training split; `cv_model` holds the best model found
cv_model = random_forest_pipeline(train=train_set)

# Score the held-out test split with that model
test_predictions = cv_model.transform(test_set)

# Preview a few predictions next to the true labels
test_predictions.select(["features", "prediction", "label"]).show(n=5)

In [None]:
# view some of the columns generated
# final_spotify_tracks.select('features', 'track_genre_index', 'rawPrediction', 'probability', 'prediction').take(10)