In [1]:
# Install PySpark into the notebook runtime. No version is pinned here, so pip
# resolves whichever release is current when the cell runs.
!pip install pyspark
# Alternatively, if you want to install a specific version of pyspark:
#!pip install pyspark==3.2.1



In [2]:
from tqdm import tqdm

import requests

# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Pre-processing phase
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Features Importance
from sklearn.inspection import permutation_importance

# Model
from sklearn import tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Hyper-Parameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import cross_val_score

# Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Spark imports come LAST on purpose: sklearn also exports `OneHotEncoder` and
# `RandomForestClassifier`, and whichever import runs last owns the name. Every
# use of those two names in this notebook passes Spark keywords
# (inputCols/outputCols, featuresCol/labelCol), so the Spark classes must win.
# If the sklearn versions are ever needed, import them under an alias.
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator #, BinaryClassificationEvaluator 

In [3]:
# Spark configuration: UI port plus executor/driver memory limits.
conf = (
    SparkConf()
    .set("spark.ui.port", "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
)

# Create the context, or reuse the one that is already running. Calling the
# SparkContext constructor a second time (e.g. when this cell is re-run) raises
# because only one context may be active, so `getOrCreate` keeps the cell
# idempotent.
sc = SparkContext.getOrCreate(conf=conf)

# Create (or reuse) the session on top of that context with the same configuration.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
# Display the SparkSession object created in the previous cell.
spark

In [5]:
# Locations on Google Drive: mount point, the user's home folder, and the
# project folder the dataset is stored in.
GDRIVE_DIR = "/content/gdrive" # Your own mount point on Google Drive
GDRIVE_HOME_DIR = GDRIVE_DIR + "/My Drive" # Your own home directory
GDRIVE_DATA_DIR = GDRIVE_HOME_DIR +  "/Sapienza/Primo Anno/Big Data Computing/Project"

# Point Colaboratory to our Google Drive
from google.colab import drive

# force_remount=True remounts even if the drive is already mounted.
drive.mount(GDRIVE_DIR, force_remount=True)

# Source URL of the dataset; the local file keeps the last path component of
# the URL ("dataframe.csv") as its name inside GDRIVE_DATA_DIR.
DATASET_URL = "https://raw.githubusercontent.com/AndreaBe99/big-data-project/main/data/dataframe.csv"
GDRIVE_DATASET_FILE = GDRIVE_DATA_DIR + "/" + DATASET_URL.split("/")[-1]

Mounted at /content/gdrive


In [6]:
import requests


def get_data(dataset_url, dest, chunk_size=1024, timeout=60):
  """Download a file from a URL and write it to `dest` (e.g. a Google Drive path).

  The body is streamed in chunks so the whole file never has to fit in memory.

  Args:
    dataset_url: URL of the file to download.
    dest: path of the local file to (over)write, opened in binary mode.
    chunk_size: number of bytes requested per streamed chunk.
    timeout: seconds to wait for the connection and between received bytes
      before giving up (passed straight to `requests.get`).

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status. Previously
      a failed request was silently ignored, so callers reported success even
      though no file had been written.
    requests.RequestException: on connection problems or timeouts.
  """
  # The context manager releases the connection even if writing fails midway.
  with requests.get(dataset_url, stream=True, timeout=timeout) as response:
    response.raise_for_status()
    with open(dest, "wb") as file:
      for block in response.iter_content(chunk_size=chunk_size):
        if block:  # skip keep-alive chunks, which arrive empty
          file.write(block)

In [7]:
# Download the dataset from DATASET_URL into the Google Drive project folder.
# The second message is printed as soon as `get_data` returns.
print("Retrieving dataset from URL: {} ...".format(DATASET_URL))
get_data(DATASET_URL, GDRIVE_DATASET_FILE)
print("Dataset successfully retrieved and stored at: {}".format(GDRIVE_DATASET_FILE))

Retrieving dataset from URL: https://raw.githubusercontent.com/AndreaBe99/big-data-project/main/data/dataframe.csv ...


KeyboardInterrupt: ignored

In [None]:
# Read the downloaded CSV into a Spark DataFrame: semicolon-separated, first
# line is the header, column types inferred from the data.
spotify_tracks = (
    spark.read
    .format("csv")
    .options(sep=";", inferSchema="true", header="true")
    .load(GDRIVE_DATASET_FILE)
)

### **Check the shape of the loaded dataset, i.e., number of rows and columns**

In [None]:
# Row count triggers a Spark job; column count is read from the schema.
row_count = spotify_tracks.count()
column_count = len(spotify_tracks.columns)
print("The shape of the dataset is {:d} rows by {:d} columns".format(row_count, column_count))

### **Print out the schema of the loaded dataset**

In [None]:
# Print column names and the types inferred while reading the CSV.
spotify_tracks.printSchema()

In [None]:
# Preview the first 5 rows.
spotify_tracks.show(5)

In [None]:
# Collect the Spark DataFrame into pandas on the driver.
spotify_tracks_pd = spotify_tracks.toPandas()

# Both columns hold a bracketed, comma-separated string per row: strip the
# surrounding brackets and split on ', ' to get a list of string tokens.
for array_column in ('audio_avg_pitches', 'audio_avg_timbre'):
  bracketless = spotify_tracks_pd[array_column].str.strip('][')
  spotify_tracks_pd[array_column] = bracketless.str.split(', ')

In [None]:
# Expand each per-row token list into 12 float columns (pitch0..pitch11 and
# timbre0..timbre11).
component_ids = range(12)
pitch_columns = ["pitch" + str(i) for i in component_ids]
timbre_columns = ["timbre" + str(i) for i in component_ids]

pitch_rows = spotify_tracks_pd['audio_avg_pitches'].tolist()
split_pitch = pd.DataFrame(pitch_rows, columns=pitch_columns).astype(float)

timbre_rows = spotify_tracks_pd['audio_avg_timbre'].tolist()
split_timbre = pd.DataFrame(timbre_rows, columns=timbre_columns).astype(float)

In [None]:
# Replace the two list-valued columns with their expanded numeric columns:
# the remaining original columns come first, then pitch*, then timbre*.
without_array_columns = spotify_tracks_pd.drop(columns=['audio_avg_pitches', 'audio_avg_timbre'])
spotify_tracks_pd = pd.concat([without_array_columns, split_pitch, split_timbre], axis=1)

spotify_tracks_pd

In [None]:
# Break the release date into numeric year / month / day columns (appended at
# the end) and drop the original date column.
release_dates = pd.to_datetime(spotify_tracks_pd['album_release_date'])
spotify_tracks_pd = (
    spotify_tracks_pd
    .drop(columns='album_release_date')
    .assign(
        year=release_dates.dt.year,
        month=release_dates.dt.month,
        day=release_dates.dt.day,
    )
)

spotify_tracks_pd

In [None]:
# Cast the explicit flag to an integer column.
spotify_tracks_pd = spotify_tracks_pd.astype({"track_explicit": int})
spotify_tracks_pd

In [None]:
# Remove the two identifier columns in a single call.
spotify_tracks_pd = spotify_tracks_pd.drop(columns=['track_uri', 'id'])

In [None]:
# Summarise the pandas frame: column dtypes, non-null counts and memory usage.
spotify_tracks_pd.info()

In [None]:
# Let's define some constants which we will use throughout this notebook
TARGET_VARIABLE = "track_genre"
NUMERICAL_FEATURES = []
CATEGORICAL_FEATURES = []

# Object-typed columns are treated as categorical, everything else as numerical.
# The target is skipped up front so it can never end up in either feature list,
# whatever its dtype is (removing it from the categorical list afterwards would
# raise if it were not object-typed, and would leave it among the numerical
# features).
# The loop variable is deliberately NOT called `col`: that name is bound to
# `pyspark.sql.functions.col` by the star import at the top of the notebook.
for column_name in spotify_tracks_pd.columns:
  if column_name == TARGET_VARIABLE:
    continue
  if spotify_tracks_pd.dtypes[column_name] == "object":
    CATEGORICAL_FEATURES.append(column_name)
  else:
    NUMERICAL_FEATURES.append(column_name)

print("Categorical: ", CATEGORICAL_FEATURES, "\nNumerical: ", NUMERICAL_FEATURES)

In [None]:
# Convert the cleaned pandas frame back to a Spark DataFrame. This rebinds
# `spotify_tracks`, which until now referred to the frame read from the CSV.
spotify_tracks = spark.createDataFrame(spotify_tracks_pd)

In [None]:
# Print the schema of the rebuilt Spark DataFrame.
spotify_tracks.printSchema()

In [None]:
# Value counts of each string-valued column, shown one table after another.
for counted_column in ('album_release_date_precision', 'track_name', 'album_name', 'artist_name'):
  spotify_tracks.groupBy(counted_column).count().show()

In [None]:
RANDOM_SEED = 42
SPLIT_WEIGHTS = [0.8, 0.2]  # training share, test share

# Randomly split `spotify_tracks` into training and test sets; the fixed seed
# is passed so the split can be reproduced.
train_set, test_set = spotify_tracks.randomSplit(SPLIT_WEIGHTS, seed=RANDOM_SEED)

In [None]:
def to_numerical(df, numerical_features, categorical_features, target_variable, return_transformer=False):
  """Fit the feature-encoding pipeline on `df` and return `df` transformed by it.

  Stages, in order:
    1. index the target column into `label`;
    2. index every categorical column into `<column>_index` (values unseen at
       fit time get their own index because of handleInvalid="keep");
    3. one-hot encode `album_release_date_precision_index`;
    4. assemble the encoded and numerical columns into the `features` vector;
    5. standardise `features` into `std_features` (centred, unit std).

  Args:
    df: Spark DataFrame holding the target, categorical and numerical columns.
    numerical_features: names of the numerical columns to put in the vector.
    categorical_features: names of the columns to string-index. The columns
      that enter the feature vector are currently hard-coded below
      (track_name, album_name, artist_name, album_release_date_precision),
      so they must all be present in this list.
    target_variable: name of the column to index into `label`.
    return_transformer: when True, also return the fitted pipeline model so
      that other data (e.g. a test split) can be encoded with exactly the same
      fitted stages instead of being re-fitted.

  Returns:
    The transformed DataFrame, or `(transformed DataFrame, fitted PipelineModel)`
    when `return_transformer` is True.

  `OneHotEncoder` and `StandardScaler` must be the `pyspark.ml.feature` classes.
  """
  # 1. Label Encode target feature
  label_indexer = StringIndexer(inputCol=target_variable, outputCol='label')

  # 2. Label Encode Categorical features
  categorical_indexers = [
      StringIndexer(inputCol=c, outputCol="{0}_index".format(c), handleInvalid="keep")
      for c in categorical_features
  ]

  # 3. OneHot Encode
  one_hot_encoder = OneHotEncoder(inputCols=['album_release_date_precision_index'],
                                  outputCols=['album_release_date_precision_oh'])

  # 4. Create a vector of all the features required to train the models
  encoded_columns = ['track_name_index', 'album_name_index', 'artist_name_index', 'album_release_date_precision_oh']
  assembler = VectorAssembler(inputCols=encoded_columns + list(numerical_features), outputCol='features')

  # 4.a Create the StandardScaler on top of the assembled vector. The original
  #     referred to an undefined name `assembler` here while the assembler was
  #     called `stage_4`, which raised NameError on every call.
  scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                          outputCol="std_" + assembler.getOutputCol(),
                          withStd=True,
                          withMean=True)

  # 5. Populate the stages of the pipeline
  stages = [label_indexer] + categorical_indexers + [one_hot_encoder, assembler, scaler]

  # 6. Setup the pipeline with the stages above
  pipeline = Pipeline(stages=stages)

  # 7. Fit on the input dataframe and transform it accordingly
  transformer = pipeline.fit(df)
  df_transformed = transformer.transform(df)

  if return_transformer:
    return df_transformed, transformer
  return df_transformed

In [None]:
# Fit the encoding pipeline on the training split and get back the encoded
# training data (`to_numerical` returns the transformed DataFrame).
train_set_encoded = to_numerical(
    train_set,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    TARGET_VARIABLE,
)

# Keep only the assembled feature vector and the indexed target.
train = train_set_encoded.select("features", "label")

train.show(5, truncate=False)

In [None]:
def logistic_regression_pipeline(train,
                                 numerical_features,
                                 categorical_features,
                                 target_variable,
                                 with_std=True,
                                 with_mean=True,
                                 k_fold=5):
  """Grid-search a logistic regression with k-fold cross validation.

  Args:
    train: Spark DataFrame to fit on. It must already contain the assembled
      `features` vector column and the numeric `label` column.
    numerical_features, categorical_features, target_variable, with_std,
      with_mean: accepted for signature compatibility; this function does not
      use them because no encoding or scaling stage is built here.
    k_fold: number of cross-validation folds.

  Returns:
    The fitted CrossValidatorModel (its best model is re-fitted on all of
    `train`; sub-models of every fold are kept because collectSubModels=True).
  """
  log_reg = LogisticRegression(featuresCol='features', labelCol='label')

  # Named `pipeline` rather than after the function, so the function's own
  # name is not shadowed inside its body.
  pipeline = Pipeline(stages=[log_reg])

  #### LOGISTIC REGRESSION
  # 3 regularisation strengths x 3 iteration caps = 9 candidate models
  param_grid = (
      ParamGridBuilder()
      .addGrid(log_reg.regParam, [0.01, 0.1, 1.0])
      .addGrid(log_reg.maxIter, [10, 20, 50])
      .build()
  )
  # other param: .addGrid(log_reg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])

  cross_val = CrossValidator(
      estimator=pipeline,
      estimatorParamMaps=param_grid,
      # accuracy is requested explicitly; this evaluator's default metric is f1
      evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),
      numFolds=k_fold,  # was hard-coded to 5, ignoring the argument
      collectSubModels=True,
  )

  # Fit on the data that was passed in. The original fitted on the notebook
  # global `spotify_tracks`, i.e. the full un-encoded dataset, and ignored `train`.
  return cross_val.fit(train)

In [None]:
# Run the cross-validated logistic-regression search, passing the encoded
# training frame `train` (columns `features` and `label`).
cv_model = logistic_regression_pipeline(train, NUMERICAL_FEATURES, CATEGORICAL_FEATURES, TARGET_VARIABLE)

# Make predictions on the test set (`cv_model` contains the best model according to the result of k-fold cross validation)
# NOTE(review): `test_set` comes straight from `randomSplit` and is never passed
# through the encoding stages in this notebook, so it has no `features` or
# `label` column, while the estimator reads `features` and the `select` below
# reads both. It presumably has to be transformed with the preprocessing that
# was fitted on the training split (not re-fitted on the test split) before
# this call — TODO confirm and fix.
test_predictions = cv_model.transform(test_set)

test_predictions.select("features", "prediction", "label").show(5)

In [None]:
# Pre-processing stages shared by the three pipelines below. They have to be
# defined at notebook level: the `stage_1` .. `stage_4` variables created inside
# `to_numerical` are local to that function, so referencing them here raised
# NameError.
# stage 1: index the target into `label`
stage_1 = StringIndexer(inputCol=TARGET_VARIABLE, outputCol='label')
# stage 2: index every categorical feature into `<column>_index`
stage_2 = [StringIndexer(inputCol=c, outputCol="{0}_index".format(c), handleInvalid="keep") for c in CATEGORICAL_FEATURES]
# stage 3: one-hot encode the release-date precision
stage_3 = OneHotEncoder(inputCols=['album_release_date_precision_index'], outputCols=['album_release_date_precision_oh'])
# stage 4: assemble encoded + numerical columns into the `features` vector
encoded_columns = ['track_name_index', 'album_name_index', 'artist_name_index', 'album_release_date_precision_oh']
stage_4 = VectorAssembler(inputCols=encoded_columns + NUMERICAL_FEATURES, outputCol='features')

preprocessing_stages = [stage_1] + stage_2 + [stage_3, stage_4]

# define stage 5: one classifier per pipeline (logistic regression, decision
# tree, random forest). These must be the Spark ML classes: they are
# configured with featuresCol/labelCol.
stage_5_lg = LogisticRegression(featuresCol='features', labelCol='label')
stage_5_dr = DecisionTreeClassifier(featuresCol='features', labelCol='label')
stage_5_rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# setup the pipelines: identical pre-processing, different final estimator.
# NOTE: `logistic_regression_pipeline` rebinds the name of the function defined
# earlier in the notebook; the name is kept because the cross-validation cell
# below passes it as the estimator.
logistic_regression_pipeline = Pipeline(stages=preprocessing_stages + [stage_5_lg])
decision_tree_pipeline = Pipeline(stages=preprocessing_stages + [stage_5_dr])
random_forest_pipeline = Pipeline(stages=preprocessing_stages + [stage_5_rf])

In [None]:
#### LOGISTIC REGRESSION
# Candidate grid: 3 regularisation strengths x 3 iteration caps.
param_grid = (
    ParamGridBuilder()
    .addGrid(stage_5_lg.regParam, [0.01, 0.1, 1.0])
    .addGrid(stage_5_lg.maxIter, [10, 20, 50])
    .build()
)
# other param: .addGrid(stage_5_lg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])

# 5-fold cross validation scored on accuracy, keeping every fold's sub-models.
cross_val_lg = CrossValidator(
    estimator=logistic_regression_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),
    numFolds=5,
    collectSubModels=True,
)

# NOTE(review): the model is fitted on, and then applied to, the whole
# `spotify_tracks` frame; the train/test split made earlier is not used here.
cv_model_lg = cross_val_lg.fit(spotify_tracks)

# transform the data
final_spotify_tracks_lg = cv_model_lg.transform(spotify_tracks)

In [None]:
#### DECISION TREE
# Candidate grid: 3 maximum depths x 2 impurity measures.
param_grid = (
    ParamGridBuilder()
    .addGrid(stage_5_dr.maxDepth, [3, 5, 8])
    .addGrid(stage_5_dr.impurity, ["gini", "entropy"])
    .build()
)

# 5-fold cross validation scored on accuracy, keeping every fold's sub-models.
cross_val_dt = CrossValidator(
    estimator=decision_tree_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),
    numFolds=5,
    collectSubModels=True,
)

# NOTE(review): the model is fitted on, and then applied to, the whole
# `spotify_tracks` frame; the train/test split made earlier is not used here.
cv_model_dt = cross_val_dt.fit(spotify_tracks)

# transform the data
final_spotify_tracks_dt = cv_model_dt.transform(spotify_tracks)

In [None]:
#### RANDOM FOREST
# Candidate grid: 3 maximum depths x 3 forest sizes.
param_grid = (
    ParamGridBuilder()
    .addGrid(stage_5_rf.maxDepth, [3, 5, 8])
    .addGrid(stage_5_rf.numTrees, [10, 50, 100])
    .build()
)

# 5-fold cross validation scored on accuracy, keeping every fold's sub-models.
# (The original was missing the comma after the `evaluator=` argument, so the
# cell did not even parse.)
cross_val_rf = CrossValidator(
    estimator=random_forest_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=MulticlassClassificationEvaluator().setMetricName("accuracy"),
    numFolds=5,
    collectSubModels=True,
)

# NOTE(review): the model is fitted on, and then applied to, the whole
# `spotify_tracks` frame; the train/test split made earlier is not used here.
cv_model_rf = cross_val_rf.fit(spotify_tracks)

# transform the data
final_spotify_tracks_rf = cv_model_rf.transform(spotify_tracks)

In [None]:
# Compare predictions with the indexed target. The target indexer writes to
# `label`; no stage produces a `track_genre_index` column, so selecting it failed.
final_spotify_tracks_lg.select("features", "prediction", "label").show(5)

In [None]:
# Compare predictions with the indexed target. The target indexer writes to
# `label`; no stage produces a `track_genre_index` column, so selecting it failed.
final_spotify_tracks_dt.select("features", "prediction", "label").show(5)

In [None]:
# Compare predictions with the indexed target. The target indexer writes to
# `label`; no stage produces a `track_genre_index` column, so selecting it failed.
final_spotify_tracks_rf.select("features", "prediction", "label").show(5)

In [None]:
# view some of the columns generated
# final_spotify_tracks.select('features', 'track_genre_index', 'rawPrediction', 'probability', 'prediction').take(10)