In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from sklearn.model_selection import train_test_split

import statsmodels.api as sm

import warnings
warnings.simplefilter(action="ignore")
# We are required to do this in order to avoid "FutureWarning" issues.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARIMA, ARMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

%matplotlib inline

import pickle 

# Reading in data 

#### Importing original dataframes 

In [3]:
# Load the per-country feature frames (Italy, Spain, Greece) from pickles in ../data.
# NOTE(review): pd.read_pickle can execute arbitrary code on load — presumably these
# files were written by this project's own notebooks; confirm before using other sources.
italy_17_19 = pd.read_pickle('../data/it_17_19_v50_feat.pkl')
spain_17_19 = pd.read_pickle('../data/sp_17_19_v50_feat.pkl')
greece_17_19 = pd.read_pickle('../data/gr_17_19_v50_feat.pkl')

# NOTE(review): the variables are named *_20_21 but the files are named *_20_* —
# confirm which years these frames actually cover.
italy_20_21 = pd.read_pickle('../data/it_20_v50_feat.pkl')
spain_20_21 = pd.read_pickle('../data/sp_20_v50_feat.pkl')
greece_20_21 = pd.read_pickle('../data/gr_20_v50_feat.pkl')

In [8]:
# The 2021 Italy data is stored in its own pickle, separate from the frames loaded above.
italy_21 = pd.read_pickle('../data/it_21_v50_feat.pkl')

In [10]:
# Peek at the first two rows to check the columns and the earliest date in the 2021 frame.
italy_21.head(2)

Unnamed: 0_level_0,Position,Track Name,Artist,region,spotify_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-01-01,1,Problemas,Paris Boy,it,2aQJOc2QUTdQl1J2Z9VxYO,0.787,0.238,1,-13.154,0,0.0554,0.677,2.2e-05,0.0869,0.262,98.022,222892,4
2021-01-01,2,fools (can't help falling in love) (feat. Sody),Foster,it,4VEEDnEFLI9dUy5QA51rom,0.706,0.604,2,-6.932,1,0.303,0.418,0.0,0.242,0.361,82.03,165029,4


In [11]:
# Peek at the last two rows to see the latest date covered by the 2021 frame.
italy_21.tail(2)

Unnamed: 0_level_0,Position,Track Name,Artist,region,spotify_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-02-20,49,Un sorriso dentro al pianto,Ornella Vanoni,it,1SWwL27YCzUn4Cab9NHfn8,0.607,0.45,2,-7.978,1,0.0356,0.516,1.9e-05,0.114,0.237,119.642,223973,4
2021-02-20,50,Yellow Sweater,Dani,it,1Mv2VQTlPQSHBTgZbYAYSh,0.708,0.671,0,-4.899,1,0.0327,0.0171,2.2e-05,0.23,0.368,105.014,220857,4


#### Reading in resampled dataframes by year 

In [4]:
# Show the kernel's working directory; the relative '../data/...' paths in this notebook resolve against it.
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/capstone/code'

In [5]:
# Load the already-resampled Italy frames, one pickle per year (2017-2020), from ../data/resampled.
# NOTE(review): "rw" presumably means "resampled weekly" (the displayed index steps by 7 days) — confirm.
it_rw_17 = pd.read_pickle("../data/resampled/it_rw_17.pkl")
it_rw_18 = pd.read_pickle("../data/resampled/it_rw_18.pkl")
it_rw_19 = pd.read_pickle("../data/resampled/it_rw_19.pkl")
it_rw_20 = pd.read_pickle("../data/resampled/it_rw_20.pkl")

In [6]:
# Check the last two rows of the 2020 resampled frame (i.e. where its date index ends).
it_rw_20.tail(2)

Unnamed: 0_level_0,Position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-12-27,26.111429,0.65854,0.601163,5.225714,-7.301663,0.62,0.0999,0.313663,0.036708,0.155573,0.494371,117.747651,190622.557143,3.894286
2021-01-03,25.948571,0.663151,0.618714,5.782857,-7.345991,0.582857,0.104829,0.309551,0.028036,0.173735,0.492659,115.9257,194482.251429,3.942857


#### Resampling 2021 DF (was not pre-processed in other notebooks)

In [12]:
# Weekly mean of every numeric column in the 2021 frame.
# The text columns (Track Name, Artist, region, spotify_id) are dropped explicitly first:
# older pandas silently skipped them in .mean(), but pandas >= 2.0 raises a TypeError
# when asked to average non-numeric columns.
it_rw_21 = italy_21.select_dtypes(include="number").resample("W").mean()

In [13]:
# Inspect the first rows of the weekly-resampled 2021 frame.
it_rw_21.head()

Unnamed: 0_level_0,Position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-01-03,25.5,0.67152,0.6323,5.886667,-7.23908,0.546667,0.106519,0.293732,0.024474,0.166094,0.482812,114.97222,199451.313333,3.946667
2021-01-10,25.5,0.647797,0.595083,5.602857,-7.735671,0.611429,0.120629,0.333332,0.021076,0.171388,0.481609,114.757686,190786.811429,3.931429
2021-01-17,25.5,0.648654,0.575126,5.777143,-7.367766,0.591429,0.110776,0.358029,0.019525,0.186532,0.474283,115.710431,185596.828571,3.954286
2021-01-24,25.5,0.657583,0.601891,5.514286,-7.057797,0.622857,0.112212,0.363183,0.01911,0.216798,0.503386,117.861894,184245.968571,3.96
2021-01-31,25.5,0.658066,0.605014,5.222857,-6.992654,0.648571,0.110351,0.341578,0.037043,0.217935,0.505154,116.264131,185354.448571,3.96


# Importing needed functions 

_Function to create, fit, predict, and plot with ARIMA_ 

In [14]:
def arima_predict_plot(df, feature, year, param_df, title='title', figsize=(15,5), order=None, d=None, ci=True):
    """Fit an ARIMA model on one column, score it on a hold-out tail, and plot the result.

    The first 90% of rows of ``df[feature]`` are used for training and the remaining
    rows for testing (chronological split, no shuffling). Train and test RMSE are
    printed and written into ``param_df`` (columns ``arima_train_rmse`` and
    ``arima_test_rmse``) on the row(s) whose ``audio_feature`` equals ``feature``.

    Parameters
    ----------
    df : DataFrame holding the series; its index supplies the prediction start/end labels.
    feature : name of the column in ``df`` to model.
    year : only used in the printed RMSE labels.
    param_df : DataFrame with an ``audio_feature`` column; read for ``order`` when
        ``order`` is None, and mutated in place with the RMSE values.
    title : plot title.
    figsize : matplotlib figure size.
    order : ARIMA order; looked up in ``param_df['order']`` when None.
    d : number of leading training observations skipped when predicting and scoring
        the training set; looked up in the notebook-level ``ndiff_df`` when None.
    ci : when truthy, shade a band around the test predictions.

    Returns
    -------
    None. Any exception raised while fitting, predicting, scoring or plotting is
    caught and reported on stdout so a loop over many features keeps running.
    """
    # chronological 90/10 split
    n_rows = round(len(df)*0.9)
    train = df[feature][0:n_rows]
    test = df[feature][n_rows:]

    # find ndiffs for stationarity from ndiff dataframe
    if d is None:
        # NOTE(review): `ndiff_df` is not a parameter; it must already exist as a
        # notebook-level variable when d is not passed — confirm it is defined
        # before this function is called.
        d = ndiff_df.loc[ndiff_df['audio_feature'] == feature, 'ndiffs for stationarity'].iloc[0]
    print(f'd = {d}')

    if order is None:
        # find order from arima parameters dataframe
        order = param_df.loc[param_df['audio_feature'] == feature, 'order'].iloc[0]
    print(f'order = {order}')

    try:
        # instantiate and fit ARIMA model
        model = ARIMA(train, order=order)
        arima = model.fit()

        # get predictions for train and test sets
        preds_train = model.predict(params=arima.params, start=train.index[d], end=train.index[-1], typ='levels')
        preds_test = model.predict(params=arima.params, start=test.index[0], end=test.index[-1], typ='levels')

        # calculate and print RMSE for train and test sets
        train_rmse = mean_squared_error(train[d::], preds_train)**0.5
        print(f'{feature.capitalize()} train RMSE ({year}) - ARIMA({order}): {train_rmse}')

        test_rmse = mean_squared_error(test, preds_test)**0.5
        print(f'{feature.capitalize()} test RMSE ({year}) - ARIMA({order}): {test_rmse}')

        # add RMSEs to arima parameters dataframe
        feature_rows = param_df['audio_feature'] == feature
        param_df.loc[feature_rows, 'arima_train_rmse'] = train_rmse
        param_df.loc[feature_rows, 'arima_test_rmse'] = test_rmse

        # set up plot
        plt.figure(figsize=figsize)

        # training data (blue), held-out data (orange), test predictions (green)
        plt.plot(train, color='blue')
        plt.plot(test.index, test, color='orange')
        plt.plot(test.index, preds_test, color='green')

        # add line for the baseline model (mean value of feature)
        plt.hlines(df[feature].mean(), train.index[0], test.index[-1], color = 'grey')

        if ci:
            # Kept in its own local so the boolean `ci` flag is not overwritten.
            # NOTE(review): this width is 1.96 * std/mean of the predictions, a
            # heuristic band — it is not derived from the model's forecast errors.
            band = 1.96 * np.std(preds_test)/np.mean(preds_test)
            plt.fill_between(test.index, (preds_test - band), (preds_test + band), color='blue', alpha=.1)

        # make plot with title!
        plt.title(title, fontsize=16)
        plt.show()

    except Exception as err:
        # Best-effort by design: report the actual failure (the original printed the
        # ValueError class object, hiding the message) and carry on. Unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit through.
        print(f'{type(err).__name__}: {err}')

_More elemental function just to instantiate and fit ARIMA model._

_Function just to plot train/test/predictions for any model._ 

In [15]:
def arima_plot(train, test, preds_test, title='title', figsize=(15,5), ci=True):
    """Plot training data, held-out data and test-period predictions on one figure.

    Parameters
    ----------
    train : series drawn in blue against its own index.
    test : series drawn in orange against ``test.index``.
    preds_test : predictions for the test period, drawn in green against ``test.index``.
    title : plot title.
    figsize : matplotlib figure size.
    ci : when truthy, shade a band of +/- 1.96 * std(preds_test) / mean(preds_test)
        around the predictions.

    Returns
    -------
    None; the figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=figsize)

    # history first, then the two series that share the test index
    plt.plot(train, color='blue')
    for values, colour in ((test, 'orange'), (preds_test, 'green')):
        plt.plot(test.index, values, color=colour)

    # optional shaded band around the predictions
    if ci:
        half_width = 1.96 * np.std(preds_test)/np.mean(preds_test)
        lower = preds_test - half_width
        upper = preds_test + half_width
        plt.fill_between(test.index, lower, upper, color='blue', alpha=.1)

    plt.title(title, fontsize=16)
    plt.show()

_Function to create, fit, predict, and plot SARIMA model with `seasonal_order` passed in but **no** exogenous features included._

In [None]:
def sarima_predict_plot_seasonal(df, feature, year, param_df, title='title', figsize=(15,5), order=None, d=None, seasonal_order=None, ci=True):
    """Fit a seasonal SARIMAX model (no exogenous regressors) on one column, score and plot it.

    The first 90% of rows of ``df[feature]`` form the training set and the rest the
    test set. Train/test RMSE are printed and stored in ``param_df`` under
    ``sarima_train_rmse`` / ``sarima_test_rmse`` for the row(s) whose
    ``audio_feature`` equals ``feature``.

    Parameters
    ----------
    df : DataFrame holding the series; its index supplies prediction start/end labels.
    feature : column of ``df`` to model.
    year : only used in the printed RMSE labels.
    param_df : DataFrame with ``audio_feature``; read for ``order`` / ``seasonal_order``
        when those are None, and updated in place with the RMSE values.
    title, figsize : plot title and matplotlib figure size.
    order : non-seasonal order; taken from ``param_df['order']`` when None.
    d : number of leading training observations skipped when predicting and scoring
        the training set; taken from the notebook-level ``ndiff_df`` when None.
    seasonal_order : seasonal order tuple; when None it is parsed from the
        ``', '``-separated string stored in ``param_df['seasonal_order']``.
    ci : when truthy, shade a band of +/- 1.96 * std / mean of the test predictions.

    Returns
    -------
    None. A ``ValueError`` raised while fitting, predicting, scoring or plotting is
    printed instead of propagated.
    """
    # chronological 90/10 split
    split_at = round(len(df)*0.9)
    train = df[feature][0:split_at]
    test = df[feature][split_at:]

    # differencing count: fall back to the ndiff dataframe
    if d is None:
        d = ndiff_df.loc[ndiff_df['audio_feature'] == feature, 'ndiffs for stationarity'].iloc[0]
    print(f'd = {d}')

    # non-seasonal order: fall back to the parameters dataframe
    if order is None:
        order = param_df.loc[param_df['audio_feature'] == feature, 'order'].iloc[0]
    print(f'order = {order}')

    # seasonal order: stored as a ', '-separated string, convert to a tuple of ints
    if seasonal_order is None:
        stored = param_df.loc[param_df['audio_feature'] == feature, 'seasonal_order'].iloc[0]
        seasonal_order = tuple(int(part) for part in stored.split(', '))
    print(f'seasonal order = {seasonal_order}')

    try:
        fitted = SARIMAX(endog=train, order=order, seasonal_order=seasonal_order).fit()

        # in-sample predictions start after the first d observations
        fit_preds = fitted.predict(start=train.index[d], end=train.index[-1], typ='levels')
        holdout_preds = fitted.predict(start=test.index[0], end=test.index[-1], typ='levels')

        train_rmse = mean_squared_error(train[d::], fit_preds)**0.5
        print(f'{feature.capitalize()} train RMSE ({year}) - SARIMA({order}): {train_rmse}')

        test_rmse = mean_squared_error(test, holdout_preds)**0.5
        print(f'{feature.capitalize()} test RMSE ({year}) - SARIMA({order}): {test_rmse}')

        # record both scores on this feature's row(s)
        feature_rows = param_df['audio_feature'] == feature
        param_df.loc[feature_rows, 'sarima_train_rmse'] = train_rmse
        param_df.loc[feature_rows, 'sarima_test_rmse'] = test_rmse

        plt.figure(figsize=figsize)

        # training data (blue), held-out data (orange), test predictions (green)
        plt.plot(train, color='blue')
        plt.plot(test.index, test, color='orange')
        plt.plot(test.index, holdout_preds, color='green')

        # grey baseline: overall mean of the feature
        plt.hlines(df[feature].mean(), train.index[0], test.index[-1], color = 'grey')

        if ci:
            half_width = 1.96 * np.std(holdout_preds)/np.mean(holdout_preds)
            plt.fill_between(test.index, (holdout_preds - half_width), (holdout_preds + half_width), color='blue', alpha=.1)

        plt.title(title, fontsize=16)
        plt.show()

    except ValueError as err:
        print(err)

_Function to create, fit, predict, and plot SARIMAX model with `seasonal_order` and exogenous variables (`exog_var`)._

In [None]:
def sarima_predict_plot_exog(df, feature, year, param_df, exog_var, title='title', figsize=(15,5), order=None, d=None, seasonal_order=None, ci=True):
    """Fit a seasonal SARIMAX model with exogenous regressors on one column, score and plot it.

    The first 90% of rows are used for training and the rest for testing; the
    exogenous columns ``df[exog_var]`` are split at the same row. Train/test RMSE are
    printed and stored in ``param_df`` under ``exog_train_rmse`` / ``exog_test_rmse``
    for the row(s) whose ``audio_feature`` equals ``feature``.

    Parameters
    ----------
    df : DataFrame holding the target and exogenous columns.
    feature : column of ``df`` to model.
    year : only used in the printed RMSE labels.
    param_df : DataFrame with ``audio_feature``; read for ``order`` / ``seasonal_order``
        when those are None, and updated in place with the RMSE values.
    exog_var : column label(s) of ``df`` passed to the model as ``exog``.
    title, figsize : plot title and matplotlib figure size.
    order : non-seasonal order; taken from ``param_df['order']`` when None.
    d : number of leading training observations skipped when predicting and scoring
        the training set; taken from the notebook-level ``ndiff_df`` when None.
    seasonal_order : seasonal order tuple; when None it is parsed from the
        ``', '``-separated string stored in ``param_df['seasonal_order']``.
    ci : when truthy, shade a band of +/- 1.96 * std / mean of the test predictions.

    Returns
    -------
    None. A ``ValueError`` raised while fitting, predicting, scoring or plotting is
    printed instead of propagated.
    """
    # differencing count: fall back to the ndiff dataframe
    if d is None:
        d = ndiff_df.loc[ndiff_df['audio_feature'] == feature, 'ndiffs for stationarity'].iloc[0]
    print(f'd = {d}')

    # non-seasonal order: fall back to the parameters dataframe
    if order is None:
        order = param_df.loc[param_df['audio_feature'] == feature, 'order'].iloc[0]
    print(f'order = {order}')

    # seasonal order: stored as a ', '-separated string, convert to a tuple of ints
    if seasonal_order is None:
        stored = param_df.loc[param_df['audio_feature'] == feature, 'seasonal_order'].iloc[0]
        seasonal_order = tuple(int(part) for part in stored.split(', '))
    print(f'seasonal order = {seasonal_order}')

    # exogenous regressors, selected before the split so both halves share one frame
    exog = df.loc[:, exog_var]

    # chronological 90/10 split of target and regressors
    split_at = round(len(df)*0.9)
    train = df[feature][0:split_at]
    test = df[feature][split_at:]

    try:
        fitted = SARIMAX(endog=train, exog=exog[0:split_at], order=order, seasonal_order=seasonal_order).fit()

        # each prediction call is given the regressors for its own period
        fit_preds = fitted.predict(start=train.index[d], end=train.index[-1], typ='levels', exog=exog[0:split_at])
        holdout_preds = fitted.predict(start=test.index[0], end=test.index[-1], typ='levels', exog=exog[split_at:])

        train_rmse = mean_squared_error(train[d::], fit_preds)**0.5
        print(f'{feature.capitalize()} train RMSE ({year}) - SARIMAX({seasonal_order}) w/ exogenous variables: {train_rmse}')

        test_rmse = mean_squared_error(test, holdout_preds)**0.5
        print(f'{feature.capitalize()} test RMSE ({year}) - SARIMAX({seasonal_order}) w/ exogenous variables: {test_rmse}')

        # record both scores on this feature's row(s)
        feature_rows = param_df['audio_feature'] == feature
        param_df.loc[feature_rows, 'exog_train_rmse'] = train_rmse
        param_df.loc[feature_rows, 'exog_test_rmse'] = test_rmse

        plt.figure(figsize=figsize)

        # training data (blue), held-out data (orange), test predictions (green)
        plt.plot(train, color='blue')
        plt.plot(test.index, test, color='orange')
        plt.plot(test.index, holdout_preds, color='green')

        # grey baseline: overall mean of the feature
        plt.hlines(df[feature].mean(), train.index[0], test.index[-1], color = 'grey')

        if ci:
            half_width = 1.96 * np.std(holdout_preds)/np.mean(holdout_preds)
            plt.fill_between(test.index, (holdout_preds - half_width), (holdout_preds + half_width), color='blue', alpha=.1)

        plt.title(title, fontsize=16)
        plt.show()

    except ValueError as err:
        print(err)

_Identifying exogenous variables for each of the five main features for use in the `sarima_predict_plot_exog` function. In this case, the exogenous variables for each audio feature are the other four audio features._

In [16]:
# The five audio features modelled in this notebook.
five_features = ['danceability', 'mode', 'acousticness', 'valence', 'energy']


def _others(target):
    """Return the entries of ``five_features`` other than ``target``, in their original order."""
    return [name for name in five_features if name != target]


# Exogenous regressors for each target feature: the remaining four features.
exog_danceability = _others('danceability')
exog_mode = _others('mode')
exog_acousticness = _others('acousticness')
exog_valence = _others('valence')
exog_energy = _others('energy')