# Appendix V: Forward Selection

In [7]:
"""Imports necessary packages"""

import itertools
import math
from typing import Dict, Iterable, List, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

sns.set_style("whitegrid")

In [8]:
def forward_p_vals(input_model_str: str, vars: List[str], data: Iterable) -> Dict[str, float]:
    """Fits one candidate model per variable and collects the p-value of the added term.

    Every entry of `vars` is appended to `input_model_str`, the resulting formula is fitted
    with ordinary least squares, and the p-value of the newly added term is stored under the
    new model string.

    Args:
        input_model_str (str): a model string as required by statsmodels.api.formula.ols.
            A string ending in "~" is treated as a model that has no explanatory variable yet.
        vars (List[str]): the list of explanatory variables that can be considered in a new
            model. An entry written as "a*b" is looked up under the parameter name "a:b".
        data (Iterable): the two dimensional data for fitting the new models.

    Returns:
        Dict[str, float]: the dictionary of new model strings (key) and the associated
            p-values (values) of their added variable. The value is 1 when the fitted model
            exposes no parameter matching the added variable.
    """
    # The first explanatory variable directly follows "~"; later ones are joined with "+".
    separator = " " if input_model_str.endswith("~") else " + "

    result = {}
    for var in vars:
        model_str = "%s%s%s" % (input_model_str, separator, var)
        fitted = sm.formula.ols(formula=model_str, data=data).fit()
        p_vals = fitted.pvalues.to_dict()

        # The formula "a*b" yields a product parameter that is labelled "a:b".
        term = var.replace("*", ":")
        if term not in p_vals:
            # Parameters with a bracketed suffix (e.g. "x[T.1]") never match by exact name.
            # Anchor the search at the start of the parameter name so that a different
            # variable whose name merely ends with `term` is not picked up by accident;
            # the first matching parameter is used.
            prefix = term + "["
            term = next((name for name in p_vals if name.startswith(prefix)), term)

        # Deliberate best effort: an unmatched term counts as "not significant".
        result[model_str] = p_vals.get(term, 1)
    return result

In [9]:
def print_p_vals_from_models(models: Dict[str, float]) -> None:
    """Prints one line per candidate model with the p-value of its added variable.

    Args:
        models (Dict[str, float]): the dictionary of model strings (key) and the p-values
            (values) to report. The text after the last space of each model string is
            printed as the variable name.
    """
    for model_str, p_value in models.items():
        added_var = model_str.rsplit(" ", 1)[-1]
        print("p-value of %s: %.8f" % (added_var, p_value))

In [10]:
def forward_selection(response_var: str, explanatory_vars: List[str], data: Iterable, alpha: float = 0.05) -> str:
    """Performs the forward selection.

    Starting from a model without explanatory variables, each step fits one candidate model
    per remaining variable, adds the variable with the smallest p-value, and stops as soon as
    that smallest p-value exceeds `alpha`. Progress is printed at every step.

    Args:
        response_var (str): the variable to predict.
        explanatory_vars (List[str]): the list of explanatory variables that may be used in
            the model. The list itself is not modified. Entries are expected to contain no
            spaces, because the added variable is recovered as the last space-separated token
            of the chosen model string.
        data (Iterable): the two-dimensional data to use for model fitting.
        alpha (float): the significance level; a variable is only added while its p-value
            does not exceed this threshold. Defaults to 0.05.

    Returns:
        str: the resulting model string in the format required by statsmodels.api.formula.ols.
            This is the last accepted model, also when every variable ends up in the model or
            when `explanatory_vars` is empty.
    """
    step = 1
    current_model = "%s ~" % response_var
    remaining = list(explanatory_vars)
    while remaining:
        print("--- STEP %i ---" % step)
        print("current model: %s" % current_model)

        models = forward_p_vals(current_model, remaining, data)
        print("possible variables:")
        print_p_vals_from_models(models)

        best_next_model = min(models, key=models.get)
        if models[best_next_model] > alpha:
            print("The minimal p-value is higher than %s, returning the previous model" % alpha)
            return current_model

        current_model = best_next_model
        # The candidate model string ends with the variable that was just added.
        added_var = best_next_model.split(" ")[-1]
        remaining = [var for var in remaining if var != added_var]
        step += 1

    # All candidates were consumed without hitting the threshold: the current model is final.
    return current_model

In [11]:
# Load the preprocessed songs CSV and discard its "Unnamed: 0" column.
# NOTE(review): presumably that column is a leftover row index from an earlier export - confirm.
data = pd.read_csv(
    "D:/School/frequentist-statistics/ITM-song-popularity/database/itm_songs_preprocessed.csv"
).drop(columns="Unnamed: 0")

In [12]:
# Candidate explanatory variables offered to the forward selection.
explanatory_vars = [
    "name_len",
    "track_number",
    "duration",
    "acousticness",
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "valence",
    "tempo",
    "complexity",
    "age_days",
    "mode",
]

In [13]:
# Forward selection for the absolute popularity over the current candidate list.
best_fs_abs = forward_selection(
    response_var="popularity_abs",
    explanatory_vars=explanatory_vars,
    data=data,
)
print(
    "The best model for absolute popularity excluding correlations obtained via forward selection is `%s`."
    % best_fs_abs
)

--- STEP 1 ---
current model: popularity_abs ~
possible variables:
p-value of name_len: 0.46054799
p-value of track_number: 0.21061084
p-value of duration: 0.00087687
p-value of acousticness: 0.00576701
p-value of danceability: 0.00328833
p-value of energy: 0.03409356
p-value of loudness: 0.00488552
p-value of speechiness: 0.22617057
p-value of valence: 0.03859374
p-value of tempo: 0.54998284
p-value of complexity: 0.00001846
p-value of age_days: 0.00000017
p-value of mode: 0.91191181
--- STEP 2 ---
current model: popularity_abs ~ age_days
possible variables:
p-value of name_len: 0.91292203
p-value of track_number: 0.04107383
p-value of duration: 0.00014822
p-value of acousticness: 0.00073659
p-value of danceability: 0.00018752
p-value of energy: 0.00038229
p-value of loudness: 0.00000553
p-value of speechiness: 0.23901821
p-value of valence: 0.02374235
p-value of tempo: 0.08657363
p-value of complexity: 0.00000317
p-value of mode: 0.75308241
--- STEP 3 ---
current model: popularity_ab

In [14]:
# Forward selection for the relative (normalised) popularity over the current candidate list.
best_fs_rel = forward_selection(
    response_var="popularity_norm",
    explanatory_vars=explanatory_vars,
    data=data,
)
print(
    "The best model for relative popularity excluding correlations obtained via forward selection is `%s`."
    % best_fs_rel
)

--- STEP 1 ---
current model: popularity_norm ~
possible variables:
p-value of name_len: 0.46054799
p-value of track_number: 0.21061084
p-value of duration: 0.00087687
p-value of acousticness: 0.00576701
p-value of danceability: 0.00328833
p-value of energy: 0.03409356
p-value of loudness: 0.00488552
p-value of speechiness: 0.22617057
p-value of valence: 0.03859374
p-value of tempo: 0.54998284
p-value of complexity: 0.00001846
p-value of age_days: 0.00000017
p-value of mode: 0.91191181
--- STEP 2 ---
current model: popularity_norm ~ age_days
possible variables:
p-value of name_len: 0.91292203
p-value of track_number: 0.04107383
p-value of duration: 0.00014822
p-value of acousticness: 0.00073659
p-value of danceability: 0.00018752
p-value of energy: 0.00038229
p-value of loudness: 0.00000553
p-value of speechiness: 0.23901821
p-value of valence: 0.02374235
p-value of tempo: 0.08657363
p-value of complexity: 0.00000317
p-value of mode: 0.75308241
--- STEP 3 ---
current model: popularity_

In [15]:
# Product terms ("a*b") offered as additional candidates.
correlations = [
    "duration*complexity",
    "acousticness*energy",
    "energy*loudness",
    "track_number*complexity",
    "track_number*duration",
    "duration*loudness",
    "duration*speechiness",
    "acousticness*loudness",
    "danceability*valence",
    "danceability*complexity",
    "loudness*complexity",
    "valence*complexity",
]
# Grows the existing candidate list in place, so re-running this cell appends the terms again.
explanatory_vars += correlations

In [16]:
# Forward selection for the absolute popularity, now with the product terms among the candidates.
best_corr_fs_abs = forward_selection(
    response_var="popularity_abs",
    explanatory_vars=explanatory_vars,
    data=data,
)
print(
    "The best model for absolute popularity including correlations obtained via forward selection is `%s`."
    % best_corr_fs_abs
)

--- STEP 1 ---
current model: popularity_abs ~
possible variables:
p-value of name_len: 0.46054799
p-value of track_number: 0.21061084
p-value of duration: 0.00087687
p-value of acousticness: 0.00576701
p-value of danceability: 0.00328833
p-value of energy: 0.03409356
p-value of loudness: 0.00488552
p-value of speechiness: 0.22617057
p-value of valence: 0.03859374
p-value of tempo: 0.54998284
p-value of complexity: 0.00001846
p-value of age_days: 0.00000017
p-value of mode: 0.91191181
p-value of duration*complexity: 0.73151876
p-value of acousticness*energy: 0.00982869
p-value of energy*loudness: 0.25318394
p-value of track_number*complexity: 0.31355216
p-value of track_number*duration: 0.15667675
p-value of duration*loudness: 0.29716809
p-value of duration*speechiness: 0.02035317
p-value of acousticness*loudness: 0.46479656
p-value of danceability*valence: 0.10095925
p-value of danceability*complexity: 0.20162847
p-value of loudness*complexity: 0.19865457
p-value of valence*complexity

In [17]:
# Forward selection for the relative popularity, now with the product terms among the candidates.
best_corr_fs_rel = forward_selection(
    response_var="popularity_norm",
    explanatory_vars=explanatory_vars,
    data=data,
)
print(
    "The best model for relative popularity including correlations obtained via forward selection is `%s`."
    % best_corr_fs_rel
)

--- STEP 1 ---
current model: popularity_norm ~
possible variables:
p-value of name_len: 0.46054799
p-value of track_number: 0.21061084
p-value of duration: 0.00087687
p-value of acousticness: 0.00576701
p-value of danceability: 0.00328833
p-value of energy: 0.03409356
p-value of loudness: 0.00488552
p-value of speechiness: 0.22617057
p-value of valence: 0.03859374
p-value of tempo: 0.54998284
p-value of complexity: 0.00001846
p-value of age_days: 0.00000017
p-value of mode: 0.91191181
p-value of duration*complexity: 0.73151876
p-value of acousticness*energy: 0.00982869
p-value of energy*loudness: 0.25318394
p-value of track_number*complexity: 0.31355216
p-value of track_number*duration: 0.15667675
p-value of duration*loudness: 0.29716809
p-value of duration*speechiness: 0.02035317
p-value of acousticness*loudness: 0.46479656
p-value of danceability*valence: 0.10095925
p-value of danceability*complexity: 0.20162847
p-value of loudness*complexity: 0.19865457
p-value of valence*complexit