### Loading libraries

In [None]:
library(tidyverse)
library(tidymodels)
library(gridExtra)

### Utility functions

In [None]:
# Set the size used for plots rendered after this call.
#
# width:  value stored in the `repr.plot.width` option.
# heigth: value stored in the `repr.plot.height` option (the parameter
#         spelling is kept so existing calls keep working).
#
# Returns (invisibly) the previous values of both options, as `options()` does.
fig <- function(width, heigth) {
    plot_size <- list(repr.plot.width = width, repr.plot.height = heigth)
    options(plot_size)
}

### Loading data

In [None]:
# Prefix prepended verbatim to the data file names. Leave empty to read from
# the working directory; otherwise include the trailing path separator.
path <- ""

# The `name` column of `songs` is taken from column `x` of this table.
names <- read.table(paste0(path, "spotify-names.txt"), header = TRUE)

songs <- read.table(paste0(path, "spotify-extr.txt"),
                    sep = " ", header = TRUE) %>%
    as_tibble()

# The two files are joined purely by row position, so they must line up.
# Without this check a one-row names table would be silently recycled.
stopifnot(nrow(names) == nrow(songs))

songs <- songs %>%
    mutate(name = names$x,
           key = factor(key),
           mode = factor(mode),
           pop.class = factor(pop.class)) %>%
    # Targets first, categorical/identifier columns last, so that the
    # quantitative variables form one contiguous range in between.
    relocate(c(pop.class, popularity)) %>%
    relocate(c(mode, key, name), .after = last_col())

# Quantitative columns only: everything from `popularity` through `tempo`.
songs.quant <- songs %>% select(popularity:tempo)

head(songs)

## Exploratory statistics

In [None]:
# Histogram of every quantitative variable, one panel per variable with
# independent axes. `duration` is log-transformed before plotting.
fig(12, 8)
songs.quant %>%
    mutate(duration = log(duration)) %>%
    pivot_longer(
        cols = everything(),
        names_to = "variable",
        values_to = "value"
    ) %>%
    ggplot(aes(x = value)) +
    geom_histogram(fill = "#2FD565", color = "#000000", bins = 30) +
    facet_wrap(vars(variable), scales = "free")

In [None]:
# Scatter plot of popularity against each other quantitative variable, one
# panel per variable with independent axes. `duration` is log-transformed.
fig(12, 8)
songs.quant %>%
    mutate(duration = log(duration)) %>%
    pivot_longer(
        cols = -popularity,
        names_to = "variable",
        values_to = "value"
    ) %>%
    ggplot(aes(x = value, y = popularity)) +
    geom_point(size = .7, alpha = .3) +
    facet_wrap(vars(variable), scales = "free")

In [None]:
# Correlation matrix of the quantitative variables (popularity included),
# drawn with one ellipse per pair of variables.
library(corrplot)
cormat <- songs.quant %>% cor()
corrplot(cormat, method = "ellipse")

## Principal component analysis

In [None]:
library(FactoMineR)
library(factoextra)

# PCA on the quantitative variables with the target `popularity` left out,
# keeping 11 components in the result.
res.pca <- PCA(select(songs.quant, -popularity), ncp = 11)

In [None]:
fig(12, 4)

# Number of components to display: 11 at most, fewer if the PCA produced
# fewer eigenvalues (indexing past the end of `res.pca$eig` would fail).
n_pc <- min(11L, nrow(res.pca$eig))

# Left panel: scree plot from factoextra.
g1 <- fviz_eig(res.pca, addlabels = TRUE, ncp = n_pc)

# Right panel: third column of `res.pca$eig` (plotted as the cumulative
# percentage of variance) against the component index.
g2 <- tibble(pc = seq_len(n_pc), eig = res.pca$eig[seq_len(n_pc), 3]) %>%
    ggplot(aes(pc, eig)) +
    geom_col(fill = "#4984B3") +
    geom_line() +
    geom_point() +
    scale_x_continuous(breaks = seq_len(n_pc), minor_breaks = NULL) +
    labs(x = "", y = "Percentage of variance",
         title = "Cumulative percentage of variance") +
    theme_minimal()

grid.arrange(g1, g2, nrow = 1)

In [None]:
# Variable plot of the PCA, with each variable coloured by its contribution
# on a black-to-green gradient.
fig(8, 6)
fviz_pca_var(res.pca, col.var = "contrib") +
    scale_colour_gradient(low = "black", high = "green")

In [None]:
# Build a "PC<suffix>" label from a column name by dropping its first four
# characters and prefixing what remains with "PC" (e.g. "Dim.1" -> "PC1").
# Vectorised over `name`.
name_func <- function(name) {
    suffix <- substring(name, first = 5)
    paste0("PC", suffix)
}

# Coordinates of the individuals on the principal components, with every
# column renamed through `name_func`.
pca.ind <- res.pca$ind$coord %>%
    as_tibble() %>%
    rename_with(name_func)

# Individuals on the first two components, coloured by popularity class.
# The colour vector comes from `songs`, so it relies on `pca.ind` having the
# same row order. The y scale is limited to [-5, 5].
fig(10, 8)
ggplot(pca.ind, aes(x = PC1, y = PC2)) +
    geom_point(aes(colour = songs$pop.class), alpha = .5) +
    scale_y_continuous(limits = c(-5, 5)) +
    labs(colour = "Popularity class")

# Regression models

In [None]:
# The split below is random; fix the seed so the train/test partition (and
# every model fitted on it) is identical from one run to the next.
set.seed(42)

# Keep the class label plus the predictors `valence` through `tempo`, then
# hold out 25% of the rows for testing, stratified on the class label.
data_split <- songs %>%
    select(c(pop.class, valence:tempo)) %>%
    initial_split(strata = pop.class, prop = 0.75)

songs_train <- training(data_split)
songs_test <- testing(data_split)

### Linear model (logistic regression)

In [None]:
# Binomial GLM of `pop.class` on every other column of the training set.
lin_mod <- glm(
    formula = pop.class ~ .,
    family = "binomial",
    data = songs_train
)
lin_mod

In [None]:
# First ten test-set predictions, followed by the matching observed classes.
# NOTE(review): `predict()` is called without `type`, which for a glm gives
# values on the link scale rather than probabilities. Confirm that is the
# scale intended for this comparison.
predict(lin_mod, newdata = songs_test)[seq_len(10)]
songs_test[["pop.class"]][seq_len(10)]