## Data preprocessing

In [None]:
# Chargement des librairies nécessaires
library(ggplot2)
library(tidyverse)
library(gridExtra)
library(GGally)
library(plotly)
library(corrplot)
library(reshape2)
library(FactoMineR) 
library(factoextra)
library(glmnet) 
library(ggfortify)
library(pROC)
library(ROCR)

In [None]:
# Load the Spotify songs dataset from the local data directory
path <- "data/"
song <- read.csv(
  file = paste0(path, "spotify_songs.csv"),
  header = TRUE,
  sep = ","
)

# Preview the first rows of the dataset
head(song)

# Per-column summary to sanity-check the loaded content
summary(song)

In [None]:
# Inspect the structure of the loaded data frame (column names, types and a
# sample of values) before any type conversion is applied
str(song)

In [None]:
# Drop the identifier columns (track_id, track_album_id, playlist_id).
# They are selected by name rather than by position so that the right columns
# are removed even if the column order of the CSV changes.
id_columns <- c("track_id", "track_album_id", "playlist_id")
song <- song[, !(names(song) %in% id_columns), drop = FALSE]

# Convert the text-valued categorical variables to factors
factor_columns <- c(
  "playlist_name", "track_album_name", "track_artist",
  "playlist_genre", "playlist_subgenre"
)
song[factor_columns] <- lapply(song[factor_columns], as.factor)

# key is coded -1 and 0..11; attach a readable label to each code
song$key <- factor(
  song$key,
  levels = c(-1, 0:11),
  labels = c("No key detected", "C", "C♯/D♭", "D", "D♯/E♭", "E", "F", "F♯/G♭", "G", "G♯/A♭", "A", "A♯/B♭", "B")
)

# mode is coded 0 / 1; label the codes as minor / major
song$mode <- factor(song$mode, levels = c(0, 1), labels = c("minor", "major"))

# Parse track_album_release_date as a Date. Any value that is not a
# 10-character string is replaced by January 1st of the year given by its
# first four characters.
song$track_album_release_date <- as.Date(
  ifelse(
    nchar(song$track_album_release_date) != 10,
    paste0(substr(song$track_album_release_date, 1, 4), "-01-01"),
    song$track_album_release_date
  ),
  format = "%Y-%m-%d"
)

# Express the duration in seconds (duration_s) and drop the millisecond column
song$duration_s <- song$duration_ms / 1000
song$duration_ms <- NULL

# Check the modified dataset
summary(song)
head(song)

In [None]:
# Count the missing values per column before cleaning
song %>% is.na() %>% colSums()

# Keep only the rows without any missing value
song <- song %>% na.omit()

# Confirm that no missing value is left
song %>% is.na() %>% colSums()

# Structure of the cleaned data
song %>% str()

# Number of remaining observations
dim(song)[1]

In [None]:
# Number of rows that are exact duplicates of an earlier row
song %>% duplicated() %>% sum() %>% cat('\n')

# Number of rows repeating an earlier (track_name, track_artist) pair
song[c("track_name", "track_artist")] %>% duplicated() %>% sum() %>% cat('\n')

In [None]:
# Song-level dataset: remove the playlist-specific columns (playlist_name,
# playlist_genre, playlist_subgenre). The columns are selected by name rather
# than by position so the result does not depend on the column order of song.
playlist_columns <- c("playlist_name", "playlist_genre", "playlist_subgenre")
onlysongs <- song[, !(names(song) %in% playlist_columns), drop = FALSE]

# Number of rows that became exact duplicates once the playlist columns are gone
cat(sum(duplicated(onlysongs)), "\n")

# Drop the exact duplicates
onlysongs <- unique(onlysongs)

# Number of rows that still repeat an earlier (track_name, track_artist) pair
cat(sum(duplicated(onlysongs[, c("track_name", "track_artist")])), "\n")

# Show the rows named "Something Real" before de-duplication
onlysongs[onlysongs$track_name == "Something Real", ]

# For each (track_name, track_artist) pair keep a single row: the one with the
# highest popularity. slice(1) breaks ties by keeping the first such row.
# The result of this pipeline is an ungrouped tibble.
onlysongs <- onlysongs %>%
  group_by(track_name, track_artist) %>%
  filter(track_popularity == max(track_popularity)) %>%
  slice(1) %>%
  ungroup()

# Show the rows named "Something Real" after de-duplication
onlysongs[onlysongs$track_name == "Something Real", ]

# Number of rows in onlysongs
cat(nrow(onlysongs), "\n")

## Descriptive analysis

In [None]:
# Per-column summary of the de-duplicated, song-level dataset
summary(onlysongs)

In [None]:
# Histograms of the numeric song features (popularity, danceability, energy,
# loudness, speechiness, acousticness, instrumentalness, liveness, valence,
# tempo, duration_s), drawn on a 4 x 3 grid.
options(repr.plot.width = 15, repr.plot.height = 15)
par(mfrow = c(4, 3))

# Plot title for each feature, in drawing order
hist_titles <- c(
  track_popularity = "Popularité des chansons",
  danceability = "Danceability",
  energy = "Energy",
  loudness = "Loudness",
  speechiness = "Speechiness",
  acousticness = "Acousticness",
  instrumentalness = "Instrumentalness",
  liveness = "Liveness",
  valence = "Valence",
  tempo = "Tempo",
  duration_s = "Durée des chansons"
)

# The x-axis label equals the title except for popularity and duration
hist_xlabs <- hist_titles
hist_xlabs[c("track_popularity", "duration_s")] <- c("Popularité", "Durée (s)")

for (feature in names(hist_titles)) {
  hist(
    onlysongs[[feature]],
    main = hist_titles[[feature]],
    xlab = hist_xlabs[[feature]],
    ylab = "Nombre de chansons",
    col = "lightblue",
    border = "black"
  )
}

In [None]:
# Bar plots of the categorical variables, drawn on a 2 x 3 grid
options(repr.plot.width = 15, repr.plot.height = 10)
par(mfrow = c(2, 3))

# Bar plot of the 'mode' variable
mode_counts <- table(onlysongs$mode)
barplot(
  mode_counts,
  main = "Distribution de la variable 'mode'",
  xlab = "Mode",
  ylab = "Fréquence",
  col = "lightblue",
  border = "black",
  las = 1 # horizontal labels
)

# Bar plot of the 'key' variable
key_counts <- table(onlysongs$key)
barplot(
  key_counts,
  main = "Distribution de la variable 'key'",
  xlab = "Key",
  ylab = "Fréquence",
  col = "lightblue",
  border = "black",
  las = 1 # horizontal labels
)

# Copy the release date into 'track_album_release' as a Date.
# The source column is read by its full name, track_album_release_date:
# onlysongs is a tibble after the dplyr de-duplication pipeline, and tibbles
# do not partial-match column names with `$`, so the abbreviated name
# `onlysongs$track_album_release` would not find the column.
onlysongs$track_album_release <- as.Date(onlysongs$track_album_release_date)

# Extract the release year
onlysongs$release_year <- format(onlysongs$track_album_release, "%Y")

# Count the songs per release year
release_year_counts <- table(onlysongs$release_year)

# Bar plot of the release years
barplot(
  release_year_counts,
  main = "Distribution des années de sortie des albums",
  xlab = "Année de sortie",
  ylab = "Nombre de morceaux",
  col = "lightblue",
  border = "black",
  las = 2 # rotate the labels to keep them readable
)

# Bar plot of 'playlist_genre' (read from song: the playlist columns were
# removed when onlysongs was built)
genre_counts <- table(song$playlist_genre)
barplot(
  genre_counts,
  main = "Distribution de la variable 'playlist_genre'",
  xlab = "Genre de la playlist",
  ylab = "Fréquence",
  col = "lightblue",
  border = "black",
  las = 2 # rotate the labels to keep them readable
)

# Bar plot of 'playlist_subgenre' (also read from song)
subgenre_counts <- table(song$playlist_subgenre)
barplot(
  subgenre_counts,
  main = "Distribution de la variable 'playlist_subgenre'",
  xlab = "Sous-genre de la playlist",
  ylab = "Fréquence",
  col = "lightblue",
  border = "black",
  las = 2 # rotate the labels to keep them readable
)

In [None]:
# Bin 'speechiness' into three categories using the cut points 0.33 and 0.66
# (the lowest bin includes 0)
onlysongs <- onlysongs %>%
  mutate(
    speechiness_category = cut(
      speechiness,
      breaks = c(0, 0.33, 0.66, 1),
      labels = c("Music", "Speech and music", "Speech"),
      include.lowest = TRUE
    )
  )

# Number of songs in each category
speechiness_counts <- table(onlysongs$speechiness_category)

# Bar plot of the category counts
barplot(
  height = speechiness_counts,
  col = "lightblue",
  border = "black",
  main = "Distribution des catégories de 'speechiness'",
  xlab = "Catégories de speechiness",
  ylab = "Nombre de morceaux"
)

In [None]:
# Split the songs into "Vocal" / "Instrumental" at the 0.5 threshold.
# These labels describe the 'instrumentalness' feature, so the categories are
# built from that column (not from 'speechiness') and stored in their own
# column, leaving any existing speechiness_category untouched.
onlysongs$instrumentalness_category <- cut(
  onlysongs$instrumentalness,
  breaks = c(0, 0.5, 1),
  labels = c("Vocal", "Instrumental"),
  include.lowest = TRUE # keep songs whose value is exactly 0
)

# Number of songs in each category
instrumentalness_counts <- table(onlysongs$instrumentalness_category)

# Bar plot of the category counts
barplot(
  instrumentalness_counts,
  main = "Distribution des catégories d'instrumentalness",
  xlab = "Catégories d'instrumentalness",
  ylab = "Nombre de morceaux",
  col = "lightblue",
  border = "black"
)

In [None]:
# List the columns available in onlysongs
names(onlysongs)

# Numeric features included in the correlation analysis
numeric_features <- c(
  "track_popularity", "danceability", "energy", "loudness", "speechiness",
  "acousticness", "instrumentalness", "liveness", "valence", "tempo",
  "duration_s"
)

# Pairwise correlation matrix of those features, displayed as numbers
correlation_matrix <- cor(onlysongs[, numeric_features])
corrplot(correlation_matrix, method = "number")

In [None]:
# Encode each artist as the integer code of its factor level
# NOTE(review): these codes only reflect the order of the factor levels, so the
# resulting correlation is hard to interpret; confirm this is intended.
onlysongs <- onlysongs %>%
  mutate(artist_numeric = as.numeric(as.factor(track_artist)))

# Correlation between the encoded artist and the popularity
correlation_artist_popularity <- with(
  onlysongs,
  cor(artist_numeric, track_popularity)
)

# Display the correlation
print(correlation_artist_popularity)

In [None]:
# Scatter plot of energy against acousticness, coloured by popularity
onlysongs %>%
  ggplot(aes(x = acousticness, y = energy, color = track_popularity)) +
  geom_point(alpha = 0.5) +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(
    title = "Relation entre l'acousticness et l'energy",
    x = "Acousticness",
    y = "Energy",
    color = "Popularité"
  )

In [None]:
# Scatter plot of loudness against acousticness, coloured by popularity
onlysongs %>%
  ggplot(aes(x = acousticness, y = loudness, color = track_popularity)) +
  geom_point(alpha = 0.5) +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(
    title = "Relation entre l'acousticness et la loudness",
    x = "Acousticness",
    y = "Loudness",
    color = "Popularité"
  )

In [None]:
# Scatter plot of energy against loudness, coloured by popularity
onlysongs %>%
  ggplot(aes(x = loudness, y = energy, color = track_popularity)) +
  geom_point(alpha = 0.5) +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(
    title = "Relation entre la loudness et l'energy",
    x = "Loudness",
    y = "Energy",
    color = "Popularité"
  )

In [None]:
# One popularity histogram per playlist genre, drawn on a 3 x 2 grid.
# The "song" data is used because it still carries the playlist columns.
options(repr.plot.width = 15, repr.plot.height = 10)
par(mfrow = c(3, 2))
for (genre in levels(song$playlist_genre)) {
  hist(
    with(song, track_popularity[playlist_genre == genre]),
    col = "lightblue",
    border = "black",
    main = paste("Popularité des chansons dans le genre", genre),
    xlab = "Popularité",
    ylab = "Nombre de chansons"
  )
}

In [None]:
# One popularity histogram per playlist subgenre, six per 3 x 2 page.
# The "song" data is used because it still carries the playlist columns.
options(repr.plot.width = 15, repr.plot.height = 10)
par(mfrow = c(3, 2))
for (subgenre in levels(song$playlist_subgenre)) {
  hist(
    with(song, track_popularity[playlist_subgenre == subgenre]),
    col = "lightblue",
    border = "black",
    main = paste("Popularité des chansons dans le sous-genre", subgenre),
    xlab = "Popularité",
    ylab = "Nombre de chansons"
  )
}

In [None]:
# Derive the release year (as a four-digit string) from the release date
onlysongs <- onlysongs %>%
  mutate(release_year = format(track_album_release_date, "%Y"))

# Mean popularity for each release year
average_popularity_by_year <- onlysongs %>%
  group_by(release_year) %>%
  summarise(mean_popularity = mean(track_popularity, na.rm = TRUE))

# Line plot (with points) of the mean popularity over the release years
average_popularity_by_year %>%
  ggplot(aes(x = as.numeric(release_year), y = mean_popularity)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  theme_minimal() +
  labs(
    title = "Popularité moyenne des musiques par année de sortie",
    x = "Année de sortie",
    y = "Popularité moyenne"
  )

In [None]:
# Number of songs for each (release year, playlist genre) pair
songs_by_genre_year <- song %>%
  mutate(release_year = format(track_album_release_date, "%Y")) %>%
  group_by(release_year, playlist_genre) %>%
  summarise(count = n(), .groups = "drop")

# One line per genre showing the number of songs per release year
songs_by_genre_year %>%
  ggplot(aes(x = as.numeric(release_year), y = count, color = playlist_genre)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Nombre de musiques sorties par genre et par année",
    x = "Année de sortie",
    y = "Nombre de morceaux",
    color = "Genre"
  )

In [None]:
# Mean loudness for each playlist genre
average_loudness_by_genre <- song %>%
  group_by(playlist_genre) %>%
  summarise(mean_loudness = mean(loudness, na.rm = TRUE))

# Bar chart of the mean loudness, genres ordered by decreasing mean loudness
average_loudness_by_genre %>%
  ggplot(aes(x = reorder(playlist_genre, -mean_loudness), y = mean_loudness)) +
  geom_col(fill = "lightblue", color = "black") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Loudness moyenne par genre de musique",
    x = "Genre de musique",
    y = "Loudness moyenne"
  )

In [None]:
# Mean of each audio feature for every playlist genre
average_values_by_genre <- song %>%
  group_by(playlist_genre) %>%
  summarise(
    danceability = mean(danceability, na.rm = TRUE),
    energy = mean(energy, na.rm = TRUE),
    loudness = mean(loudness, na.rm = TRUE),
    # mode is a factor ("minor", "major"): as.numeric() would return the level
    # codes 1/2 and shift the mean by 1. Averaging the "major" indicator gives
    # the mean of the original 0/1 coding, i.e. the share of major-mode songs.
    mode = mean(mode == "major", na.rm = TRUE),
    speechiness = mean(speechiness, na.rm = TRUE),
    acousticness = mean(acousticness, na.rm = TRUE),
    instrumentalness = mean(instrumentalness, na.rm = TRUE),
    liveness = mean(liveness, na.rm = TRUE),
    valence = mean(valence, na.rm = TRUE),
    tempo = mean(tempo, na.rm = TRUE),
    # duration is stored in seconds; report it in milliseconds
    duration_ms = mean(duration_s * 1000, na.rm = TRUE)
  )

# Display the table
print(average_values_by_genre)

In [None]:
# Stacked bar chart (proportions) of the 'key' variable for each playlist genre
song %>%
  ggplot(aes(x = playlist_genre, fill = key)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Répartition de la variable 'Key' par genre de musique",
    x = "Genre de musique",
    y = "Proportion",
    fill = "Key"
  )

In [None]:
# Stacked bar chart (proportions) of the 'mode' variable for each playlist genre
song %>%
  ggplot(aes(x = playlist_genre, fill = mode)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Répartition de la variable 'Mode' par genre de musique",
    x = "Genre de musique",
    y = "Proportion",
    fill = "Mode"
  )