# Supervised Group Project

## Data Structure

### Data Types

In [23]:
import pandas as pd
import numpy as np
# display/Markdown are imported from the public IPython.display module; importing
# them from IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, Markdown

# Variable legend: one row per variable with its description.
# NOTE(review): the 'year' / 'year.1' headers suggest the sheet has no header row and
# that its first entry (year) is being consumed as the header - confirm against the workbook.
summary = pd.read_excel('Songs_2025.xlsx', sheet_name='legend')
summary = summary.rename(columns = {'year': 'Variable', 'year.1': 'Description'})

# Song-level data used throughout the notebook.
df = pd.read_excel('Songs_2025.xlsx', sheet_name='spotify songs')

# Look each dtype up by variable name rather than by position, so the table stays
# correct if the legend rows and the data columns are ever ordered differently.
# A variable with no matching column shows up as NaN instead of a wrong dtype.
summary['Data Type'] = summary['Variable'].str.strip().map(df.dtypes)

# Render the legend (with dtypes) as a markdown table, followed by a horizontal rule.
md_table = summary.to_markdown(index=False)
display(Markdown(md_table))
display(Markdown('---'))

  from IPython.core.display import display, Markdown


| Variable          | Description                                                                                                                                                                  | Data Type   |
|:------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------|
| track_name        | track name                                                                                                                                                                   | object      |
| track_popularity  | track popularity from 0 to 100                                                                                                                                               | int64       |
| album             | album the track appears on                                                                                                                                                   | object      |
| artist_name       | artist name                                                                                                                                                                  | object      |
| artist_genres     | list of the Genres the artist is associated with                                                                                                                             | object      |
| artist_popularity | artist popularity from 0 to 100, being the maximum                                                                                                                           | int64       |
| danceability      | a value of 0.0 is least danceable and 1.0 is most danceable                                                                                                                  | float64     |
| energy            | a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity, being 1.0 the maximum                                                               | float64     |
| key               | The key the track is in. Integers map to pitches using standard Pitch Class notation. E.g. 0 = C,  1 = C♯/D♭,  2 = D  and so on                                              | float64     |
| loudness          | The overall loudness of a track in decibels (dB)                                                                                                                             | float64     |
| mode              | Modality (major or minor) of a track. Major = 1 and minor = 0                                                                                                                | float64     |
| speechiness       | Detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. | float64     |
| acousticness      | A confidence measure from 0.0 to 1.0 of whether the track is acoustic                                                                                                        | float64     |
| instrumentalness  | Predicts whether a track contains no vocals                                                                                                                                  | float64     |
| liveness          | Detects the presence of an audience in the recording                                                                                                                         | float64     |
| valence           | a measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track                                                                                            | float64     |
| tempo             | The overall estimated tempo of a track in beats per minute (BPM)                                                                                                             | float64     |
| duration_ms       | The duration of the track in milliseconds                                                                                                                                    | float64     |

---

In [27]:
# Frequency of each value in the derived 'dominant_genre_max' column.
# NOTE(review): that column is only created further down, in the cell that applies
# clean_and_extract_genre, so on a fresh top-to-bottom run this cell would raise a
# KeyError - it should sit after that cell.
df['dominant_genre_max'].value_counts()

dominant_genre_max
pop             944
hip             211
rap             209
rock            129
canadian         53
               ... 
afrofuturism      1
drill             1
bedroom           1
poptimism         1
chileno           1
Name: count, Length: 131, dtype: int64

### Handling Null Values

In [26]:
# Null count per column, shown as a one-column table
df.isnull().sum().to_frame('Number_of_Nulls')

Unnamed: 0,Number_of_Nulls
year,0
track_name,0
track_popularity,0
album,0
artist_name,0
artist_genres,0
artist_popularity,0
danceability,1
energy,1
key,1


In [23]:
# Keep every row that has a null in at least one column
missing_value = df.loc[df.isna().any(axis = 'columns')]
display(missing_value)
# The note below is static text; the affected track can be read from the table displayed above.
display(Markdown('Considering there is only one track with virtually all values missing and an average track popularity, it might make sense to just drop the value and continue with the analysis.'))

Unnamed: 0,year,track_name,track_popularity,album,artist_name,artist_genres,artist_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
488,2004,These Words,68,Unwritten,Natasha Bedingfield,['dance pop'; 'pop'; 'post-teen pop'],64,,,,,,,,,,,,


Considering there is only one track with virtually all values missing and an average track popularity, it might make sense to just drop the value and continue with the analysis.

In [24]:
# Dropping the null row(s)
# Drop by every index label captured in `missing_value` instead of `.index.item()`,
# which raises unless exactly one row is affected. errors='ignore' skips labels that
# are already gone, so re-running this cell does not raise a KeyError.
df = df.drop(index = missing_value.index, errors = 'ignore')

# Re-check the null count per column after the drop
df.isnull().sum().to_frame('Number_of_Nulls')

Unnamed: 0,Number_of_Nulls
year,0
track_name,0
track_popularity,0
album,0
artist_name,0
artist_genres,0
artist_popularity,0
danceability,0
energy,0
key,0


### Handling Artist Genres

Two different approaches are used. The first takes the first value in the list of artist genres and assumes that it is the most important, creating a new column called **dominant_genre_first**. 
  
`Example: ['pop', 'rock'] = pop`  
  
The second approach splits every genre into individual words and takes the word that appears most often across the list, creating a new column called **dominant_genre_max**. If the list contains only one genre, it takes the second word when the genre has more than one word, and the word itself when the genre is a single word.  
  
`Example: ['pop', 'rock', 'country pop'] = pop`   
`Example: ['british soul'] = soul`  
`Example: ['country'] = country`  

In [32]:
# Dominant Genre based on first value in list
import re

def extract_first_genre(genre_str):
    """
    Return the first genre from a stringified, ';'-delimited list of genres.

    Brackets and single quotes are removed from the string, the remainder is split
    on ';', and the first entry is returned with surrounding whitespace stripped.
    (Assumes the first genre in the list is the most important.)

    Parameters
    ----------
    genre_str : str
        Genre list in string form, e.g. "['dance pop'; 'pop']".

    Returns
    -------
    str
        The first genre in the list, or 'Unknown' when the value is not a string
        (e.g. NaN) or the list holds no genre (e.g. '[]'). 'Unknown' is the same
        placeholder that clean_and_extract_genre returns for those inputs.
    """
    # Non-string cells (NaN, None) cannot be passed to re.sub
    if not isinstance(genre_str, str):
        return 'Unknown'

    cleaned_genre = re.sub(r"[\[\]']", "", genre_str).strip()
    first_genre = cleaned_genre.split(';')[0].strip()

    # An empty list leaves an empty string after cleaning
    return first_genre or 'Unknown'


# One dominant genre per track, taken from the first entry of its genre list
df['dominant_genre_first'] = df['artist_genres'].map(extract_first_genre)

In [33]:
# Dominant value based on most occurring word
import re

def clean_and_extract_genre(genre_str):
    """
    Pick one dominant genre word from a stringified, ';'-delimited list of genres.

    Brackets and single quotes are removed, the string is split on ';' into genres,
    and every genre is split on whitespace into words.

    - Not a string, or no words at all: returns 'Unknown'.
    - Exactly one genre: returns its second word, or its only word if it has just one.
    - Several genres: returns the word that occurs most often across all genres;
      when counts tie, the word that appears first in the list wins.
    """
    # Anything that is not a string carries no genre information
    if not isinstance(genre_str, str):
        return 'Unknown'

    stripped = re.sub(r"[\[\]']", "", genre_str)
    genres = stripped.split(';')

    # Flatten the genres into their individual words, keeping list order
    tokens = []
    for genre in genres:
        tokens.extend(genre.split())

    # Empty list: nothing to choose from
    if len(tokens) == 0:
        return 'Unknown'

    # Single genre: prefer its second word, fall back to the only word
    if len(genres) == 1:
        if len(tokens) > 1:
            return tokens[1]
        return tokens[0]

    # Several genres: tally the words in first-seen order, then take the most
    # frequent one (max keeps the earliest word on ties)
    tally = {}
    for token in tokens:
        tally[token] = tally.get(token, 0) + 1
    return max(tally, key=tally.get)

# One dominant genre per track, taken from the most frequent word in its genre list
df['dominant_genre_max'] = df['artist_genres'].map(clean_and_extract_genre)

In [35]:
# Preview the first rows (now including the derived genre columns), then a horizontal rule
display(df.head(), Markdown('---'))

Unnamed: 0,year,track_name,track_popularity,album,artist_name,artist_genres,artist_popularity,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,dominant_genre_max,dominant_genre_first
0,2000,7 Days,73,Born to Do It,Craig David,['british soul'],62,0.659,0.812,4.0,...,0.0,0.0487,0.23,0.0,0.0951,0.888,83.014,235133.0,soul,british soul
1,2000,Absolutely (Story of a Girl) - Radio Mix,69,The Madding Crowd,Nine Days,['pop rock'],49,0.481,0.94,7.0,...,1.0,0.066,0.000891,0.0,0.0939,0.66,96.493,189333.0,rock,pop rock
2,2000,Against All Odds (Take A Look at Me Now) - Mar...,61,Against All Odds (Take A Look at Me Now) EP,Mariah Carey,['dance pop'; 'pop'; 'urban contemporary'],76,0.542,0.498,1.0,...,1.0,0.0299,0.49,0.0,0.101,0.218,117.763,201933.0,pop,dance pop
3,2000,All The Small Things,84,Enema Of The State,blink-182,['alternative metal'; 'modern rock'; 'pop punk...,75,0.434,0.897,0.0,...,1.0,0.0488,0.0103,0.0,0.612,0.684,148.726,167067.0,punk,alternative metal
4,2000,Amazed,72,Lonely Grill,Lonestar,['contemporary country'; 'country'; 'country r...,57,0.561,0.543,1.0,...,1.0,0.0324,0.26,0.0,0.124,0.243,139.803,240867.0,country,contemporary country


---

### Unique Values

In [25]:
# Report the size of the working frame.
# NOTE(review): the saved output below (19 columns) matches the frame before the two
# dominant_genre_* columns are added in the cells above - re-run top to bottom to refresh it.
rows, columns = df.shape 
print(f'Rows: {rows}, Columns: {columns}')

Rows: 2299, Columns: 19


In [29]:
# Number of distinct values in every column, as a one-column table
unique_values = df.nunique().to_frame('Unique Values')
display(Markdown('---'), unique_values)

---

Unnamed: 0,Unique Values
year,23
track_name,2121
track_popularity,71
album,1663
artist_name,891
artist_genres,698
artist_popularity,62
danceability,585
energy,641
key,12


In [91]:
# Frequency of each raw artist_genres string (one entry per distinct genre list)
df['artist_genres'].value_counts()

artist_genres
['pop']                                                              135
['dance pop'; 'pop']                                                 129
['canadian hip hop'; 'canadian pop'; 'hip hop'; 'pop rap'; 'rap']     32
['dance pop'; 'pop'; 'pop rap']                                       29
['barbadian pop'; 'pop'; 'urban contemporary']                        27
                                                                    ... 
['bedroom pop']                                                        1
['hip hop'; 'north carolina hip hop'; 'pop rap'; 'rap'; 'trap']        1
['melodic rap'; 'slap house']                                          1
['pop rap'; 'rhode island rap']                                        1
['nz pop']                                                             1
Name: count, Length: 698, dtype: int64

### Basic Statistics

In [94]:
# Summary statistics, transposed so that each variable is a row.
# NOTE(review): in the saved output loudness spans -56 to 132, which is unusual for a
# value the legend describes as decibels - verify the scale of that column.
df.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,2299.0,2011.003045,6.634528,2000.0,2005.0,2011.0,2017.0,2022.0
track_popularity,2299.0,70.944759,12.294047,0.0,66.0,72.0,79.0,100.0
artist_popularity,2299.0,72.873423,12.180506,29.0,65.0,74.0,82.0,100.0
danceability,2299.0,0.660116,0.141137,0.162,0.572,0.671,0.7595,0.975
energy,2299.0,0.693047,0.164838,0.0519,0.586,0.712,0.82,0.999
key,2299.0,5.277947,3.628494,0.0,2.0,5.0,8.0,11.0
loudness,2299.0,73.00696,15.643937,-56.0,63.0,74.0,83.0,132.0
mode,2299.0,0.598521,0.490304,0.0,0.0,1.0,1.0,1.0
speechiness,2299.0,0.097795,0.092445,0.0225,0.038,0.0568,0.1155,0.576
acousticness,2299.0,0.157689,0.203844,1.3e-05,0.0165,0.0689,0.223,0.978


In [92]:
# Preview the first rows of the working frame
df.head()

Unnamed: 0,year,track_name,track_popularity,album,artist_name,artist_genres,artist_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,2000,7 Days,73,Born to Do It,Craig David,['british soul'],62,0.659,0.812,4.0,87.0,0.0,0.0487,0.23,0.0,0.0951,0.888,83.014,235133.0
1,2000,Absolutely (Story of a Girl) - Radio Mix,69,The Madding Crowd,Nine Days,['pop rock'],49,0.481,0.94,7.0,71.0,1.0,0.066,0.000891,0.0,0.0939,0.66,96.493,189333.0
2,2000,Against All Odds (Take A Look at Me Now) - Mar...,61,Against All Odds (Take A Look at Me Now) EP,Mariah Carey,['dance pop'; 'pop'; 'urban contemporary'],76,0.542,0.498,1.0,80.0,1.0,0.0299,0.49,0.0,0.101,0.218,117.763,201933.0
3,2000,All The Small Things,84,Enema Of The State,blink-182,['alternative metal'; 'modern rock'; 'pop punk...,75,0.434,0.897,0.0,69.0,1.0,0.0488,0.0103,0.0,0.612,0.684,148.726,167067.0
4,2000,Amazed,72,Lonely Grill,Lonestar,['contemporary country'; 'country'; 'country r...,57,0.561,0.543,1.0,92.0,1.0,0.0324,0.26,0.0,0.124,0.243,139.803,240867.0


EDA / Segmentation (14)

Predictive Model / Model Evaluation (19)

Report (22)