In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw dataset; the first CSV column is used as the row index.
DATA_PATH = "data/music_genre.csv"
df = pd.read_csv(DATA_PATH, index_col=0)

df

## Data description

- instance_id - id of a song
- artist_name - the name of the artist
- track_name - the name of the song
- popularity - value between 0 and 100, with 100 being the most popular. The popularity is calculated by an algorithm and is based, for the most part, on the total number of plays the track has had and how recent those plays are.
- acousticness - a confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.
- danceability - a value between 0.0 and 1.0 of how suitable the track is for dancing
- duration_ms - duration of a track in milliseconds
- energy - a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity.
- instrumentalness - The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content.
- key - the key of the track (C, C# ... B)
- liveness - detects the presence of an audience in the recording. From 0.0 to 1.0
- loudness - measure of how loud the track is. From -60 (silence) to 0 (max loudness without distortion)
- mode - mode of the track (Minor, Major)
- speechiness - detects the presence of spoken words in a track.
- tempo - tempo of the song in bpm
- obtained_date - date
- valence - A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track.
- music_genre - genre of the track (target)

more info -> https://developer.spotify.com/documentation/web-api/reference/get-audio-features



## Removing the missing values

We have so many rows that we are simply going to drop the ones with missing values.

in the artist_name they are denoted as 'empty_field'

In [None]:
# Rows whose artist name is the 'empty_field' placeholder.
df.loc[df['artist_name'].eq('empty_field')]

in the tempo they are denoted as '?'

In [None]:
# Rows whose tempo is the '?' placeholder.
df.loc[df['tempo'].eq('?')]

In [None]:
# Turn each column's placeholder marker into a real NaN so that a single
# dropna() call removes every incomplete row.
missing_markers = {'artist_name': 'empty_field', 'tempo': '?'}
for column, marker in missing_markers.items():
    df[column] = df[column].replace(marker, np.nan)

df = df.dropna()

Some songs have a duration of -1 ms, so let's use IterativeImputer to fill in those values.

In [None]:
# enable_iterative_imputer must be imported before IterativeImputer: the
# estimator is still experimental in scikit-learn.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# -1 is the marker used for an unknown duration; convert it to NaN so the
# imputer treats it as missing.
df['duration_ms'] = df['duration_ms'].replace(-1, np.nan)

# IterativeImputer predicts each feature from the *other* features. Fitted on
# the single 'duration_ms' column it has no predictors and just returns the
# initial mean fill, so fit it on every numeric column instead.
# Only 'duration_ms' is written back; the other columns are predictors only.
impute_cols = df.select_dtypes(include='number').columns

imputer = IterativeImputer(estimator=BayesianRidge(), random_state=42)

imputed = pd.DataFrame(
    imputer.fit_transform(df[impute_cols]),
    columns=impute_cols,
    index=df.index,
)
df['duration_ms'] = imputed['duration_ms']

df

Dropping the obtained_date

In [None]:
# Remove the 'obtained_date' column.
df = df.drop(columns='obtained_date')

df

and finally change the tempo into a float

In [None]:
# The '?' placeholders were removed earlier, so the remaining tempo values
# can be cast straight to float.
df['tempo'] = df['tempo'].astype(float)

df

## Summary of some columns

In [None]:
# Split the column labels by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical = df.select_dtypes(include=object).columns
numerical = df.select_dtypes(exclude=object).columns

for column_group in (numerical, categorical):
    print(column_group)

In [None]:
# Summary statistics of the numerical columns.
df.loc[:, numerical].describe()

In [None]:
# Summary statistics of the categorical (object) columns.
df.loc[:, categorical].describe()

In [None]:
def df_proportions(name):
    """Return the share of rows per distinct value of ``df[name]``, largest first.

    Reads the notebook-level ``df``; ``name`` is the column label to summarise.
    """
    shares = df[name].value_counts(normalize=True)
    return shares.sort_values(ascending=False)

# Share of rows per artist, largest share first.
df_proportions('artist_name')

In [None]:
# Share of rows per track name, largest share first.
df_proportions('track_name')

In [None]:
# Share of rows per musical key, largest share first.
df_proportions('key')

In [None]:
# Share of rows per genre (the target), largest share first.
df_proportions('music_genre')

In [None]:
# Display the labels of the non-object columns selected above.
numerical

## Data visualization

In [None]:
# One box plot per numerical feature, split by genre.
for feature in numerical:
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.boxplot(data=df, x='music_genre', y=feature, ax=ax)
    plt.show()

In [None]:
# One violin plot per numerical feature, split by genre.
for feature in numerical:
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.violinplot(data=df, x='music_genre', y=feature, ax=ax)
    plt.show()

# Emotional Characteristics of Musical Keys

## C

- **C Major** – Innocently Happy
- **C Minor** – Innocently Sad, Love-Sick

## C♯ / D♭

- **C♯ Minor** – Despair, Wailing, Weeping
- **D♭ Major** – Grief, Depressive

## D

- **D Major** – Triumphant, Victorious War-Cries
- **D Minor** – Serious, Pious, Ruminating

## D♯ / E♭

- **D♯ Minor** – Deep Distress, Existential Angst
- **E♭ Major** – Cruel, Hard, Yet Full of Devotion

## E

- **E Major** – Quarrelsome, Boisterous, Incomplete Pleasure
- **E Minor** – Effeminate, Amorous, Restless

## F

- **F Major** – Furious, Quick-Tempered, Passing Regret
- **F Minor** – Obscure, Plaintive, Funereal

## F♯ / G♭

- **F♯ Major** – Conquering Difficulties, Sighs of Relief
- **F♯ Minor** – Gloomy, Passionate Resentment

## G

- **G Major** – Serious, Magnificent, Fantasy
- **G Minor** – Discontent, Uneasiness

## A♭

- **A♭ Major** – Death, Eternity, Judgement
- **A♭ Minor** – Grumbling, Moaning, Wailing

## A

- **A Major** – Joyful, Pastoral, Declaration of Love
- **A Minor** – Tender, Plaintive, Pious

## B♭

- **B♭ Major** – Joyful, Quaint, Cheerful
- **B♭ Minor** – Terrible, the Night, Mocking

## B

- **B Major** – Harsh, Strong, Wild, Rage
- **B Minor** – Solitary, Melancholic, Patience


In [None]:
genres = df['music_genre'].unique()

# Fixed colours so that both modes look the same in every figure.
mode_palette = {'Minor': 'blue', 'Major': 'orange'}

# Sorted key labels give every plot the same x-axis order.
keys_order = np.sort(df['key'].unique())

# One key/mode count plot per genre.
for genre in genres:
    subset = df.loc[df['music_genre'].eq(genre)]

    fig, ax = plt.subplots(figsize=(12, 4))
    sns.countplot(
        data=subset,
        x='key',
        hue='mode',
        palette=mode_palette,
        order=keys_order,
        ax=ax,
    )
    ax.set_title(genre)
    plt.show()

In [None]:
# Key/mode counts over the whole dataset, using the same palette and key order
# as the per-genre plots.
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(
    data=df,
    x='key',
    hue='mode',
    palette=mode_palette,
    order=keys_order,
    ax=ax,
)
ax.set_title("Keys-mode hist")
plt.show()

Major mode dominates the histogram

In [None]:
# Histograms of the columns df.hist can plot, 20 bins each.
df.hist(bins=20, figsize=(12, 12))

There are more of the saddest songs (lowest valence) than of the happiest songs (highest valence).

In [None]:
# Display the frame as it stands before the categorical columns are encoded.
df

## Encoding categorical columns (except artist_name and track_name)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the key labels, then replace the labels with their
# integer codes; le_keys is kept so the codes can be mapped back later.
le_keys = LabelEncoder().fit(df['key'])
df['key'] = le_keys.transform(df['key'])

df

In [None]:
# Binary encoding of the mode column; Series.map leaves NaN for any value
# missing from the mapping.
mode_encoding = dict(Minor=0, Major=1)

df = df.assign(mode=df['mode'].map(mode_encoding))

df

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense output returned as a DataFrame, so it can be concatenated directly.
ohe = OneHotEncoder(sparse_output=False)
ohe.set_output(transform='pandas')

encoded_genres = ohe.fit_transform(df[['music_genre']])

# Swap the original genre column for its one-hot columns.
df = pd.concat([df.drop(columns=['music_genre']), encoded_genres], axis=1)
df

## Heatmap

In [None]:
# From here on `numerical` is a DataFrame holding the non-object columns
# (earlier cells used the same name for the column labels only).
numerical = df.select_dtypes(exclude=object)
corr_matrix = numerical.corr()

plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap="Greens", fmt=".2f")

In [None]:
# Regression plot for every ordered pair of distinct columns whose absolute
# Pearson correlation is at least 0.2. Each pair appears twice, once per axis
# orientation.

# Compute all pairwise correlations once instead of three times per pair.
corr_matrix = numerical.corr()

for col in corr_matrix.columns:
    for col1 in corr_matrix.columns:
        # Compare labels by value; `is` tests object identity, which is not
        # guaranteed for equal strings.
        if col == col1:
            continue

        pair_corr = float(corr_matrix.loc[col, col1])
        if abs(pair_corr) >= 0.2:
            plt.figure(figsize=(15,8))
            sns.regplot(x=col,y=col1,data=numerical, line_kws={"color": "Black"})
            plt.title(f"{col} x {col1} Correlation: {round(pair_corr,2)}")
            plt.show()