LSE Data Science Institute | ME204 (2024) | Final Project

# 🎼 Exploratory Data Analysis and Data Visualization

<span style="display: inline-block; padding: 0 10px; font-size: 1.15em;line-height: 1.5em; white-space: nowrap; border: 1px solid #E69F25; border-radius: .5em; color: #fcfcfc; background-color: #E69F25; vertical-align: middle;font-weight: 600 !important;">Data Analysis NOTEBOOK</span>

**DATE:** 28 July 2024

**AUTHOR:** [David Cho](https://github.com/LSE-ME204/me204-2024-project-Chodav?tab=readme-ov-file)

-----


# ⚙️ **Setup**


In [1]:
#!brew install cairo freetype libffi libjpeg libpng zlib
# For data manipulation
import pandas as pd
import numpy as np

# For data manipulation
from lets_plot import *
LetsPlot.setup_html()

## **Load the Data Frames**

In [2]:
albums = pd.read_csv('../data/clean/albums.csv')
artists = pd.read_csv('../data/clean/artists.csv')
tracks = pd.read_csv('../data/clean/tracks.csv')

# **💿 1. Album Genre Analysis**

In [3]:
# Calculate mean popularity for each artist
genre_order = albums.groupby('genre')['popularity'].median().sort_values().index.tolist()

# Create a categorical type with ordered artists
albums['genre'] = pd.Categorical(albums['genre'], categories=genre_order, ordered=True)

genre_popularity_violin_plot = (
    ggplot(data = albums, 
       mapping = aes(x = 'genre',
                     y = 'popularity',
                     fill = 'genre')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none') +
        # LABELS
    labs(title='"Indie Pop" has the highest median popularities amongst the four genres',
            subtitle="Violin plots of album popularties amongst genres") +
    theme(axis_text_x=element_text(size=15),
        axis_text_y=element_text(size=17),
        axis_title_x=element_text(size=20),
        axis_title_y=element_text(size=20),
        plot_title=element_text(size=19, face='bold'),
        plot_subtitle=element_text(size=18),
        legend_position='none') +
    ggsize(1000, 500)
    
)

genre_popularity_violin_plot

In [4]:
genre_popularity_density_plot = (
    ggplot(
        data = albums, 
        mapping = aes(
            x = 'popularity', 
            color = 'genre', 
            )
        ) +
        geom_density() +
        xlim(0, 100) +
        scale_x_continuous(name="popularity", expand=[0., 0.05], breaks=np.arange(0, 100, 10)) +

        labs(title='All four music genres have the highest concentrations of popularity at 10',
             subtitle="Classical mussic has the narrowest distribution") + 
        
        theme(axis_text_x=element_text(size=15),
        plot_title=element_text(size=19, face='bold'),
        plot_subtitle=element_text(size=18)) +
        ggsize(1000, 500)
        
)
genre_popularity_density_plot

In [5]:
album_year_genre = albums.groupby(['genre','release_year'])['popularity'].median().reset_index()
album_year_genre = pd.DataFrame(album_year_genre)

album_year_genre_plot = (
    ggplot(album_year_genre, aes(x='release_year', y='popularity', color='genre')) +
    geom_line(size=1.5) +
    scale_color_discrete(name='Genre') +
    scale_x_continuous(breaks=np.arange(1930, 2030, 10)) +
    
    # LABELS
    labs(title='Album Popularity by Release Year & Genre',
         subtitle="No particular trend is apparent",
         x='release year',
         y = 'poplarity') +
    theme(axis_text_x=element_text(size=15),
        axis_text_y=element_text(size=17),
        axis_title_x=element_text(size=20),
        axis_title_y=element_text(size=20),
        plot_title=element_text(size=19, face='bold'),
        plot_subtitle=element_text(size=18)) +
    ggsize(1000, 500)
    
)

album_year_genre_plot

  album_year_genre = albums.groupby(['genre','release_year'])['popularity'].median().reset_index()


# 👷‍♂️ **2. Track Production by Genre over Time**


In [6]:
track_release = albums.groupby(['genre','release_year'])['total_tracks'].sum().reset_index()
track_release = pd.DataFrame(track_release)

  track_release = albums.groupby(['genre','release_year'])['total_tracks'].sum().reset_index()


In [7]:
track_release_plot = (
    ggplot(track_release, aes(x='release_year', y='total_tracks', color='genre')) +
    geom_line(size=1.5) +
    scale_color_discrete(name='Genre') +
    scale_x_continuous(breaks=np.arange(1930, 2030, 10)) +
    
    # LABELS
    labs(title='Track Production Over Time by Genre',
         subtitle="Drastic increases in Classical and Gen Z genres in particular",
         x='release year',
         y = 'total tracks produced') +
    theme(axis_text_x=element_text(size=15),
        axis_text_y=element_text(size=17),
        axis_title_x=element_text(size=20),
        axis_title_y=element_text(size=20),
        plot_title=element_text(size=19, face='bold'),
        plot_subtitle=element_text(size=18)) +
    ggsize(1000, 500)
    
)

track_release_plot

# 🎹 **3. Album Analysis by Artist**

In [8]:
# Calculate median popularity for each artist
artist_order = albums.groupby('artist')['popularity'].median().sort_values().index.tolist()

# Create a categorical type with ordered artists
albums['artist'] = pd.Categorical(albums['artist'], categories=artist_order, ordered=True)

artist_album_popularity_dist = (
    ggplot(data = albums, 
       mapping = aes(x = 'artist',
                     y = 'popularity',
                     fill = 'artist')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none')  +

        # LABELS
        labs(title='Album Popularity by Artists ',
            subtitle="Laufey ranks second amongst artists in genres similar to her",
            x='artist',
            y = 'album popularity') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1800, 1000)
        
)

artist_album_popularity_dist

# 🎧 **4. Track Audio Features Analysis**


In [9]:
artist_tracks = tracks.groupby(['artist'])['popularity'].describe()

# Calculate mean popularity for each artist
artist_order = tracks.groupby('artist')['popularity'].median().sort_values().index.tolist()

# Create a categorical type with ordered artists
tracks['artist'] = pd.Categorical(tracks['artist'], categories=artist_order, ordered=True)

artist_track_popularity_dist = (
    ggplot(data = tracks, 
       mapping = aes(x = 'artist',
                     y = 'popularity',
                     fill = 'artist')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none')  +

        # LABELS
        labs(title='Track Popularity by Artist',
            subtitle="Laufey ranks third in track popularity amongst her musical influences",
            x='artist',
            y = 'track popularity') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1000, 600)
        
)

artist_track_popularity_dist

In [10]:
laufey_tracks = tracks[tracks['artist'] == 'Laufey']
laufey_tracks = laufey_tracks.drop(columns=['tempo', 'loudness'])
laufey_tracks

# Reshape the DataFrame to long format
laufey_tracks_long = laufey_tracks.melt(id_vars=['artist'], value_vars=['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence'],
                          var_name='audio_feature', value_name='value')

#tracks_long = tracks_long[tracks_long['artist'] == 'Laufey']
laufey_audio_feature_dist = (
    ggplot(data = laufey_tracks_long, 
       mapping = aes(x = 'audio_feature',
                     y = 'value',
                     fill = 'audio_feature')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none')  +

        # LABELS
        labs(title='Laufey Audio Feature Distribution',
            x='audio feature',
            y = 'value') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1000, 600)  
        
)
laufey_audio_feature_dist

In [11]:
taylor_swift_tracks = tracks[tracks['artist'] == 'Taylor Swift']
taylor_swift_tracks = taylor_swift_tracks.drop(columns=['tempo', 'loudness'])


# Reshape the DataFrame to long format
taylor_swift_tracks_long = taylor_swift_tracks.melt(id_vars=['artist'], value_vars=['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence'],
                          var_name='audio_feature', value_name='value')


taylor_swift_audio_feature_dist = (
    ggplot(data = taylor_swift_tracks_long, 
       mapping = aes(x = 'audio_feature',
                     y = 'value',
                     fill = 'audio_feature')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none')  +
        
        # LABELS
        labs(title='Taylor Swift Audio Feature Distribution',
            x='audio feature',
            y = 'value') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1000, 600)  
)

taylor_swift_audio_feature_dist

In [12]:
chet_baker_tracks = tracks[tracks['artist'] == 'Chet Baker']
chet_baker_tracks = chet_baker_tracks.drop(columns=['tempo', 'loudness'])


# Reshape the DataFrame to long format
chet_baker_tracks_long = chet_baker_tracks.melt(id_vars=['artist'], value_vars=['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence'],
                          var_name='audio_feature', value_name='value')


chet_baker_audio_features_dist = (
    ggplot(data = chet_baker_tracks_long, 
       mapping = aes(x = 'audio_feature',
                     y = 'value',
                     fill = 'audio_feature')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none')  +
        
        # LABELS
        labs(title='Chet Baker Audio Feature Distribution',
            x='audio feature',
            y = 'value') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1000, 600)  
)

chet_baker_audio_features_dist

In [13]:
jazz_tracks = tracks[tracks['artist'].isin(['Chet Baker', 'Ella Fitzgerald', 'Billie Holiday', 'Nora Jones']) ]
jazz_tracks = jazz_tracks.drop(columns=['tempo', 'loudness'])


# Reshape the DataFrame to long format
jazz_tracks_long = jazz_tracks.melt(id_vars=['artist'], value_vars=['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence'],
                          var_name='audio_feature', value_name='value')


jazz_tracks_audio_features_dist = (
    ggplot(data = jazz_tracks_long, 
       mapping = aes(x = 'audio_feature',
                     y = 'value',
                     fill = 'audio_feature')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none')  +
        
        # LABELS
        labs(title='Jazz Influences Audio Feature Distribution',
            x='audio feature',
            y = 'value') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1000, 600)  
)

jazz_tracks_audio_features_dist

In [14]:
classical_tracks = tracks[tracks['artist'].isin(['Frédéric Chopin', 'Ravel', 'Felix Mendelssohn', 'Sergei Rachmaninoff']) ]
classical_tracks = classical_tracks.drop(columns=['tempo', 'loudness'])


# Reshape the DataFrame to long format
classical_tracks_long = classical_tracks.melt(id_vars=['artist'], value_vars=['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence'],
                          var_name='audio_feature', value_name='value')


classical_audio_features_dist = (
    ggplot(data = classical_tracks_long, 
       mapping = aes(x = 'audio_feature',
                     y = 'value',
                     fill = 'audio_feature')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none')  +
        
        # LABELS
        labs(title='Classical Influences Audio Feature Distribution',
            x='audio feature',
            y = 'value') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1000, 600)  
)

classical_audio_features_dist

In [15]:
pop_tracks = tracks[tracks['artist'].isin(['Adele', 'Taylor Swift|']) ]
pop_tracks = pop_tracks.drop(columns=['tempo', 'loudness'])


# Reshape the DataFrame to long format
pop_tracks_long = pop_tracks.melt(id_vars=['artist'], value_vars=['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence'],
                          var_name='audio_feature', value_name='value')


pop_audio_features_dist = (
    ggplot(data = pop_tracks_long, 
       mapping = aes(x = 'audio_feature',
                     y = 'value',
                     fill = 'audio_feature')
       ) +
        geom_violin(alpha=0.5) +
        geom_boxplot(alpha = 0.5) +
        guides(fill = 'none')  +
        
        # LABELS
        labs(title='Pop Influences Audio Feature Distribution',
            x='audio feature',
            y = 'value') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1000, 600)  
)

pop_audio_features_dist

## **Mean & Standard Deviation Analysis**

In [16]:
# Calculate the mean of each feature for each artist
features = ['danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']
means = tracks.groupby('artist')[features].mean().reset_index()

  means = tracks.groupby('artist')[features].mean().reset_index()


In [17]:
# Melt the DataFrame to long format
features = ['danceability', 'acousticness', 'energy', 'instrumentalness', 'speechiness', 'liveness', 'valence']

means_long = pd.melt(means, id_vars=['artist'], 
                   value_vars=features,
                   var_name='feature', value_name='score')

In [18]:
laufey_chet_baker_comparision_chart = (
    ggplot(means_long[means_long['artist'].isin(['Laufey', 'Chet Baker'])]) 
    + geom_area(aes(x='feature', y='score', color = 'artist', fill='artist'),
                alpha = 0.3, 
                flat=True)                  
    + geom_point(aes(x='feature', y='score')) 
    + scale_x_discrete(labels=features) 
    + coord_polar() +
        # LABELS
        labs(title='Radar Chart of Audio Features',
                subtitle='Laufey vs. Chet Baker'
                ) +
        theme(legend_position='bottom',
                axis_text_x=element_text(size=15),
                axis_text_y=element_text(size=17),
                axis_title_x=element_text(size=20),
                axis_title_y=element_text(size=20),
                plot_title=element_text(size=19, face='bold'),
                plot_subtitle=element_text(size=18)) +
        ggsize(600, 600) 
)

laufey_chet_baker_comparision_chart

In [19]:
laufey_rachmaninoff_comparision_chart = (
    ggplot(means_long[means_long['artist'].isin(['Laufey', 'Sergei Rachmaninoff'])]) 
    + geom_area(aes(x='feature', y='score', color = 'artist', fill='artist'),
                alpha = 0.3, 
                flat=True)                  
    + geom_point(aes(x='feature', y='score')) 
    + scale_x_discrete(labels=features) 
    + coord_polar() +
        # LABELS
        labs(title='Radar Chart of Audio Features',
                subtitle='Laufey vs. Sergei Rachmaninoff'
                ) +
        theme(legend_position='bottom',
                axis_text_x=element_text(size=15),
                axis_text_y=element_text(size=17),
                axis_title_x=element_text(size=20),
                axis_title_y=element_text(size=20),
                plot_title=element_text(size=19, face='bold'),
                plot_subtitle=element_text(size=18)) +
        ggsize(600, 600) 
)

laufey_rachmaninoff_comparision_chart

In [20]:
artist_tracks_audio_feature_charts = (
    ggplot(means_long)
    + geom_area(aes(x='feature', y='score', color='artist', fill='artist'), alpha=0.3, flat=True)
    + geom_point(aes(x='feature', y='score', color='artist'))
    + scale_x_discrete(labels=features)
    + coord_polar()
    + facet_wrap('artist', scales='free')  # Facet by 'artist' column
    + theme(legend_position='bottom',
            legend_title=element_blank(),
            plot_title=element_text(size=19, face='bold'),
            legend_background=element_rect(fill='white', color='white'),
            axis_title_x=element_blank(),  
            axis_title_y=element_blank(), 
            panel_grid=element_blank())  
    + labs(title='Radar Chart of Audio Features by Artist') 
)

artist_tracks_audio_feature_charts

## **Standard Deviation Analysis**

In [21]:
# Calculate the mean of each feature for each artist
features = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']
tracks_std = tracks.groupby('artist')[features].std().reset_index()

# Melt the DataFrame to long format
tracks_std_long = tracks_std.melt(id_vars=['artist'], var_name='feature', value_name='value')

  tracks_std = tracks.groupby('artist')[features].std().reset_index()


In [22]:
audio_features_std_by_artist_plot = (ggplot(tracks_std_long, aes(x='feature', y='value', fill='artist'))
     + geom_bar(stat='identity', position='dodge')
     + facet_wrap('artist', scales='free_y')
     + labs(title='Bar Plots of Audio Features by Artist', x='Artist', y='Value')
     + # LABELS
        labs(title='Bar Plots of Audio Feature Standard Deviations',
             subtitle='By Artists',
            x='',
            y = '') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18)) +
        ggsize(1400, 800)
)

audio_features_std_by_artist_plot


In [26]:
audio_features_std_by_audio_feature = (
    ggplot(tracks_std_long, aes(x='artist', y='value', fill='artist'))
     + geom_bar(stat='identity', position='dodge', alpha=0.8) 
     + facet_wrap('feature', scales='free_y')
     + labs(title='Bar Plots of Audio Features by Artist', x='Artist', y='Value') + 
     # LABELS
        labs(title='Bar Plots of Audio Feature Standard Deviations',
             subtitle='By Feature',
            x='',
            y = '') +
        theme(axis_text_x=element_text(size=15),
            axis_text_y=element_text(size=17),
            axis_title_x=element_text(size=20),
            axis_title_y=element_text(size=20),
            plot_title=element_text(size=19, face='bold'),
            plot_subtitle=element_text(size=18),
            legend_position='none') +
        ggsize(1200, 700)

)

audio_features_std_by_audio_feature

## **Similarities (by Subtraction)**

In [24]:
# Extract Laufey's feature values
laufey_features = means[means['artist'] == 'Laufey'].iloc[0]

# Calculate absolute differences
abs_diff_df = means.copy()
for feature in ['danceability', 'energy', 'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']:
    abs_diff_df[feature] = (means[feature] - laufey_features[feature]).abs()

# Remove Laufey from the new DataFrame
abs_diff_df = abs_diff_df[abs_diff_df['artist'] != 'Laufey']

# Display the new DataFrame
abs_diff_df

# Melt the DataFrame to long format
long_df = abs_diff_df.melt(id_vars=['artist'], var_name='feature', value_name='value')

In [25]:
audio_features_differences_dist = (ggplot(long_df, aes(x='artist', y='value', fill='artist'))
     + geom_bar(stat='identity', position='dodge', alpha=0.8)  
     + facet_wrap('feature', scales='free_y')  
     
     # LABELS
     + labs(title='Bar Plots of Audio Features Absolute Differences from Laufey', 
            x='Artist', 
            y='Value',
            subtitle='By Audio Feature')
     + theme(axis_text_x=element_text(hjust=1),
             legend_position='none',
             plot_title=element_text(size=19, face='bold') 
             ) +
        ggsize(1200, 700) 
)

audio_features_differences_dist