__COMP47970 Project__ • Anthony Salib 20341603

---

# Data Visualisation of the 'TikTok-ification' of Music

The goal of this assignment is to use a dataset obtained from the [Spotify API](https://developer.spotify.com/documentation/web-api/) to investigate the factors that contribute to the 'TikTok-ification' of music.

This notebook covers the **Task of Visualising** the data we obtained. 

---

In [27]:
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import nbformat
pd.options.mode.chained_assignment = None  

In [28]:
# Load the cleaned track table and make sure the folder that later cells
# write their JSON exports into exists.
tracks_csv_path = Path('data') / 'cleaned_tracks.csv'
df = pd.read_csv(tracks_csv_path)

output_dir = Path('vega_data')
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (9611, 25)


Unnamed: 0,Track ID,Track name,Artist name,Popularity,Release year,Category,danceability,energy,key,loudness,...,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,duration
0,6DshyLFOY859M1ERk0h54h,New Release,Killing Time,22,1989,New Releases,0.322,0.714,4,-13.477,...,0.378,125.286,audio_features,6DshyLFOY859M1ERk0h54h,spotify:track:6DshyLFOY859M1ERk0h54h,https://api.spotify.com/v1/tracks/6DshyLFOY859...,https://api.spotify.com/v1/audio-analysis/6Dsh...,101960,4,1.7
1,4DZghpw50ZnO3ckfDuNkft,New Religion,The Heydaze,53,2017,New Releases,0.537,0.863,7,-4.46,...,0.484,97.02,audio_features,4DZghpw50ZnO3ckfDuNkft,spotify:track:4DZghpw50ZnO3ckfDuNkft,https://api.spotify.com/v1/tracks/4DZghpw50ZnO...,https://api.spotify.com/v1/audio-analysis/4DZg...,203587,4,3.39
2,0KbTsCQkdNGnkrea1vA4eG,Morning Has Broken (New Release) (with Diana K...,Art Garfunkel,43,2002,New Releases,0.253,0.206,2,-14.276,...,0.228,105.48,audio_features,0KbTsCQkdNGnkrea1vA4eG,spotify:track:0KbTsCQkdNGnkrea1vA4eG,https://api.spotify.com/v1/tracks/0KbTsCQkdNGn...,https://api.spotify.com/v1/audio-analysis/0KbT...,175227,3,2.92
3,0lmRKkc33g1N9fWW9nZPVc,"New Recording 12, Jan 3, 2020",Post Malone,46,2022,New Releases,0.477,0.162,9,-15.34,...,0.211,106.144,audio_features,0lmRKkc33g1N9fWW9nZPVc,spotify:track:0lmRKkc33g1N9fWW9nZPVc,https://api.spotify.com/v1/tracks/0lmRKkc33g1N...,https://api.spotify.com/v1/audio-analysis/0lmR...,92718,3,1.55
4,3eluHrktBbrs9qCnfRcJkm,Relieve Him of His Wand / Newt Releases the Th...,James Newton Howard,31,2016,New Releases,0.117,0.0799,5,-18.385,...,0.0341,85.353,audio_features,3eluHrktBbrs9qCnfRcJkm,spotify:track:3eluHrktBbrs9qCnfRcJkm,https://api.spotify.com/v1/tracks/3eluHrktBbrs...,https://api.spotify.com/v1/audio-analysis/3elu...,753202,4,12.55


In [29]:
# Restrict the analysis to tracks released from 2010 onwards and expose a
# single `duration_min` column (track length in minutes) for the later cells.

# Values that cannot be parsed as numbers become NaN and are therefore
# excluded by the `>= 2010` filter below.
df['Release year'] = pd.to_numeric(df['Release year'], errors='coerce')

recent_df = df[df['Release year'] >= 2010].copy()

if 'duration' in recent_df.columns:
    # `duration` is already expressed in minutes, so reuse it as-is
    recent_df['duration_min'] = recent_df['duration']
elif 'duration_ms' in recent_df.columns:
    # otherwise convert milliseconds to minutes
    recent_df['duration_min'] = recent_df['duration_ms'] / 60000
else:
    # Fail here with a clear message instead of an opaque KeyError on
    # 'duration_min' in a later cell.
    raise KeyError("Expected a 'duration' or 'duration_ms' column in the tracks data")

print(f"Year range: {recent_df['Release year'].min()} to {recent_df['Release year'].max()}")

Year range: 2010 to 2024


In [30]:
# Per-release-year summary: duration mean/median/std, number of tracks and
# mean popularity.
yearly_stats = (
    recent_df
    .groupby('Release year')
    .agg(
        mean_duration=('duration_min', 'mean'),
        median_duration=('duration_min', 'median'),
        std_duration=('duration_min', 'std'),
        song_count=('Track ID', 'count'),
        mean_popularity=('Popularity', 'mean'),
    )
    .reset_index()
)

# Export as records-oriented JSON (one object per release year).
yearly_stats_path_json = output_dir / 'yearly_stats.json'
yearly_stats.to_json(yearly_stats_path_json, orient='records', indent=2)
print(f"Saved yearly_stats data to: {yearly_stats_path_json}")

# The preview goes last: a notebook cell only renders its final expression,
# so a bare `yearly_stats.head()` in the middle of the cell displays nothing.
yearly_stats.head()

Saved yearly_stats data to: vega_data/yearly_stats.json


In [31]:
# Line chart of the mean track duration per release year, overlaid with a
# straight-line trend and a vertical marker annotated "TikTok becomes popular".
release_years = yearly_stats['Release year']
mean_durations = yearly_stats['mean_duration']
song_counts = yearly_stats['song_count']

fig = make_subplots(specs=[[{"secondary_y": False}]])

# --- Average duration series ---
# Marker size scales with that year's share of the largest yearly song count
# (5 px baseline, up to 20 px for the busiest year).
marker_sizes = song_counts / song_counts.max() * 15 + 5
duration_trace = go.Scatter(
    x=release_years,
    y=mean_durations,
    mode='lines+markers',
    name='Avg Duration',
    marker={'size': marker_sizes, 'color': 'blue'},  # one colour keeps the chart easy to read
    line={'width': 3},
    hovertemplate=(
        '<b>Year:</b> %{x}<br>'
        '<b>Avg Duration:</b> %{y:.2f} minutes<br>'
        '<b>Song Count:</b> %{text}'
    ),
    text=song_counts,
)
fig.add_trace(duration_trace)

# --- Linear trend (degree-1 least-squares fit through the yearly means) ---
z = np.polyfit(release_years, mean_durations, 1)
p = np.poly1d(z)
trend_trace = go.Scatter(
    x=release_years,
    y=p(release_years),
    mode='lines',
    name='Trend',
    line={'color': 'red', 'width': 2, 'dash': 'dash'},
    hoverinfo='skip',
)
fig.add_trace(trend_trace)

# --- TikTok marker: vertical line plus a label placed above the data ---
# NOTE(review): the pre/post era split used by the later cells is 2019, while
# this marker sits at 2021 - confirm which year is intended.
tiktok_marker_year = 2021
fig.add_vline(x=tiktok_marker_year, line_width=2, line_dash="dash", line_color="gray")

annotation_y = max(mean_durations) + 0.5
fig.add_annotation(
    x=tiktok_marker_year,
    y=annotation_y,
    text="TikTok becomes popular",
    showarrow=True,
    arrowhead=2,
    arrowsize=1,
    arrowwidth=1,
    yshift=10,
    font={'size': 12, 'color': "gray"},
)

# --- Layout: centred title, horizontal legend above the plot area ---
title_settings = dict(
    text='The "Tiktokification" of Music: Shrinking Song Durations (2010-Present)',
    x=0.5,
    xanchor='center',
    yanchor='top',
)
legend_settings = {
    'orientation': "h",
    'yanchor': "bottom",
    'y': 1.02,
    'xanchor': "right",
    'x': 1,
}
fig.update_layout(
    title=title_settings,
    xaxis_title='Release Year',
    yaxis_title='Average Song Duration (minutes)',
    legend=legend_settings,
    template='plotly_white',
    height=600,
)

# Show the year without decimals in hover labels
fig.update_xaxes(hoverformat='.0f')

fig.show()

In [32]:
# Compare the mean track duration before and after the 2019 era split for each
# category, export the result as JSON and plot it as a ranked bar chart.
pre_era_label = 'Pre-TikTok (2010-2018)'
post_era_label = 'Post-TikTok (2019+)'

# Vectorised era labelling (same rule as a row-wise `x < 2019` check).
recent_df['era'] = np.where(recent_df['Release year'] < 2019, pre_era_label, post_era_label)

# Mean duration and track count per (category, era)
category_era_stats = (
    recent_df
    .groupby(['Category', 'era'])
    .agg(
        mean_duration=('duration_min', 'mean'),
        song_count=('Track ID', 'count'),
    )
    .reset_index()
)

# One row per category with the two era means side by side
category_pivot = category_era_stats.pivot(index='Category', columns='era', values='mean_duration')
# Total tracks per category across both eras (aligned on the Category index)
category_pivot['song_count'] = category_era_stats.groupby('Category')['song_count'].sum()
category_pivot['duration_change'] = category_pivot[post_era_label] - category_pivot[pre_era_label]
category_pivot['percent_change'] = (category_pivot['duration_change'] / category_pivot[pre_era_label]) * 100

# --- Keep categories with enough songs and a defined change
# A category that only has tracks in one era has a NaN change; dropping it
# avoids empty bar slots in the chart and nulls in the exported JSON.
min_songs = 30
category_filtered = (
    category_pivot[category_pivot['song_count'] >= min_songs]
    .dropna(subset=['percent_change'])
    .sort_values('percent_change')
)

# --- Highlight specific categories (case-insensitive match)
highlight_categories = ['new releases', 'pop', 'Discover']
# Lower-cased once here instead of being rebuilt for every row
highlight_lookup = {cat.lower() for cat in highlight_categories}
category_filtered_reset = category_filtered.reset_index()
is_highlighted = category_filtered_reset['Category'].str.lower().isin(highlight_lookup)
category_filtered_reset['highlight'] = np.where(is_highlighted, 'Highlighted', 'Other')

# Explicit category order (ascending percent change) for the x axis
category_order = category_filtered_reset['Category'].tolist()

# Export with identifier-friendly column names for the era means
category_change_path_json = output_dir / 'category_change_stats.json'
category_filtered_reset_renamed = category_filtered_reset.rename(columns={
    pre_era_label: 'Pre_TikTok_2010_2018',
    post_era_label: 'Post_TikTok_2019_plus'
})
category_filtered_reset_renamed.to_json(category_change_path_json, orient='records', indent=2)
print(f"Saved category change data to: {category_change_path_json}")


fig_cat = px.bar(
    category_filtered_reset,
    x='Category',
    y='percent_change',
    color='highlight',
    color_discrete_map={
        'Highlighted': '#FF4B4B',
        'Other': '#A0A0A0'
    },
    title='Song Duration Change by Category (Pre vs Post TikTok Era)',
    labels={'percent_change': 'Duration Change (%)', 'Category': 'Music Category'},
    hover_data={
        pre_era_label: True,
        post_era_label: True,
        'song_count': True,
        'highlight': False,
    },
    # Without this, splitting the bars by colour group would break the ranked order
    category_orders={'Category': category_order}
)

# --- Final Layout Tweaks
fig_cat.update_layout(
    xaxis_tickangle=-45,
    plot_bgcolor='white',
    xaxis_title='Music Category',
    yaxis_title='Duration Change (%)',
    showlegend=False
)

fig_cat.show()

Saved category change data to: vega_data/category_change_stats.json


# Idea 2: How audio features changed between the pre- and post-TikTok eras

In [33]:
# Re-create the era label if it is missing, so this cell also works when it
# is run before the cell that normally adds it.
if 'era' not in recent_df.columns:
    recent_df['era'] = recent_df['Release year'].map(
        lambda year: 'Pre-TikTok (2010-2018)' if year < 2019 else 'Post-TikTok (2019+)'
    )

# Number of tracks per era as a two-column frame (Era, Count)
era_counts = (
    recent_df['era']
    .value_counts()
    .rename_axis('Era')
    .reset_index(name='Count')
)
print("Distribution of songs by era:")
print(era_counts)

Distribution of songs by era:
                      Era  Count
0     Post-TikTok (2019+)   6202
1  Pre-TikTok (2010-2018)   2064


In [34]:
# Percentage change of each audio feature's mean between the two eras,
# exported as JSON and shown as a bar chart.

# Features to compare
features_to_compare = [
    'duration_min',     # Song duration
    'danceability',     # How suitable for dancing
    'energy',           # Intensity and activity
    'speechiness',      # Presence of spoken words
    'valence',          # Musical positiveness
    'tempo'             # Speed or pace of the track
]

# Display labels for the features
feature_labels = {
    'duration_min': 'Duration (minutes)',
    'danceability': 'Danceability',
    'energy': 'Energy',
    'speechiness': 'Speechiness',
    'valence': 'Positivity',
    'tempo': 'Tempo (BPM)'
}

# Features for which an increase is highlighted as "tiktokification";
# for duration it is a decrease that is highlighted.
features_highlighted_when_rising = {'danceability', 'energy', 'speechiness'}

# The era subsets do not depend on the feature, so filter once up front
# instead of twice per loop iteration.
pre_era_tracks = recent_df[recent_df['era'] == 'Pre-TikTok (2010-2018)']
post_era_tracks = recent_df[recent_df['era'] == 'Post-TikTok (2019+)']

feature_changes = []

for feature in features_to_compare:
    pre_mean = pre_era_tracks[feature].mean()
    post_mean = post_era_tracks[feature].mean()

    # A baseline mean of exactly 0 is reported as 0% to avoid dividing by zero
    if pre_mean == 0:
        pct_change = 0
    else:
        pct_change = ((post_mean - pre_mean) / pre_mean) * 100

    if feature == 'duration_min':
        highlight = bool(pct_change < 0)
    else:
        highlight = bool(feature in features_highlighted_when_rising and pct_change > 0)

    feature_changes.append({
        'Feature': feature_labels[feature],
        'Percentage Change': pct_change,
        'Pre-TikTok Mean': pre_mean,
        'Post-TikTok Mean': post_mean,
        'Absolute Change': post_mean - pre_mean,
        'Highlight': highlight
    })

change_df = pd.DataFrame(feature_changes)

# Largest change first, regardless of sign
change_df = change_df.sort_values(by='Percentage Change', key=abs, ascending=False)

# Export with identifier-friendly column names
feature_change_path_json = output_dir / 'feature_change_stats.json'
change_df_renamed = change_df.rename(columns={
    'Pre-TikTok Mean': 'Pre_TikTok_Mean',
    'Post-TikTok Mean': 'Post_TikTok_Mean',
    'Absolute Change': 'Absolute_Change',
    'Percentage Change': 'Percentage_Change'
})
change_df_renamed.to_json(feature_change_path_json, orient='records', indent=2)
print(f"Saved feature change data to: {feature_change_path_json}")

# Bar chart of the percentage changes
fig_changes = px.bar(
    change_df,
    x='Feature',
    y='Percentage Change',
    color='Highlight',
    color_discrete_map={
        True: '#FF4B4B',   # Red for highlighted changes
        False: '#A0A0A0'   # Gray for other changes
    },
    title='Percentage Change in Music Features (Pre vs Post TikTok Era)',
    text='Percentage Change',
    hover_data=['Pre-TikTok Mean', 'Post-TikTok Mean', 'Absolute Change']
)

# Print each bar's value, one decimal place, above/below the bar
fig_changes.update_traces(
    texttemplate='%{text:.1f}%',
    textposition='outside'
)

fig_changes.update_layout(
    xaxis_title='Feature',
    yaxis_title='Percentage Change (%)',
    plot_bgcolor='white',
    width=900,
    height=500,
    showlegend=False
)

# Zero baseline so increases and decreases are easy to tell apart
fig_changes.add_hline(
    y=0,
    line_width=1,
    line_dash='dash',
    line_color='gray'
)

# Colour key (the legend is hidden), positioned in paper coordinates above the plot
fig_changes.add_annotation(
    x=0.5,
    y=1.1,
    xref="paper",
    yref="paper",
    text="Red bars highlight changes consistent with 'TikTokification'",
    showarrow=False,
    font=dict(size=14)
)

fig_changes.show()

Saved feature change data to: vega_data/feature_change_stats.json


In [35]:
# Test, per feature, whether the pre- and post-era means differ, and combine
# the p-values with the change table into one summary.
from scipy import stats

# Era subsets are the same for every feature, so build them once
pre_era_rows = recent_df[recent_df['era'] == 'Pre-TikTok (2010-2018)']
post_era_rows = recent_df[recent_df['era'] == 'Post-TikTok (2019+)']

significance_results = []

for feature in features_to_compare:
    # Drop missing values: a single NaN would otherwise turn the p-value into
    # NaN, whereas the means reported alongside it already skip NaNs.
    pre_data = pre_era_rows[feature].dropna()
    post_data = post_era_rows[feature].dropna()

    # equal_var=False: the two eras are not assumed to share a variance
    t_stat, p_value = stats.ttest_ind(pre_data, post_data, equal_var=False)

    significance_results.append({
        'Feature': feature_labels[feature],
        't-statistic': t_stat,
        'p-value': p_value,
        'Significant': p_value < 0.05
    })

sig_df = pd.DataFrame(significance_results)

# Join the test results onto the per-feature change table
summary_df = change_df.merge(sig_df, on='Feature')
summary_columns = ['Feature', 'Pre-TikTok Mean', 'Post-TikTok Mean',
                   'Absolute Change', 'Percentage Change', 'p-value', 'Significant']

# Round for display. `.round` returns a new frame, so the p-value formatting
# below edits an independent table rather than writing into a slice of
# `summary_df`.
summary_table = summary_df[summary_columns].round({
    'Pre-TikTok Mean': 3,
    'Post-TikTok Mean': 3,
    'Absolute Change': 3,
    'Percentage Change': 1,
})
# Fixed six-decimal text; very small p-values therefore show as 0.000000
summary_table['p-value'] = summary_table['p-value'].apply(lambda x: f"{x:.6f}")

print("\nSummary of Changes:")
display(summary_table)


Summary of Changes:


Unnamed: 0,Feature,Pre-TikTok Mean,Post-TikTok Mean,Absolute Change,Percentage Change,p-value,Significant
0,Duration (minutes),3.611,2.892,-0.718,-19.9,0.0,True
1,Energy,0.574,0.516,-0.058,-10.1,0.0,True
2,Speechiness,0.105,0.114,0.009,8.2,0.011537,True
3,Positivity,0.457,0.431,-0.025,-5.6,0.000153,True
4,Danceability,0.58,0.595,0.015,2.6,0.002726,True
5,Tempo (BPM),118.504,118.821,0.317,0.3,0.68136,False


# Idea 3: A composite 'TikTok Score' and how its components evolve over time

In [36]:
# Define the TikTok Score components
def calculate_tiktok_score(row, min_duration=None, max_duration=None):
    """Score how "TikTok friendly" a track is on a 0-100 scale.

    The score is a weighted sum of five components, each on a 0-1 scale:
    short duration (30%), danceability (25%), energy (20%), speechiness
    close to 0.45 (15%) and valence (10%).

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'duration_min', 'danceability',
        'energy', 'speechiness' and 'valence' (e.g. a DataFrame row or a dict).
    min_duration, max_duration : float, optional
        Bounds used to normalise the duration. When omitted they are read from
        the module-level ``recent_df['duration_min']`` on every call. Pass them
        explicitly to avoid rescanning that column for each row, e.g.
        ``df.apply(calculate_tiktok_score, axis=1, min_duration=lo, max_duration=hi)``.

    Returns
    -------
    float
        The score, scaled to 0-100. A missing (NaN) component yields NaN.
    """
    # 1. Duration - shorter is better, normalised so the shortest track scores 1.
    # Fall back to the global table only when the caller gave no bounds.
    if min_duration is None:
        min_duration = recent_df['duration_min'].min()
    if max_duration is None:
        max_duration = recent_df['duration_min'].max()
    duration_range = max_duration - min_duration
    if duration_range > 0:
        duration_score = 1 - (row['duration_min'] - min_duration) / duration_range
    else:
        # Degenerate range (all durations equal): every track gets full marks
        duration_score = 1

    # 2. Danceability - higher is better
    danceability_score = row['danceability']

    # 3. Energy - higher is better
    energy_score = row['energy']

    # 4. Speechiness - peaks at 0.45 and falls off linearly on both sides
    #    (very high speechiness might be podcasts).
    speechiness_score = 1 - abs(row['speechiness'] - 0.45) * 2
    # Clamp to 0-1 with plain comparisons: unlike max(0, min(1, x)), which
    # turns NaN into 1, this leaves NaN untouched so a missing speechiness
    # propagates to the result like the other components do.
    if speechiness_score < 0:
        speechiness_score = 0
    elif speechiness_score > 1:
        speechiness_score = 1

    # 5. Valence - catchy songs often have higher valence
    valence_score = row['valence']

    # Weighted combination; adjust the weights to change what counts most
    # towards "TikTok friendliness".
    tiktok_score = (
        0.3 * duration_score +       # Duration - 30% weight
        0.25 * danceability_score +  # Danceability - 25% weight
        0.2 * energy_score +         # Energy - 20% weight
        0.15 * speechiness_score +   # Speechiness - 15% weight
        0.1 * valence_score          # Valence - 10% weight
    )

    # Scale to 0-100 for easier interpretation
    return tiktok_score * 100

# Score every track row by row and store the result alongside the track data
tiktok_scores = recent_df.apply(calculate_tiktok_score, axis=1)
recent_df['tiktok_score'] = tiktok_scores

# Distribution overview of the new column
print("TikTok Score Summary Statistics:")
score_summary = recent_df['tiktok_score'].describe()
print(score_summary)

TikTok Score Summary Statistics:
count    8266.000000
mean       63.403237
std        11.650163
min        27.970309
25%        56.938615
50%        65.718982
75%        71.823318
max        92.330943
Name: tiktok_score, dtype: float64


In [37]:
# Yearly averages of the score components, exported in long format and drawn
# as a stacked area chart.
component_cols = ['duration_min', 'danceability', 'energy', 'speechiness', 'valence']
component_avgs = recent_df.groupby('Release year')[component_cols].mean().reset_index()

# Min-max normalise the yearly mean duration and invert it, so the year with
# the shortest average gets 1 and the year with the longest gets 0.
max_duration = component_avgs['duration_min'].max()
min_duration = component_avgs['duration_min'].min()
component_avgs['duration_norm'] = 1 - ((component_avgs['duration_min'] - min_duration) / (max_duration - min_duration))

# Column -> display label; the insertion order is also the stacking order
component_labels = {
    'duration_norm': 'Short Duration',
    'danceability': 'Danceability',
    'energy': 'Energy',
    'speechiness': 'Speechiness',
    'valence': 'Positivity'
}
# Column -> fill colour of its area
component_fill_colors = {
    'duration_norm': 'rgb(255, 128, 128)',
    'danceability': 'rgb(128, 255, 128)',
    'energy': 'rgb(128, 128, 255)',
    'speechiness': 'rgb(255, 255, 128)',
    'valence': 'rgb(255, 128, 255)'
}

value_vars = list(component_labels)          # original column names
id_vars = ['Release year']

component_avgs_renamed = component_avgs.rename(columns=component_labels)
value_vars_renamed = list(component_labels.values())  # display names used for melting

# Fold to long format: one row per (year, component)
component_avgs_long = pd.melt(
    component_avgs_renamed,
    id_vars=id_vars,
    value_vars=value_vars_renamed,
    var_name='Component',  # column holding the component names
    value_name='Value'     # column holding the values
)

components_path_json = output_dir / 'component_evolution_stats.json'
component_avgs_long.to_json(components_path_json, orient='records', indent=2)
print(f"Saved component evolution data (long format) to: {components_path_json}")


# Stacked area chart: one borderless filled trace per component, all in the
# same stack group.
fig_components = go.Figure()

for component_column, component_label in component_labels.items():
    fill_color = component_fill_colors[component_column]
    fig_components.add_trace(go.Scatter(
        x=component_avgs['Release year'], y=component_avgs[component_column],
        mode='lines', name=component_label,
        line=dict(width=0, color=fill_color),
        stackgroup='one', fillcolor=fill_color
    ))

# TikTok era reference line
fig_components.add_vline(
    x=2019,
    line_width=2,
    line_dash="dash",
    line_color="gray",
    annotation_text="TikTok's Rise",
    annotation_position="top right"
)

# Layout: centred title, one tick per year, light grid on a white background
fig_components.update_layout(
    title={
        'text': 'Evolution of TikTok Score Components Over Time',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20}
    },
    xaxis=dict(
        title='Release Year',
        tickmode='linear',
        dtick=1,
        gridcolor='lightgray'
    ),
    yaxis=dict(
        title='Component Values (Normalized)',
        gridcolor='lightgray',
        zeroline=False
    ),
    plot_bgcolor='white',
    width=1000,
    height=600
)

fig_components.show()

Saved component evolution data (long format) to: vega_data/component_evolution_stats.json


# IDEA 4