## Load data

In [None]:
import re
import sqlite3

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb

# Read every table into its own DataFrame. The connection is closed in the
# `finally` clause so it is released even when one of the queries raises.
con = sqlite3.connect("../data/database.db")

try:
    # Entity tables
    album_df = pd.read_sql("SELECT * FROM Album", con=con)
    genre_df = pd.read_sql("SELECT * FROM Genre", con=con)
    artist_df = pd.read_sql("SELECT * FROM Artist", con=con)
    track_df = pd.read_sql("SELECT * FROM Track", con=con)

    # "Through" tables that link the entities to each other (e.g. artists to genres)
    artist_genre_df = pd.read_sql("SELECT * FROM Artist_Genre_Through", con=con)
    album_artist_df = pd.read_sql("SELECT * FROM Album_Artist_Through", con=con)
    track_artist_df = pd.read_sql("SELECT * FROM Track_Artist_Through", con=con)
finally:
    con.close()

# Highest popularity first (reassigned rather than sorted in place)
artist_df = artist_df.sort_values(by=["popularity"], ascending=False)

# Convert duration from milliseconds to minutes
track_df["duration_m"] = track_df["duration_ms"] / 60000

# Release year = the text before the first "-" of the release date
album_df["release_year"] = album_df["release_date"].apply(lambda x: str(x).split("-")[0])

# Default figure size (width, height) for the figures created below
sb.set(rc={'figure.figsize': (14, 9)})

# Seaborn settings

In [None]:
# Hex colours of the notebook-wide palette: three greens followed by a near-black
colors = ["#1DB954", "#1ED760", "#20E95D", "#191414"]

# Build a Seaborn palette from those colours and make it the default one
custom_palette = sb.color_palette(colors)
sb.set_palette(custom_palette)

# Charts

## Most frequent words in tracks' lyrics

In [None]:
from nltk.corpus import stopwords

# English stop-word list from NLTK, stored as a set for membership tests
# (assumes the NLTK "stopwords" corpus is already available locally - TODO confirm)
english_stop_words = stopwords.words('english')
stops = set(english_stop_words)

def calculate_frequencies(values, stop_words=None):
    """Count how often each word occurs across a collection of texts.

    Every text is stripped of ",", "!" and ".", lower-cased, and any run of
    non-word characters that is followed by a space is collapsed into a single
    space (so the words on either side stay separate). The text is then split
    on whitespace; words of one or two characters and stop words are ignored.

    Parameters
    ----------
    values : iterable
        Texts to analyse. Entries that are not strings (for example ``None``
        or ``NaN`` standing in for a missing value) are skipped.
    stop_words : collection of str, optional
        Words to leave out of the count. Defaults to the module-level
        ``stops`` set when omitted.

    Returns
    -------
    dict
        Mapping of word -> number of occurrences, in order of first appearance.
    """
    if stop_words is None:
        stop_words = stops

    frequencies = {}

    for text in values:
        # Missing values cannot be tokenised; ignore them instead of crashing
        if not isinstance(text, str):
            continue

        text = text.replace(",", "").replace("!", "").replace(".", "").lower()
        # Replace with a space, not "": an empty replacement would glue the
        # neighbouring words together ("love? you" -> "loveyou")
        text = re.sub(r'\W+ ', ' ', text)

        # split() with no argument handles spaces, newlines and repeated whitespace
        for word in text.split():
            if len(word) > 2 and word not in stop_words:
                frequencies[word] = frequencies.get(word, 0) + 1

    return frequencies

In [None]:
import re

from wordcloud import WordCloud
from matplotlib.pyplot import figure

# Word cloud of the track lyrics: count the words, render the cloud, save and show it
plt.figure(figsize=(30, 8))

frequencies = calculate_frequencies(track_df["lyrics"].values)
# Force the count of "i'm" to zero - presumably to keep it out of the cloud
frequencies["i'm"] = 0

green_cmap = sb.color_palette("dark:#20E95D", as_cmap=True)
wordcloud = WordCloud(
    width=2000,
    height=2000,
    background_color="white",
    colormap=green_cmap,
)
wordcloud.generate_from_frequencies(frequencies)

# Draw the cloud as an image without axes
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

plt.savefig('../out/charts/track_lyrics_wordcloud.png')
plt.show()

## Most frequent words in tracks' names

In [None]:
# Word cloud of the track names: count the words, render the cloud, save and show it
frequencies = calculate_frequencies(track_df["name"].values)
# Force the count of "remix" to zero - presumably to keep it out of the cloud
frequencies["remix"] = 0

green_cmap = sb.color_palette("dark:#20E95D", as_cmap=True)
wordcloud = WordCloud(
    width=2000,
    height=2000,
    background_color="white",
    colormap=green_cmap,
)
wordcloud.generate_from_frequencies(frequencies)

# Draw the cloud as an image without axes
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

plt.savefig('../out/charts/track_names_wordcloud.png')
plt.show()

## Number of albums over the years

In [None]:
# Albums ordered from the latest release year down to the earliest
albums_by_year_desc = album_df.sort_values(by="release_year", ascending=False)
year_palette = sb.color_palette("dark:#20E95D")

# One bar per release year, bar height = number of albums
g = sb.countplot(x="release_year", data=albums_by_year_desc, palette=year_palette)

# Rotate the year tick labels by 90 degrees
g.set_xticklabels(g.get_xticklabels(), rotation=90)

g.set_xlabel("Release year")
g.set_ylabel("Number of albums")

g.figure.savefig("../out/charts/album_over_years.png")

## Most common artist genres

In [None]:
# One row per (artist, genre) pair: artists -> artist/genre through table -> genres
artist_genres_df = (
    artist_df
    # attach the genre ids of every artist
    .merge(artist_genre_df, left_on="id", right_on="artist_id")
    # resolve each genre id to its genre row
    .merge(genre_df, left_on="genre_id", right_on="id")
    # "name_y" is presumably the genre table's name column (pandas' right-hand suffix)
    .rename(columns={"name_y": "genre_name"})
)

In [None]:
# Only the most frequent genres are drawn; value_counts() lists them most common first
TOP_GENRE_COUNT = 50
top_genres = artist_genres_df["genre_name"].value_counts().iloc[:TOP_GENRE_COUNT].index

# One bar per genre, bar height = number of rows (artists) carrying that genre
g = sb.countplot(
    x="genre_name",
    data=artist_genres_df,
    order=top_genres,
    palette=sb.color_palette("dark:#20E95D")
)

# Rotate the genre tick labels by 90 degrees
g.set_xticklabels(g.get_xticklabels(), rotation=90);

plt.xlabel("Genre");
plt.ylabel("Number of artists");  # label previously read "Number of artist"

g.figure.savefig("../out/charts/artist_genres.png")

## Audio features

In [None]:
# Track columns to plot, one histogram panel each, in display order
audio_features = [
    "acousticness",
    "danceability",
    "energy",
    "speechiness",
    "loudness",
    "tempo",
    "valence",
    "duration_m",
]

fig, axs = plt.subplots(3, 3)
panels = axs.ravel()  # flatten the 3x3 grid so panels are filled row by row

# Histogram with a KDE curve per feature, split by the track's "mode" value
for ax, feature in zip(panels, audio_features):
    sb.histplot(data=track_df, x=feature, kde=True, hue="mode", ax=ax)

# The grid has more panels than features: hide the unused ones instead of
# leaving an empty framed plot in the figure
for unused_ax in panels[len(audio_features):]:
    unused_ax.set_axis_off()

fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

fig.savefig("../out/charts/audio_features.png")

## Artist popularity

In [None]:
# Number of tracks / albums per artist, counted from the through tables
num_tracks_df = track_artist_df.groupby("artist_id").size().reset_index(name="num_tracks")
num_albums_df = album_artist_df.groupby("artist_id").size().reset_index(name="num_albums")

artist_df = (
    artist_df
    # Drop counts left over from a previous run of this cell; without this a
    # re-run would produce num_tracks_x / num_tracks_y instead of num_tracks
    .drop(columns=["num_tracks", "num_albums"], errors="ignore")
    # Default (inner) merges: only artists present in both count tables remain
    .merge(num_tracks_df, left_on="id", right_on="artist_id")
    .drop(columns=["artist_id"])
    .merge(num_albums_df, left_on="id", right_on="artist_id")
    .drop(columns=["artist_id"])
    # Exclude the "Various Artists" rows
    .loc[lambda df: df["name"] != "Various Artists"]
)

In [None]:

# Colour map for the popularity hue
popularity_cmap = sb.color_palette("dark:#20E95D", as_cmap=True)

# One point per artist: tracks on x, albums on y, popularity as both size and colour
g = sb.scatterplot(
    data=artist_df,
    x="num_tracks",
    y="num_albums",
    hue="popularity",
    size="popularity",
    sizes=(20, 750),
    palette=popularity_cmap,
    alpha=0.5,
    linewidth=0,
)

g.set_xlabel("Number of tracks")
g.set_ylabel("Number of albums")

g.figure.savefig("../out/charts/artist_popularity.png")