In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned Spotify tracks table from the processed-data folder
# (the path is relative to the notebook's working directory).
processed_csv_path = "../data/processed/spotify_tracks_clean.csv"
df = pd.read_csv(processed_csv_path)

# Preview the leading rows as the cell's rich output.
df.head()

In [None]:
# Gather the basic structural facts about the frame first ...
dataset_shape = df.shape
column_names = df.columns.tolist()
missing_per_column = df.isna().sum()

# ... then report them: dimensions, column names, and NaN count per column.
print("Dataset shape:", dataset_shape)
print("Columns:", column_names)
print("\nMissing values per column:")
print(missing_per_column)

In [None]:
# Horizontal bar chart of the genres with the most tracks.
# value_counts() orders genres from most to least frequent, so head(top_n)
# keeps the top_n largest.
top_n = 20
genre_counts = df["track_genre"].value_counts().head(top_n)

fig, ax = plt.subplots(figsize=(10, 6))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (slated for
# removal in 0.14). Mapping the categorical variable to `hue` and disabling
# the legend is the documented replacement and yields the same per-bar colors.
sns.barplot(
    x=genre_counts.values,
    y=genre_counts.index,
    hue=genre_counts.index,
    palette="rocket",
    legend=False,
    ax=ax,
)
ax.set_title(f"Top {top_n} Genres by Track Count")
ax.set_xlabel("Number of Tracks")
ax.set_ylabel("Genre")
plt.show()

In [None]:
# ---------------------------
# 4️⃣ Average Popularity Over Time
# ---------------------------
# Mean popularity of all tracks sharing the same value in the 'year' column.
yearly_popularity = df.groupby("year")["popularity"].mean()

# Draw the yearly means as a line on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(12, 6))
yearly_popularity.plot(ax=ax)
ax.set_title("Average Popularity of Tracks Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Popularity")
plt.show()


In [None]:
# Audio-feature columns to average for each genre.
features = [
    "danceability",
    "energy",
    "valence",
    "tempo",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
]

# One row per genre holding the mean of every selected feature ...
genre_feature_means = df.groupby("track_genre")[features].mean()
# ... ordered so the most danceable genres come first.
avg_features = genre_feature_means.sort_values(by="danceability", ascending=False)

# Show the ten most danceable genres as the cell output.
avg_features.head(10)


In [None]:
# Bar plot: Top 10 genres by danceability.
# avg_features is sorted by danceability in descending order where it is
# built, so head() selects the most danceable genres.
top_feature_n = 10
top_danceability = avg_features.head(top_feature_n)

fig, ax = plt.subplots()
# seaborn >= 0.13 deprecates passing `palette` without `hue` (slated for
# removal in 0.14). Mapping the genre index to `hue` and disabling the legend
# is the documented replacement and keeps one palette color per bar.
sns.barplot(
    x="danceability",
    y=top_danceability.index,
    hue=top_danceability.index,
    data=top_danceability,
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title(f"Top {top_feature_n} Genres by Average Danceability")
ax.set_xlabel("Average Danceability")
ax.set_ylabel("Genre")
plt.show()


In [None]:

# Trend of the yearly mean danceability.
feature = "danceability"

# Collapse to one row per year; reset_index() turns 'year' back into a column
# so it can be referenced by name when plotting.
yearly_mean = df.groupby("year")[feature].mean()
avg_per_year = yearly_mean.reset_index()

feature_label = feature.capitalize()

fig, ax = plt.subplots()
sns.lineplot(data=avg_per_year, x="year", y=feature, ax=ax)
ax.set_title(f"Average {feature_label} Over Time")
ax.set_xlabel("Year")
ax.set_ylabel(feature_label)
plt.show()


In [None]:
# Trend of the yearly mean energy.
feature = "energy"

# Collapse to one row per year; reset_index() turns 'year' back into a column
# so it can be referenced by name when plotting.
yearly_mean = df.groupby("year")[feature].mean()
avg_per_year = yearly_mean.reset_index()

feature_label = feature.capitalize()

fig, ax = plt.subplots()
sns.lineplot(data=avg_per_year, x="year", y=feature, ax=ax)
ax.set_title(f"Average {feature_label} Over Time")
ax.set_xlabel("Year")
ax.set_ylabel(feature_label)
plt.show()
