In [1]:
import os, pandas as pd, ast
from pathlib import Path
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into the process
# environment so DATABASE_URL can be read below.
load_dotenv()

# Fail fast with an explicit message instead of handing None / "" to
# create_engine(), which would raise a less direct SQLAlchemy error.
database_url = os.getenv("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env file."
    )

# NOTE: create_engine() is lazy -- no connection is opened until the engine is
# first used, so the message below only confirms that the URL was parsed.
engine = create_engine(database_url)
print("Connected:", engine)

Connected: Engine(postgresql://postgres:***@localhost:5432/spotify)


In [None]:
# Read the cleaned tracks CSV from the "data" folder that sits next to the
# notebook's working directory (i.e. under the parent of the current directory).
root = Path.cwd().parent
csv_path = root.joinpath("data", "tracks_clean.csv")
df = pd.read_csv(csv_path, low_memory=False)

# Quick sanity check: row count plus the first ten column names.
(len(df), df.columns.tolist()[:10])

(28356,
 ['track_id',
  'track_name',
  'track_artist',
  'track_popularity',
  'track_album_id',
  'track_album_name',
  'track_album_release_date',
  'playlist_name',
  'playlist_id',
  'playlist_genre'])

In [None]:
# Create staging table
# The staging table mirrors the CSV columns one-to-one; it is created if it
# does not exist yet and then emptied so that reloading starts from a clean slate.
staging_sql = """
CREATE TABLE IF NOT EXISTS staging_tracks (
    track_id TEXT,
    track_name TEXT,
    track_artist TEXT,
    track_popularity INT,
    track_album_id TEXT,
    track_album_name TEXT,
    track_album_release_date TEXT, -- keep as TEXT in staging
    playlist_name TEXT,
    playlist_id TEXT,
    playlist_genre TEXT,
    playlist_subgenre TEXT,
    danceability NUMERIC,
    energy NUMERIC,
    key INT,
    loudness NUMERIC,
    mode INT,
    speechiness NUMERIC,
    acousticness NUMERIC,
    instrumentalness NUMERIC,
    liveness NUMERIC,
    valence NUMERIC,
    tempo NUMERIC,
    duration_ms INT
);
TRUNCATE staging_tracks;
"""

# engine.begin() opens a transaction and commits it when the block exits
# without error (rolling back otherwise). A plain engine.connect() does not
# commit on its own under SQLAlchemy 2.0, so the CREATE/TRUNCATE would be
# discarded when the connection is closed.
with engine.begin() as con:
    con.execute(text(staging_sql))
print("Staging table created and truncated.")

Staging table created and truncated.


In [4]:
# Append the DataFrame rows to staging_tracks.
# `cols` lists the columns to send, in the same order as the staging table
# definition; every one of them must be present in the DataFrame.
cols = (
    "track_id track_name track_artist track_popularity "
    "track_album_id track_album_name track_album_release_date "
    "playlist_name playlist_id playlist_genre playlist_subgenre "
    "danceability energy key loudness mode speechiness "
    "acousticness instrumentalness liveness valence tempo "
    "duration_ms"
).split()

# Abort before touching the database if the CSV lacks an expected column.
missing = [name for name in cols if name not in df.columns]
if missing:
    raise ValueError(f"Missing columns in DataFrame: {missing}")

# Multi-row INSERT statements, sent in batches of 5000 rows.
df.loc[:, cols].to_sql(
    "staging_tracks",
    engine,
    if_exists="append",
    index=False,
    chunksize=5000,
    method="multi",
)
print(f"Loaded {len(df)} rows into staging_tracks table.")

Loaded 28356 rows into staging_tracks table.


In [7]:
# Copy the distinct, non-empty artist names from staging_tracks into `artists`.
# Assumes an `artists` table with a unique constraint on `name` already exists
# (required by the ON CONFLICT clause) -- it is not created in this notebook
# section.
#
# engine.begin() wraps all steps in one transaction that is committed when the
# block exits cleanly. With a plain engine.connect() nothing is committed under
# SQLAlchemy 2.0, so the INSERT would be rolled back on close.
with engine.begin() as con:
    # ON COMMIT DROP removes the temp table when the transaction ends, so the
    # cell can be re-run even if the pool hands back the same connection.
    con.execute(text(
        "CREATE TEMP TABLE tmp_artists(name TEXT PRIMARY KEY) ON COMMIT DROP;"
    ))
    artists_df = pd.read_sql(
        text("""
        SELECT DISTINCT track_artist AS name
        FROM staging_tracks
        WHERE track_artist IS NOT NULL AND track_artist <> '';
        """), con
    )
    # Stage the names in the temp table on the same connection/transaction.
    artists_df.to_sql("tmp_artists", con, if_exists="append", index=False)
    # Names already present in `artists` are skipped rather than raising.
    con.execute(text("""
    INSERT INTO artists (name)
    SELECT name FROM tmp_artists
    ON CONFLICT (name) DO NOTHING;
    """))

# Printed only after the transaction above has committed successfully.
print(f"Inserted {len(artists_df)} unique artists into artists table.")

Inserted 10692 unique artists into artists table.
