In [12]:
import pandas as pd

genres = pd.read_csv("./train.csv",usecols=["Genre"])["Genre"].astype("category")
genre_to_id = {g: i for i, g in enumerate(genres.cat.categories)}
id_to_genre = {i: g for g, i in genre_to_id.items()}



In [13]:
 print(id_to_genre)

{0: 'Country', 1: 'Electronic', 2: 'Folk', 3: 'Hip-Hop', 4: 'Indie', 5: 'Jazz', 6: 'Metal', 7: 'Pop', 8: 'R&B', 9: 'Rock'}


In [14]:
import numpy as np
import os
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

CHUNK_SIZE = 4096
OUT_DIR = "emb"
os.makedirs(OUT_DIR, exist_ok=True)

for i, chunk in enumerate(pd.read_csv(
        "/home/abigfatpoo/Downloads/train.csv",
        chunksize=CHUNK_SIZE,
        usecols=["Lyrics", "Genre"]
    )):

    # Lyrics → embeddings
    texts = chunk["Lyrics"].fillna("").tolist()
    X = model.encode(
        texts,
        batch_size=64,
        show_progress_bar=False
    )

    # Genre → IDs (using in-memory dict)
    y = chunk["Genre"].map(genre_to_id).to_numpy()

    # Save
    np.save(f"{OUT_DIR}/X_{i}.npy", X)
    np.save(f"{OUT_DIR}/y_{i}.npy", y)

    if i % 10 == 0:
        print(f"Saved chunk {i}")


Saved chunk 0
Saved chunk 10
Saved chunk 20
Saved chunk 30
Saved chunk 40
Saved chunk 50
Saved chunk 60
Saved chunk 70


In [11]:
print("hellO")

hellO


In [15]:
import os

Xs = sorted(f for f in os.listdir("emb") if f.startswith("X_"))
ys = sorted(f for f in os.listdir("emb") if f.startswith("y_"))

print(len(Xs), len(ys))
print(Xs[-5:])


71 71
['X_69.npy', 'X_7.npy', 'X_70.npy', 'X_8.npy', 'X_9.npy']


In [16]:
import pandas as pd

n = sum(1 for _ in open("./train.csv")) - 1
print("Total rows:", n)


Total rows: 11160892


In [17]:
import numpy as np
import os

seen_ids = set()

for f in os.listdir("emb"):
    if f.startswith("y_"):
        y = np.load(f"emb/{f}")
        seen_ids.update(set(y.tolist()))

seen_genres = {id_to_genre[i] for i in seen_ids}

print("Genres seen in 71 chunks:")
print(sorted(seen_genres))
print("Count:", len(seen_genres))


Genres seen in 71 chunks:
['Country', 'Electronic', 'Folk', 'Hip-Hop', 'Indie', 'Jazz', 'Metal', 'Pop', 'R&B', 'Rock']
Count: 10


In [23]:
import xgboost as xgb
import numpy as np
import os

# sanity: load ONE chunk to initialize model
X = np.load("emb/X_0.npy")
y = np.load("emb/y_0.npy")

params = {
    "objective": "multi:softprob",
    "num_class": len(genre_to_id),
    "tree_method": "hist",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

dtrain = xgb.DMatrix(X, label=y)

booster = xgb.train(
    params,
    dtrain,
    num_boost_round=30
)

print("Boosted rounds:", booster.num_boosted_rounds())


Boosted rounds: 30


In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score

# load encoder
model = SentenceTransformer("all-MiniLM-L6-v2")

# load test data
test_df = pd.read_csv(
    "./test.csv",
    usecols=["Lyrics", "Genre"]
)

# labels
y_true = test_df["Genre"].map(genre_to_id).to_numpy()

# lyrics
texts = test_df["Lyrics"].fillna("").tolist()
texts = [t[:2000] for t in texts]

# embed
X_test = model.encode(
    texts,
    batch_size=128,
    show_progress_bar=True
)

# predict
dtest = xgb.DMatrix(X_test)
probs = booster.predict(dtest)
y_pred = probs.argmax(axis=1)

# accuracy
acc = accuracy_score(y_true, y_pred)
print("Test Accuracy:", acc)


Batches:   0%|          | 0/62 [00:00<?, ?it/s]

Test Accuracy: 0.18815374921235034


In [25]:
import xgboost as xgb
import numpy as np
import os

params = {
    "objective": "multi:softprob",
    "num_class": len(genre_to_id),
    "tree_method": "hist",
    "max_depth": 6,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
    "nthread": -1
}

booster = None

Xs = sorted(f for f in os.listdir("emb") if f.startswith("X_"))

for f in Xs:
    idx = f.split("_")[1].split(".")[0]

    X = np.load(f"emb/X_{idx}.npy")
    y = np.load(f"emb/y_{idx}.npy")

    dtrain = xgb.DMatrix(X, label=y)

    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=30,
        xgb_model=booster
    )

print("Training finished")
print("Boosted rounds:", booster.num_boosted_rounds())


Training finished
Boosted rounds: 2130


In [26]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score

# load encoder
model = SentenceTransformer("all-MiniLM-L6-v2")

# load test data
test_df = pd.read_csv(
    "./test.csv",
    usecols=["Lyrics", "Genre"]
)

# labels
y_true = test_df["Genre"].map(genre_to_id).to_numpy()

# lyrics
texts = test_df["Lyrics"].fillna("").tolist()
texts = [t[:2000] for t in texts]

# embed
X_test = model.encode(
    texts,
    batch_size=128,
    show_progress_bar=True
)

# predict
dtest = xgb.DMatrix(X_test)
probs = booster.predict(dtest)
y_pred = probs.argmax(axis=1)

# accuracy
acc = accuracy_score(y_true, y_pred)
print("Test Accuracy:", acc)


Batches:   0%|          | 0/62 [00:00<?, ?it/s]

Test Accuracy: 0.2591052299936988


In [27]:
# genres in training
train_genres = set(genre_to_id.keys())

# genres in test
test_genres = set(test_df["Genre"].dropna().unique())

print("Train genres:", sorted(train_genres))
print("Test genres:", sorted(test_genres))

print("\nIn test but not in train:", test_genres - train_genres)
print("In train but not in test:", train_genres - test_genres)


Train genres: ['Country', 'Electronic', 'Folk', 'Hip-Hop', 'Indie', 'Jazz', 'Metal', 'Pop', 'R&B', 'Rock']
Test genres: ['Country', 'Electronic', 'Folk', 'Hip-Hop', 'Indie', 'Jazz', 'Metal', 'Pop', 'R&B', 'Rock']

In test but not in train: set()
In train but not in test: set()


In [28]:
for i in range(10):
    print("TRUE:", test_df.loc[i, "Genre"])
    print("PRED:", id_to_genre[y_pred[i]])
    print(test_df.loc[i, "Lyrics"][:200])
    print("-"*40)


TRUE: Hip-Hop
PRED: Pop
Most folks spend their days daydreaming of finding clues
My whole life I've been here at the train station shining shoes
I started when I was nine, on my own and taught myself
No complaints, I'm doing
----------------------------------------
TRUE: Indie
PRED: Rock
Take your cold hands and put them on my face
Sharpen your axe and your criminal ways
Let's go to town
and do what we did before
It's gonna hurt
but we don't feel pain no more
If you're alive can you s
----------------------------------------
TRUE: Metal
PRED: Pop
Are you ready it's time for war
We'll break down these fucking doors
Smash the windows, tear down the walls
We won't stop 'til it's all destroyed
Let it out, just let it show
We'll burn it down, and w
----------------------------------------
TRUE: Pop
PRED: Pop
You ask me why I change the color of my hair
(Yeah)
You ask me why I need thirty two pairs of shoes
(To wear)
You seem to ask me why I got a lot of things
It's just a chick thing, you o

In [29]:
import pandas as pd
import numpy as np
import os
import math
from collections import Counter
from sentence_transformers import SentenceTransformer

# ---------- config ----------
CHUNK_SIZE = 4096
OUT_DIR = "emb_v2"
os.makedirs(OUT_DIR, exist_ok=True)

model = SentenceTransformer("all-MiniLM-L6-v2")

def repetition_features(text):
    tokens = text.lower().split()
    n = len(tokens)
    if n == 0:
        return np.zeros(4)

    counts = Counter(tokens)
    freqs = np.array(list(counts.values()))

    uniq_ratio = len(counts) / n
    max_freq_ratio = freqs.max() / n

    lines = [l.strip() for l in text.lower().split("\n") if l.strip()]
    if lines:
        lc = Counter(lines)
        repeated_line_ratio = sum(c - 1 for c in lc.values() if c > 1) / len(lines)
    else:
        repeated_line_ratio = 0.0

    probs = freqs / n
    entropy = -np.sum(probs * np.log(probs + 1e-12))
    entropy /= math.log(len(freqs) + 1e-12)

    return np.array([uniq_ratio, max_freq_ratio, repeated_line_ratio, entropy])

    pd.read_csv(
        "./train.csv",
        chunksize=CHUNK_SIZE,
        usecols=["Lyrics", "Genre"]
    )
):
    texts = chunk["Lyrics"].fillna("").tolist()
    texts = [t[:2000] for t in texts]  # keep consistent

    word_cnt = np.array([len(t.split()) for t in texts])
    line_cnt = np.array([t.count("\n") + 1 for t in texts])

    length_feats = np.stack([
        np.log1p(char_cnt),
        np.log1p(word_cnt),
        np.log1p(line_cnt)
    ], axis=1)

    rep_feats = np.vstack([repetition_features(t) for t in texts])

    X_emb = model.encode(texts, batch_size=64, show_progress_bar=False)

    X = np.hstack([X_emb, length_feats, rep_feats])

    y = chunk["Genre"].map(genre_to_id).to_numpy()

    np.save(f"{OUT_DIR}/X_{i}.npy", X)
    np.save(f"{OUT_DIR}/y_{i}.npy", y)

    if i % 10 == 0:
        print(f"Saved chunk {i}")


Saved chunk 0
Saved chunk 10
Saved chunk 20
Saved chunk 30
Saved chunk 40
Saved chunk 50
Saved chunk 60
Saved chunk 70


In [30]:
import xgboost as xgb
import numpy as np
import os

# ---------- XGBoost params ----------
params = {
    "objective": "multi:softprob",
    "num_class": len(genre_to_id),
    "tree_method": "hist",
    "max_depth": 6,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
    "nthread": -1
}

booster = None

files = sorted(f for f in os.listdir("emb_v2") if f.startswith("X_"))

for f in files:
    idx = f.split("_")[1].split(".")[0]

    X = np.load(f"emb_v2/X_{idx}.npy")
    y = np.load(f"emb_v2/y_{idx}.npy")

    dtrain = xgb.DMatrix(X, label=y)

    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=30,
        xgb_model=booster
    )

print("Training finished")
print("Boosted rounds:", booster.num_boosted_rounds())


Training finished
Boosted rounds: 2130


In [31]:
import pandas as pd
import numpy as np
import xgboost as xgb
import math
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score

def repetition_features(text):
    tokens = text.lower().split()
    n = len(tokens)
    if n == 0:
        return np.zeros(4)

    counts = Counter(tokens)
    freqs = np.array(list(counts.values()))

    uniq_ratio = len(counts) / n
    max_freq_ratio = freqs.max() / n

    lines = [l.strip() for l in text.lower().split("\n") if l.strip()]
    if lines:
        lc = Counter(lines)
        repeated_line_ratio = sum(c - 1 for c in lc.values() if c > 1) / len(lines)
    else:
        repeated_line_ratio = 0.0

    probs = freqs / n
    entropy = -np.sum(probs * np.log(probs + 1e-12))
    entropy /= math.log(len(freqs) + 1e-12)

    return np.array([uniq_ratio, max_freq_ratio, repeated_line_ratio, entropy])

model = SentenceTransformer("all-MiniLM-L6-v2")

test_df = pd.read_csv(
    "./test.csv",
    usecols=["Lyrics", "Genre"]
)

y_true = test_df["Genre"].map(genre_to_id).to_numpy()

texts = test_df["Lyrics"].fillna("").tolist()
texts = [t[:2000] for t in texts]

char_cnt = np.array([len(t) for t in texts])
word_cnt = np.array([len(t.split()) for t in texts])
line_cnt = np.array([t.count("\n") + 1 for t in texts])

length_feats = np.stack([
    np.log1p(char_cnt),
    np.log1p(word_cnt),
    np.log1p(line_cnt)
], axis=1)

rep_feats = np.vstack([repetition_features(t) for t in texts])

X_emb = model.encode(
    texts,
    batch_size=128,
    show_progress_bar=True
)

X_test = np.hstack([X_emb, length_feats, rep_feats])

dtest = xgb.DMatrix(X_test)
probs = booster.predict(dtest)
y_pred = probs.argmax(axis=1)

acc = accuracy_score(y_true, y_pred)
print("Test Accuracy:", acc)


Batches:   0%|          | 0/62 [00:00<?, ?it/s]

Test Accuracy: 0.23516068052930056


In [32]:
for i in range(5):
    print("TRUE:", test_df.loc[i, "Genre"])
    print("PRED:", id_to_genre[y_pred[i]])
    print("-"*30)


TRUE: Hip-Hop
PRED: Rock
------------------------------
TRUE: Indie
PRED: Pop
------------------------------
TRUE: Metal
PRED: Rock
------------------------------
TRUE: Pop
PRED: Pop
------------------------------
TRUE: Hip-Hop
PRED: Rock
------------------------------


In [36]:
import numpy as np
import os
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer



original_id_to_genre = {
    0: "Rock",
    1: "Indie",
    2: "Metal",
    3: "Pop",
    4: "R&B",
    5: "Hip-Hop",
    6: "Country",
    7: "Folk",
    8: "Jazz",
    9: "Electronic"
}

genre_map = {
    "Rock": "Rock",
    "Indie": "Rock",
    "Metal": "Rock",
    "Pop": "Pop",
    "R&B": "Pop",
    "Hip-Hop": "Hip-Hop",
    "Country": "Country",
    "Folk": "Country",
    "Jazz": "Jazz",
    "Electronic": "Electronic"
}

collapsed_genres = sorted(set(genre_map.values()))
collapsed_genre_to_id = {g: i for i, g in enumerate(collapsed_genres)}
collapsed_id_to_genre = {i: g for g, i in collapsed_genre_to_id.items()}

# numeric collapse map: original_id -> collapsed_id
collapse_id_map = {
    oid: collapsed_genre_to_id[genre_map[gname]]
    for oid, gname in original_id_to_genre.items()
}

print("Collapsed label map:", collapsed_genre_to_id)


params = {
    "objective": "multi:softprob",
    "num_class": len(collapsed_genre_to_id),
    "max_depth": 6,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
}

try:
    xgb.DeviceQuantileDMatrix
    params["tree_method"] = "gpu_hist"
    params["predictor"] = "gpu_predictor"
    print("Using GPU for XGBoost")
except Exception:
    params["tree_method"] = "hist"
    params["nthread"] = -1
    print("Using CPU for XGBoost")


booster = None

X_files = sorted(f for f in os.listdir("emb") if f.startswith("X_"))

for f in X_files:
    idx = f.split("_")[1].split(".")[0]

    X = np.load(f"emb/X_{idx}.npy")
    y_orig = np.load(f"emb/y_{idx}.npy")

    # collapse labels NUMERICALLY (no CSV involved)
    y = np.array([collapse_id_map[int(g)] for g in y_orig])

    dtrain = xgb.DMatrix(X, label=y)

    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=30,
        xgb_model=booster
    )

print("Training finished. Boosted rounds:", booster.num_boosted_rounds())


test_df = pd.read_csv(
    "./test.csv",
    usecols=["Lyrics", "Genre"]
)

y_true = np.array([
    collapsed_genre_to_id[genre_map[g]]
    for g in test_df["Genre"]
])

model = SentenceTransformer("all-MiniLM-L6-v2")

texts = test_df["Lyrics"].fillna("").tolist()
texts = [t[:2000] for t in texts]

X_test = model.encode(
    texts,
    batch_size=128,
    show_progress_bar=True
)

dtest = xgb.DMatrix(X_test)
y_pred = booster.predict(dtest).argmax(axis=1)

acc = accuracy_score(y_true, y_pred)
print("Collapsed-genre accuracy:", acc)

print("\nFinal labels:")
for i in collapsed_id_to_genre:
    print(i, "→", collapsed_id_to_genre[i])


Collapsed label map: {'Country': 0, 'Electronic': 1, 'Hip-Hop': 2, 'Jazz': 3, 'Pop': 4, 'Rock': 5}
Using CPU for XGBoost
Training finished. Boosted rounds: 2130


Batches:   0%|          | 0/62 [00:00<?, ?it/s]

Collapsed-genre accuracy: 0.09792060491493383

Final labels:
0 → Country
1 → Electronic
2 → Hip-Hop
3 → Jazz
4 → Pop
5 → Rock


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import torch
from sentence_transformers import SentenceTransformer

TRAIN_PATH = "./train.csv"
TEST_PATH  = "./test.csv"

CHUNK_SIZE = 4096
MAX_LEN = 2000
EMB_BATCH = 256
BOOST_ROUNDS_PER_CHUNK = 5

genre_map = {
    "Rock": "Rock",
    "Indie": "Rock",
    "Metal": "Rock",
    "Pop": "Pop",
    "R&B": "Pop",
    "Hip-Hop": "Hip-Hop",
    "Country": "Country",
    "Folk": "Country",
    "Jazz": "Jazz",
    "Electronic": "Electronic"
}

collapsed_genres = sorted(set(genre_map.values()))
genre_to_id = {g: i for i, g in enumerate(collapsed_genres)}
id_to_genre = {i: g for g, i in genre_to_id.items()}

print("Collapsed genres:", genre_to_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device for embeddings:", device)

model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device=device
)

params = {
    "objective": "multi:softprob",
    "num_class": len(genre_to_id),
    "tree_method": "hist",   
    "max_depth": 6,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
    "nthread": -1
}

booster = None
total_rows = 0

reader = pd.read_csv(
    TRAIN_PATH,
    chunksize=CHUNK_SIZE,
    usecols=["Lyrics", "Genre"]
)

for i, chunk in enumerate(reader):
    chunk = chunk.sample(frac=1.0, random_state=42).reset_index(drop=True)

    texts = chunk["Lyrics"].fillna("").astype(str).str.slice(0, MAX_LEN).tolist()
    y = np.array([genre_to_id[genre_map[g]] for g in chunk["Genre"]])

    if len(texts) == 0:
        continue

    X = model.encode(
        texts,
        batch_size=EMB_BATCH,
        show_progress_bar=False
    )

    dtrain = xgb.DMatrix(X, label=y)

    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=BOOST_ROUNDS_PER_CHUNK,
        xgb_model=booster
    )

    total_rows += len(texts)

    if i % 100 == 0:
        print(f"Trained on {total_rows:,} rows")

print("\nTraining finished")
print("Total rows seen:", total_rows)
print("Boosted rounds:", booster.num_boosted_rounds())

test_df = pd.read_csv(
    TEST_PATH,
    usecols=["Lyrics", "Genre"]
)

test_texts = test_df["Lyrics"].fillna("").astype(str).str.slice(0, MAX_LEN).tolist()
y_test = np.array([genre_to_id[genre_map[g]] for g in test_df["Genre"]])

X_test = model.encode(
    test_texts,
    batch_size=EMB_BATCH,
    show_progress_bar=True
)

dtest = xgb.DMatrix(X_test)
y_pred = booster.predict(dtest).argmax(axis=1)

acc = (y_pred == y_test).mean()

print("\nCollapsed-genre accuracy:", acc)
print("\nLabel mapping:")
for i in id_to_genre:
    print(i, "→", id_to_genre[i])


Collapsed genres: {'Country': 0, 'Electronic': 1, 'Hip-Hop': 2, 'Jazz': 3, 'Pop': 4, 'Rock': 5}
Using device for embeddings: cuda
Trained on 4,096 rows

Training finished
Total rows seen: 290183
Boosted rounds: 355


Batches:   0%|          | 0/31 [00:00<?, ?it/s]


Collapsed-genre accuracy: 0.44473850031505985

Label mapping:
0 → Country
1 → Electronic
2 → Hip-Hop
3 → Jazz
4 → Pop
5 → Rock


In [6]:
!pip install pyarrow

Collecting pyarrow
  Using cached pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Using cached pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl (47.7 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-22.0.0


In [18]:
import json
import os
import numpy as np
import pandas as pd

os.makedirs("models", exist_ok=True)

model.save("models/st_transformer_model")

booster.save_model("models/genre_classifier.ubj")

metadata = {
    "genre_to_id": genre_to_id,
    "id_to_genre": {int(k): v for k, v in id_to_genre.items()},
    "genre_map": genre_map
}
with open("models/metadata.json", "w") as f:
    json.dump(metadata, f)

np.save("models/test_embeddings.npy", X_test)

test_df_clean = test_df.reset_index(drop=True)
cols_to_save = ['Lyrics', 'Genre'] 
for col in cols_to_save:
    if col in test_df_clean.columns:
        test_df_clean[col] = test_df_clean[col].astype(str)

test_df_clean[cols_to_save].to_feather("models/test_metadata.feather")