In [None]:
import subprocess
import os
import pandas as pd
import zipfile

# Google Drive file IDs of the assets this notebook downloads with the `gdown` CLI.
file_ids = [
    "1k5YrGPTzRTw1MsnLzkLFblTRplfdHkj_",
    "1kM96FBZb9KrmUqvrJZnCzIVdf-dbJCu8",
    "1kGXy1t1myzHzRIdFbRCAO2jYhoQXiZwV",
    "1kUosa6wg9T2M5aen8EwvyUotmywWqw-C",
    "1kPEPJQ4CqV-cCo6p8VVxhribkpOP4UJh",
    "1kNA7qmZtGByKg-v36jS91_ziJ6RzIuCM",
    "1Rmy10QBMkeKMbirJ3VqilrT81ix6T9uk",
    "1mU9YVr99C7E5hdys5zPQa5FnvplV8jWv"
]

for file_id in file_ids:
    url = f"https://drive.google.com/uc?id={file_id}"
    # check=True turns a failed download into an immediate CalledProcessError
    # instead of a confusing missing-file error further down the notebook.
    subprocess.run(["gdown", url], check=True)

# Define your local directory where the zip file is located.
local_path = ""  # Replace with your actual path
zip_file_path = os.path.join(local_path, "lyrics.zip")

# Unpack the lyrics archive into the same directory.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(local_path)


In [None]:
# Title and artist of the song to look up; later cells read both variables.
# Paste your own values here, following this template:
# song_title = "song_title"
# artist = "artist"
song_title = 'Demons'
artist = 'Imagine Dragons'

In [None]:
import os
import pandas as pd

def read_csv_manual(filepath, delimiter="\t"):
    """
    Read a delimited text file by splitting lines manually and return a DataFrame.

    The first non-empty line is the header. Every later non-empty line is split
    into at most ``len(header)`` fields, so extra delimiters stay inside the
    last field; rows with too few fields are padded with empty strings.
    The frame is built with dtype=object and then converted to strings to avoid
    the "Cannot convert numpy.ndarray" error.

    Parameters
    ----------
    filepath : str
        Path of the UTF-8 encoded file to read.
    delimiter : str, optional
        Field separator (a tab by default).

    Returns
    -------
    pandas.DataFrame
        One column per header field; every value is a ``str``.

    Raises
    ------
    ValueError
        If the file contains no non-empty line to use as a header.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    # Locate the header by position so that parsing resumes on the line after it.
    # Starting the data loop at lines[1:] would re-read the header as a data row
    # whenever the file begins with blank lines.
    header_idx = next((i for i, line in enumerate(lines) if line.strip()), None)
    if header_idx is None:
        raise ValueError("No header found in file: " + filepath)

    # Split the header on the delimiter.
    header = lines[header_idx].strip().split(delimiter)
    ncols = len(header)

    data = []
    for line in lines[header_idx + 1:]:
        if not line.strip():
            continue
        # maxsplit keeps any extra delimiter inside the last field.
        row = line.strip().split(delimiter, ncols - 1)
        if len(row) < ncols:
            row += [""] * (ncols - len(row))
        data.append(row)

    # Create DataFrame using dtype=object then convert all columns to string.
    df = pd.DataFrame(data, columns=header, dtype=object)
    return df.astype(str)

# Define the local directory where your CSV files are saved.
local_path = ""

# Read the tab-separated metadata files with the custom reader
# (read_csv_manual returns every value as a string).
df_genres = read_csv_manual(os.path.join(local_path, "id_genres.csv"), delimiter="\t")
df_info   = read_csv_manual(os.path.join(local_path, "id_information.csv"), delimiter="\t")
df_meta   = read_csv_manual(os.path.join(local_path, "id_metadata.csv"), delimiter="\t")
df_lang  = read_csv_manual(os.path.join(local_path, "id_lang.csv"), delimiter="\t")
df_tags   = read_csv_manual(os.path.join(local_path, "id_tags.csv"), delimiter="\t")

# Verify that the data loaded correctly.
# print("df_genres:")
# print(df_genres.head(), "\n")

# print("df_info:")
# print(df_info.head(), "\n")

# print("df_meta:")
# print(df_meta.head(), "\n")

# print("df_lang:")
# print(df_lang.head(), "\n")

# print("df_tags:")
# print(df_tags.head(), "\n")

# --- Merge the DataFrames ---
# The common key across files is "id". Left joins keep every row of df_info;
# tracks missing from another file get NaN in that file's columns.
df_merge = pd.merge(df_info, df_meta, on="id", how="left")
df_merge = pd.merge(df_merge, df_lang, on="id", how="left")
df_merge = pd.merge(df_merge, df_genres, on="id", how="left")
df_merge = pd.merge(df_merge, df_tags, on="id", how="left")

df_merge.to_csv("processed_data.csv", index=False, encoding="utf-8")
# print("Merged DataFrame columns:")
# print(df_merge.columns.tolist())

# --- Filter for the Shazam-Recognized Song ---
recognized_title = song_title  # Song title recognized by Shazam
recognized_artist = artist   # Artist recognized by Shazam

# Case-insensitive substring match on both title and artist.
# regex=False makes the search text literal: with the default regex=True, names
# containing characters such as "(", "?", "+" or "*" are interpreted as a pattern
# and either raise or match the wrong rows.
filtered = df_merge[
    (df_merge['song'].str.lower().str.contains(recognized_title.lower(), regex=False, na=False)) &
    (df_merge['artist'].str.lower().str.contains(recognized_artist.lower(), regex=False, na=False))
]

print(f"\nFound {len(filtered)} matching record(s) for the recognized song.")

# --- Load Lyrics from Local Files ---
# Lyrics files live in the "lyrics" folder under local_path,
# with each file named as "<id>.txt".
lyrics_folder = os.path.join(local_path, "lyrics")

def load_lyrics(track_id):
    """Return the text of ``<lyrics_folder>/<track_id>.txt``, or a placeholder string when that file does not exist."""
    path = os.path.join(lyrics_folder, f"{track_id}.txt")
    if not os.path.exists(path):
        return "Lyrics not available."
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()

if not filtered.empty:
    result_df = filtered.copy()
    # Attach the local lyrics text. It stays on result_df and is deliberately
    # left out of the printed summary below.
    result_df['lyrics'] = result_df['id'].apply(load_lyrics)

    # --- Select and Rename Desired Columns ---
    # Maps merged-frame column -> displayed column name.
    # "id" is the dataset's own track id (it names the lyrics files); the Spotify
    # track id lives in "spotify_id", so that is the column shown as "spotifyid".
    cols_to_show = {
        "id": "id",                      # from df_info (dataset track id)
        "spotify_id": "spotifyid",       # from df_meta
        "popularity": "popularity",      # from df_meta
        "danceability": "danceability",  # from df_meta
        "energy": "energy",              # from df_meta
        "key": "key",                    # from df_meta
        "mode": "mode",                  # from df_meta
        "valence": "valence",            # from df_meta
        "tempo": "tempo",                # from df_meta
        "genres": "genres",              # from df_genres
        "tags": "tags",                  # from df_tags
        "lang": "lang",                  # from df_lang
        "song": "song",                  # from df_info
        "artist": "artist"               # from df_info
    }
    final_df = result_df[list(cols_to_show.keys())].rename(columns=cols_to_show)

    print("\nBasic Music Information for the recognized track:")
    print(final_df)

else:
    print("No matching record found in the Music4all database for the recognized song.")



Found 2 matching record(s) for the recognized song.

Basic Music Information for the recognized track:
              spotifyid popularity danceability              energy  key mode  \
49578  S4Oy1aU6jaz87NDi       27.0        0.515  0.7879999999999999  3.0  1.0   
83818  lZDk11KaRskQqhWf       70.0        0.505                0.71  3.0  1.0   

      valence              tempo            genres  \
49578   0.276            102.002  remix,indie rock   
83818   0.428  89.93799999999997   rock,indie rock   

                                                    tags lang  \
49578                 remix,indie,indie rock,alternative   en   
83818  indie,rock,indie rock,alternative,imagine dragons   en   

                                 song           artist  
49578  Demons (Imagine Dragons Remix)  Imagine Dragons  
83818                          Demons  Imagine Dragons  


In [None]:
# Further refine the filtered DataFrame to select the most accurate match.
# The best match is the row whose song title (stripped and lowercased) exactly
# equals the recognized title, which excludes remix/live annotations.
if filtered.empty:
    # Fail with a clear message instead of an IndexError from .iloc[0] below.
    raise ValueError(
        f"No candidate rows for '{recognized_title}' by '{recognized_artist}'; "
        "cannot select a best match."
    )

# Create a new column with normalized song names.
filtered = filtered.copy()  # ensure you're working with an explicit copy
filtered.loc[:, "song_normalized"] = filtered["song"].str.strip().str.lower()


# Normalize the recognized title.
recognized_title_norm = recognized_title.strip().lower()

# Filter for exact matches.
exact_matches = filtered[filtered["song_normalized"] == recognized_title_norm]

if not exact_matches.empty:
    # If there is an exact match, choose the first one.
    best_match = exact_matches.iloc[0]
else:
    # Otherwise, as a fallback, choose the record with the highest popularity.
    # The column holds strings, so sort on its numeric value; a plain string
    # sort would rank "9.0" above "70.0". Unparseable values sort last.
    best_match = filtered.sort_values(
        by="popularity",
        ascending=False,
        key=lambda col: pd.to_numeric(col, errors="coerce"),
    ).iloc[0]

# Drop the helper column before displaying.
best_match = best_match.drop("song_normalized")
song_id = best_match["id"]
spotify_track_id = best_match["spotify_id"]

print("\nMost Accurate Match:")
print(best_match)



Most Accurate Match:
id                                               lZDk11KaRskQqhWf
artist                                            Imagine Dragons
song                                                       Demons
album_name                                          Night Visions
spotify_id                                 3LlAyCYU26dvFZBDUIMb7a
popularity                                                   70.0
release                                                      2012
danceability                                                0.505
energy                                                       0.71
key                                                           3.0
mode                                                          1.0
valence                                                     0.428
tempo                                           89.93799999999997
duration_ms                                                175200
lang                                                  

In [None]:
import requests

def load_lyrics_by_id(track_id, timeout=10):
    """Return the lyrics text for ``track_id`` fetched from the remote lyrics host.

    Parameters
    ----------
    track_id : str
        Track id; the file requested is ``<track_id>.txt``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    str
        The response body on HTTP 200, otherwise the placeholder
        ``"Lyrics not available."`` (also returned on network errors, so a
        single failed request does not abort a bulk ``.apply``).
    """
    url = f"https://test.ednovas.xyz/lyrics/{track_id}.txt"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return "Lyrics not available."
    if response.status_code == 200:
        return response.text
    return "Lyrics not available."

# Load and print the lyrics for `song_id`, the id of the best match selected above.
lyrics_content = load_lyrics_by_id(song_id)
print(lyrics_content)

When the days are cold and the cards all fold
And the saints we see are all made of gold
When your dreams all fail and the ones we hail
Are the worst of all and the blood's run stale

I want to hide the truth, I want to shelter you
But with the beast inside, there's nowhere we can hide
No matter what we breed, we still are made of greed
This is my kingdom come, this is my kingdom come

When you feel my heat, look into my eyes
It's where my demons hide, it's where my demons hide
Don't get too close, it's dark inside
It's where my demons hide, it's where my demons hide

At the curtain's call is the last of all
When the lights fade out, all the sinners crawl
So they dug your grave and the masquerade
Will come calling out at the mess you made

Don't want to let you down, but I am hell bound
Though this is all for you I don't want to hide the truth
No matter what we breed, we still are made of greed
This is my kingdom come, this is my kingdom come

When you feel my heat, look into my eyes
I

In [None]:
# !pip install gdown
import gdown
# Direct-download URL built from the Google Drive file ID of the shared lyrics file.
url = "https://drive.google.com/uc?id=1Br4GL_RezyXB1sbox877injTT1gX4eqJ"
output = "lyrics.txt"

# Download the file to the relative path in `output`
gdown.download(url, output, quiet=False)

# Read the file's content into the variable 'lyrics' (the prediction cell further down uses it)
with open(output, 'r', encoding='utf-8') as file:
    lyrics = file.read()

# Print the content of the file
print(lyrics)

Downloading...
From: https://drive.google.com/uc?id=1Br4GL_RezyXB1sbox877injTT1gX4eqJ
To: /content/lyrics.txt
100%|██████████| 1.53k/1.53k [00:00<00:00, 3.04MB/s]

When the days are cold and the cards all fold
And the saints we see are all made of gold
When your dreams all fail and the ones we hail
Are the worst of all and the blood's run stale

I want to hide the truth, I want to shelter you
But with the beast inside, there's nowhere we can hide
No matter what we breed, we still are made of greed
This is my kingdom come, this is my kingdom come

When you feel my heat, look into my eyes
It's where my demons hide, it's where my demons hide
Don't get too close, it's dark inside
It's where my demons hide, it's where my demons hide

At the curtain's call is the last of all
When the lights fade out, all the sinners crawl
So they dug your grave and the masquerade
Will come calling out at the mess you made

Don't want to let you down, but I am hell bound
Though this is all for you I don't want to hide the truth
No matter what we breed, we still are made of greed
This is my kingdom come, this is my kingdom come

When you feel my heat, look into my eyes
I




In [None]:
# !pip install keybert
import re
import nltk
import requests
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score,hamming_loss,precision_score,recall_score
from transformers import pipeline as hf_pipeline
import numpy as np
from keybert import KeyBERT

# Download the NLTK stop-word corpus and keep the English list as a set,
# which preprocess_lyrics uses for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Shared KeyBERT model used by extract_keywords for keyword extraction
kw_model = KeyBERT()

def preprocess_lyrics(text):
    """Lowercase ``text``, remove every character that is not an ASCII letter or whitespace, and drop stop words.

    Returns the remaining tokens joined by single spaces.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    kept = filter(lambda token: token not in stop_words, letters_only.split())
    return " ".join(kept)

# Sample training data: up to 100 tagged tracks, fixed seed for repeatability.
df_tagged = df_merge.dropna(subset=["tags"])
df_sample = df_tagged.sample(n=min(100, len(df_tagged)), random_state=42)
df_sample["lyrics"] = df_sample["id"].apply(load_lyrics_by_id)
# load_lyrics_by_id signals a failed fetch with a placeholder string, never NaN,
# so dropna alone keeps those rows and the model would train on the placeholder.
# Remove them explicitly.
df_sample = df_sample.dropna(subset=["lyrics"])
df_sample = df_sample[df_sample["lyrics"] != "Lyrics not available."].copy()

# Split the comma-separated tag string into a list, skipping empty entries so
# that "" never becomes a label.
df_sample["tags_list"] = df_sample["tags"].apply(
    lambda x: [t for t in x.split(",") if t.strip()] if isinstance(x, str) else []
)
df_sample = df_sample[df_sample["tags_list"].apply(len) > 0].copy()

if df_sample.empty:
    raise ValueError("No training samples with both tags and downloadable lyrics were found.")

# Preprocesses lyrics
df_sample["clean_lyrics"] = df_sample["lyrics"].apply(preprocess_lyrics)

X = df_sample["clean_lyrics"]
y = df_sample["tags_list"]
# One binary indicator column per distinct tag.
mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform(y)

# Pipeline: TF-IDF features (unigrams and bigrams) feeding one random forest per tag.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
    ("clf", MultiOutputClassifier(RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)))
])

pipeline.fit(X, y_bin)

# Keyword extraction using KeyBERT
def extract_keywords(lyrics_text, top_n=5):
    """Return up to ``top_n`` de-duplicated key phrases that KeyBERT extracts from ``lyrics_text``.

    Each phrase is normalized by de-duplicating and alphabetically sorting its
    words, so phrases differing only in word order collapse into one entry.
    The result comes from a set, so its order is not meaningful.
    """
    ranked = kw_model.extract_keywords(
        lyrics_text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=top_n,
    )
    unique_phrases = set()
    for entry in ranked:
        words = sorted(set(entry[0].split()))
        unique_phrases.add(" ".join(words))
    return list(unique_phrases)[:top_n]

# Predict tags based on lyrics
def predict_tags_soft(lyrics_text, threshold=0.1):
    """Return the tags whose predicted probability for ``lyrics_text`` is at least ``threshold``.

    Uses the module-level ``pipeline`` (its 'tfidf' and 'clf' steps) and ``mlb``.
    ``lyrics_text`` is vectorized as given; no cleaning is applied here.
    """
    steps = pipeline.named_steps
    features = steps['tfidf'].transform([lyrics_text])
    per_label_probas = steps['clf'].predict_proba(features)
    # predict_proba yields one array per label; [0][1] is read as the
    # positive-class probability for the single input row.
    return [
        mlb.classes_[idx]
        for idx, proba in enumerate(per_label_probas)
        if proba[0][1] >= threshold
    ]

# Initialize sentiment analysis pipeline using a pre-trained model.
# Created once at module level so predict_sentiment reuses the same instance.
sentiment_pipeline = hf_pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# def predict_sentiment(lyrics_text):
#     result = sentiment_pipeline(lyrics_text[:512])[0]
#     return result['label'], result['score']

def predict_sentiment(text, max_chunks=3):
    """Classify the sentiment of ``text`` by majority vote over its leading chunks.

    The text is cut into consecutive 512-character pieces and only the first
    ``max_chunks`` are scored. Returns ``(label, score)``: the most common
    label and the mean score of the chunks that received that label.
    """
    pieces = [text[start:start + 512] for start in range(0, len(text), 512)][:max_chunks]
    results = sentiment_pipeline(pieces)
    tally = Counter(entry["label"] for entry in results)
    top_label, _count = tally.most_common(1)[0]
    matching_scores = [entry["score"] for entry in results if entry["label"] == top_label]
    return top_label, np.mean(matching_scores)


# Example predictions for `lyrics`, the text read from lyrics.txt in an earlier cell.
# Tags and keywords use the cleaned text; sentiment uses the raw text.
lyrics_clean = preprocess_lyrics(lyrics)
predicted_tags = predict_tags_soft(lyrics_clean, threshold=0.1)
sentiment_label, sentiment_score = predict_sentiment(lyrics)
keywords = extract_keywords(lyrics_clean, top_n=5)

# Print prediction results
print("\n Predicted Tags:", predicted_tags)
print(f" Predicted Mood: {sentiment_label} ({sentiment_score:.2f})")
print(f" Extracted Keywords: {keywords}")

# After training, make predictions
# NOTE(review): X is the same data the pipeline was fitted on, so the metrics
# below are training-set scores, not an estimate of performance on unseen lyrics.
y_pred_bin = pipeline.predict(X)

# Model evaluation (micro-averaged over all tag columns)
f1_micro = f1_score(y_bin, y_pred_bin, average="micro")
hamming = hamming_loss(y_bin, y_pred_bin)
precision_micro = precision_score(y_bin, y_pred_bin, average="micro")
recall_micro = recall_score(y_bin, y_pred_bin, average="micro")

# Print evaluation results
print(f"\n Micro F1 Score: {f1_micro:.4f}")
print(f" Hamming Loss: {hamming:.4f}")
print(f" Precision Micro: {precision_micro:.4f}")
print(f" Recall Micro: {recall_micro:.4f}")


# # save model
# from google.colab import drive
# drive.mount('/content/drive')

# model_save_path = "/content/drive/MyDrive/music4all/lyric_classifier_model.pkl"

# import joblib
# joblib.dump({"pipeline": pipeline, "mlb": mlb}, model_save_path)
# print(f"[INFO] Model saved to {model_save_path}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Device set to use cpu



 Predicted Tags: ['pop', 'rock']
 Predicted Mood: NEGATIVE (0.98)
 Extracted Keywords: ['dreams fail', 'fold saints', 'cards cold', 'greed kingdom', 'dreams gold']

 Micro F1 Score: 0.9373
 Hamming Loss: 0.0023
 Precision Micro: 1.0000
 Recall Micro: 0.8821
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[INFO] Model saved to /content/drive/MyDrive/music4all/lyric_classifier_model.pkl


In [None]:
# Extraction of Common Utility Functions
import re
import requests
from collections import Counter
from keybert import KeyBERT
from transformers import pipeline as hf_pipeline
import numpy as np
import os
import gdown
import joblib

# Shared model instances, created once and reused by extract_keywords and predict_sentiment below.
kw_model = KeyBERT()
sentiment_pipeline = hf_pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def clean_text(text):
    """Lowercase ``text``, drop every character other than a-z and whitespace, and keep only words longer than two letters.

    Returns the surviving words joined by single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    long_words = filter(lambda token: len(token) > 2, letters_only.split())
    return " ".join(long_words)

def extract_keywords(text, top_n=5):
    """Return up to ``top_n`` de-duplicated KeyBERT key phrases for ``text``.

    Words inside each phrase are de-duplicated and sorted alphabetically before
    phrases are collected into a set, so the order of the result is arbitrary.
    """
    candidates = kw_model.extract_keywords(
        text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=top_n
    )
    normalized = set(map(lambda pair: " ".join(sorted(set(pair[0].split()))), candidates))
    return list(normalized)[:top_n]

def predict_sentiment(text, max_chunks=1):
    """Return ``(label, score)`` for ``text`` by majority vote over its first ``max_chunks`` 512-character chunks.

    ``score`` is the mean score of the chunks that were assigned the winning label.
    """
    starts = range(0, len(text), 512)
    chunks = [text[start:start + 512] for start in starts]
    results = sentiment_pipeline(chunks[:max_chunks])
    labels = [entry["label"] for entry in results]
    majority = Counter(labels).most_common(1)[0][0]
    winning_scores = [entry["score"] for entry in results if entry["label"] == majority]
    return majority, np.mean(winning_scores)

def predict_tags(text, pipeline, mlb, threshold=0.1):
    """Return the tags whose predicted probability for ``text`` is at least ``threshold``.

    ``text`` is passed through clean_text, vectorized by the pipeline's 'tfidf'
    step and scored by its 'clf' step; ``mlb.classes_`` maps positions to tag names.
    """
    vectorizer = pipeline.named_steps['tfidf']
    classifier = pipeline.named_steps['clf']
    features = vectorizer.transform([clean_text(text)])
    selected = []
    # One probability array per label; [0][1] is read as the positive-class
    # probability for the single input row.
    for idx, proba in enumerate(classifier.predict_proba(features)):
        if proba[0][1] >= threshold:
            selected.append(mlb.classes_[idx])
    return selected

def generate_image(prompt, out_path="output.png", api_key=None):
    """Generate one 1024x1024 image with DALL-E 3 for ``prompt`` and save it to ``out_path``.

    Parameters
    ----------
    prompt : str
        Text prompt sent to the image model.
    out_path : str, optional
        Destination file for the downloaded image bytes.
    api_key : str, optional
        OpenAI API key. When omitted, the OpenAI client falls back to the
        ``OPENAI_API_KEY`` environment variable, which keeps the secret out of
        the notebook source.

    Raises
    ------
    requests.HTTPError
        If downloading the generated image returns an error status.
    """
    from openai import OpenAI
    import requests

    # No key is hardcoded and no global openai state is mutated.
    client = OpenAI(api_key=api_key)
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        n=1,
        size="1024x1024"
    )
    img_url = response.data[0].url
    download = requests.get(img_url, timeout=60)
    # Fail loudly rather than writing an error page to disk as a ".png".
    download.raise_for_status()
    with open(out_path, 'wb') as f:
        f.write(download.content)
    print(f"[INFO] Image saved to {out_path}")

def load_model(file_id, out_path="lyric_classifier_model.pkl"):
    """Load the saved tag classifier, downloading it from Google Drive on first use.

    Parameters
    ----------
    file_id : str
        Google Drive file id of the saved model.
    out_path : str, optional
        Local path of the model file; the download is skipped when it exists.

    Returns
    -------
    tuple
        ``(pipeline, mlb)`` taken from the "pipeline" and "mlb" keys of the loaded object.

    Raises
    ------
    FileNotFoundError
        If the download did not produce ``out_path``.
    """
    if not os.path.exists(out_path):
        print("[INFO] Downloading model from Google Drive...")
        gdown.download(id=file_id, output=out_path, quiet=False, use_cookies=True)
        if not os.path.exists(out_path):
            # Report the real cause rather than a bare error from joblib.load.
            raise FileNotFoundError(
                f"Model download failed for Google Drive file id '{file_id}'; "
                f"'{out_path}' was not created."
            )
    else:
        print(f"[INFO] Model already exists at {out_path}, skipping download.")

    # SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load model files from a source you trust.
    model = joblib.load(out_path)
    print(f"[INFO] Model loaded from {out_path}")
    return model["pipeline"], model["mlb"]

Device set to use cpu


In [None]:
import os
import zipfile
import random

# Generate Images by Randomly Selecting 10 Lyric Files from ZIP for Model Validation
def process_zip(zip_path, extract_dir, pipeline, mlb, top_n=10, seed=None):
    """Generate an image for each of up to ``top_n`` randomly chosen lyric files.

    Parameters
    ----------
    zip_path : str
        Archive of lyric ``.txt`` files; extracted only when ``extract_dir`` does not exist yet.
    extract_dir : str
        Directory whose top-level ``.txt`` files are sampled.
    pipeline, mlb
        Trained tag classifier and label binarizer, passed on to predict_tags.
    top_n : int, optional
        Maximum number of files to process.
    seed : int, optional
        Seed for a reproducible selection. With the default ``None`` the global
        ``random`` state is used, as before.

    For every selected file the predicted tags, extracted keywords and
    lowercased sentiment label are merged into a sorted, comma-separated prompt,
    and the image is written to ``./outputs/<name>_out.png``.
    """
    if not os.path.exists(extract_dir):
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)

    # os.listdir order is arbitrary; sort so a given seed always picks the same files.
    files = sorted(f for f in os.listdir(extract_dir) if f.endswith(".txt"))
    rng = random if seed is None else random.Random(seed)
    selected = rng.sample(files, min(top_n, len(files)))

    # Create the output directory once instead of on every iteration.
    os.makedirs("./outputs", exist_ok=True)

    for fname in selected:
        with open(os.path.join(extract_dir, fname), "r", encoding="utf-8") as f:
            text = f.read()

        cleaned = clean_text(text)
        tags = predict_tags(text, pipeline, mlb)
        keywords = extract_keywords(cleaned)
        sentiment, score = predict_sentiment(text)

        prompt = ", ".join(sorted(set(tags + keywords + [sentiment.lower()])))
        # Replace only the trailing extension; str.replace would also rewrite
        # a ".txt" occurring earlier in the name.
        stem = os.path.splitext(fname)[0]
        out_path = f"./outputs/{stem}_out.png"
        print(f"\n🎵 {fname}\nTags: {tags}\nSentiment: {sentiment} ({score:.2f})\nPrompt: {prompt}")
        generate_image(prompt, out_path)

# Load the saved classifier (load_model downloads it only if the local file is missing)
# and run the image-generation pass over the lyric files.
model_file_id = "1mU9YVr99C7E5hdys5zPQa5FnvplV8jWv"
pipeline, mlb = load_model(model_file_id)
process_zip("lyrics.zip", "lyrics", pipeline, mlb)

[INFO] Model already exists at lyric_classifier_model.pkl, skipping download.
[INFO] Model loaded from lyric_classifier_model.pkl

🎵 114hOqbAT6B12wCa.txt
Tags: []
Sentiment: NEGATIVE (0.98)
Prompt: ink shut, ink taste, love room, negative, room walk, taste wanted
[INFO] Image saved to ./outputs/114hOqbAT6B12wCa_out.png

🎵 usFsg51cRW8XGulk.txt
Tags: ['pop', 'rock', 'spanish']
Sentiment: NEGATIVE (0.99)
Prompt: amor que, enamora que, eres motivo, estamos solas, negative, pop, que solas, rock, spanish
[INFO] Image saved to ./outputs/usFsg51cRW8XGulk_out.png

🎵 vtNQ5gvkIEepbiYp.txt
Tags: ['pop', 'rock']
Sentiment: NEGATIVE (0.98)
Prompt: care forever, common regrets, feeling loves, knew loves, negative, pop, regrets, rock
[INFO] Image saved to ./outputs/vtNQ5gvkIEepbiYp_out.png

🎵 Yvp0IIDqMZgU4lDm.txt
Tags: ['female vocalists', 'pop', 'rock']
Sentiment: NEGATIVE (0.99)
Prompt: aint hangover, cause sober, female vocalists, ive sober, negative, pop, rock, sober, sober youve
[INFO

In [None]:
import requests

# Generate Images Using Lyrics Fetched via API
def get_lyrics(song_title, artist, api_key, timeout=10):
    """Fetch lyrics for a song from the Musixmatch ``matcher.lyrics.get`` endpoint.

    Parameters
    ----------
    song_title : str
        Sent as ``q_track``.
    artist : str
        Sent as ``q_artist``.
    api_key : str
        Musixmatch API key.
    timeout : float, optional
        Seconds to wait for the server; prevents the request from hanging indefinitely.

    Returns
    -------
    str or None
        The lyrics body up to (not including) the first ``"..."``, stripped, or
        ``None`` when the response is not JSON or lacks the expected fields.
        Network errors from ``requests`` propagate to the caller.
    """
    url = "https://api.musixmatch.com/ws/1.1/matcher.lyrics.get"
    params = {
        "apikey": api_key,
        "q_track": song_title,
        "q_artist": artist
    }
    res = requests.get(url, params=params, timeout=timeout)
    try:
        body = res.json()["message"]["body"]["lyrics"]["lyrics_body"]
        return body.split("...")[0].strip()
    except (KeyError, TypeError, AttributeError, ValueError):
        # ValueError: the response was not valid JSON. KeyError/TypeError/
        # AttributeError: the payload does not have the expected shape.
        # A bare `except:` would also swallow KeyboardInterrupt and real bugs.
        return None

def process_song(song_title, artist, pipeline, mlb, api_key):
    """Fetch lyrics for one song, derive an image prompt from them and generate the image.

    The prompt is the sorted, comma-separated union of the predicted tags, the
    extracted keywords and the lowercased sentiment label. The image is written
    to ``./outputs/<title>_output.png``, where ``<title>`` is the lowercased
    song title with spaces and path-unsafe characters replaced by underscores.
    Prints an error and returns without generating anything when no lyrics are found.
    """
    lyrics = get_lyrics(song_title, artist, api_key)
    if not lyrics:
        print("[ERROR] Lyrics not found.")
        return

    sentiment, score = predict_sentiment(lyrics)
    keywords = extract_keywords(clean_text(lyrics))
    tags = predict_tags(lyrics, pipeline, mlb)
    prompt = ", ".join(sorted(set(tags + keywords + [sentiment.lower()])))
    print(f"\n🎵 {song_title} - {artist}\nTags: {tags}\nSentiment: {sentiment} ({score:.2f})\nPrompt: {prompt}")

    # Titles may contain characters that are invalid in file names or act as
    # path separators (for example "/"); map them to "_" so the path stays
    # inside ./outputs.
    unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    safe_title = song_title.lower().replace(' ', '_').translate(unsafe_chars)
    out_path = f"./outputs/{safe_title}_output.png"
    os.makedirs("./outputs", exist_ok=True)
    generate_image(prompt, out_path)

# Load the saved classifier and generate an image for the song set in `song_title` / `artist`.
# NOTE(review): "api_key" below is a placeholder; prefer supplying the real key from an
# environment variable rather than committing it in the notebook.
model_file_id = "1mU9YVr99C7E5hdys5zPQa5FnvplV8jWv"
pipeline, mlb = load_model(model_file_id)
process_song(song_title, artist, pipeline, mlb, api_key="api_key")  # Replace with your actual API key

[INFO] Model already exists at lyric_classifier_model.pkl, skipping download.
[INFO] Model loaded from lyric_classifier_model.pkl

🎵 Demons - Imagine Dragons
Tags: ['pop', 'rock']
Sentiment: NEGATIVE (0.98)
Prompt: breed greed, demons, demons hide, greed kingdom, negative, pop, rock
[INFO] Image saved to ./outputs/demons_output.png


In [None]:
# # clear outputs images
# import os
# import glob

# for f in glob.glob("./outputs/*.png"):
#     os.remove(f)
# print("[INFO] All .png files in ./outputs have been deleted.")


[INFO] All .png files in ./outputs have been deleted.
