## 1. Install Packages

In [1]:
import os, re, pandas as pd
from pathlib import Path
import nltk
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

DATA_DIR = Path(r'd:\EE6405 NLP Project\Data Preprocessing\dataset')
files = [DATA_DIR / 'song_lyrics_CarlosGDCJ_en_filtered_en_1M']
files

[WindowsPath('d:/EE6405 NLP Project/Data Preprocessing/dataset/song_lyrics_CarlosGDCJ_en_filtered_en_1M')]

## 2. Load selected CSV

In [2]:
from typing import Optional

def find_text_col(df: pd.DataFrame) -> Optional[str]:
    for cand in ['text','sentence','content','utterance','lyrics','statement']:
        if cand in df.columns:
            return cand
    obj_cols = df.select_dtypes(include='object').columns.tolist()
    return obj_cols[0] if obj_cols else None

def load_one(f: Path) -> Optional[pd.DataFrame]:
    df = pd.read_csv(f)
    text_col = find_text_col(df)
    if text_col is None:
        return None
    df = df.rename(columns={text_col: 'text'})
    df['source_file'] = f.name
    return df

# Keep files list from Cell 1
#files = list(DATA_DIR.glob('*.csv'))
#files
print(f"{len(files)} file(s) scheduled")

1 file(s) scheduled


## 3. Regex cleaning function

In [3]:
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
MENTION_PATTERN = re.compile(r'@[A-Za-z0-9_]+')
HASHTAG_PATTERN = re.compile(r'#(\w+)')
NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')
MULTISPACE_PATTERN = re.compile(r'\s+')

def basic_clean(text: str) -> str:
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = URL_PATTERN.sub(' ', text)
    text = MENTION_PATTERN.sub(' ', text)
    text = HASHTAG_PATTERN.sub(r'\1', text)  # keep hashtag word
    text = NON_ALPHA_PATTERN.sub(' ', text)
    text = MULTISPACE_PATTERN.sub(' ', text).strip()
    return text

## 4. Tokenization, Custom Stopword Removal

In [4]:
stop_words = set(stopwords.words('english'))
DOMAIN_STOP = {"chorus","verse","repeat","na","la"}  # extend for lyrics
stop_words |= DOMAIN_STOP

def tokenize_filter(s: str):
    tokens = word_tokenize(s)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

## 5. Stemming vs Lemmatization


In [5]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_tokens(tokens):
    return [stemmer.stem(t) for t in tokens]

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(t) for t in tokens]

## 6. Choose one representation (Lemmatized/Stemmed)

In [6]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['clean_text']   = df['text'].apply(basic_clean)
    df['tokens']       = df['clean_text'].apply(tokenize_filter)
    df['stemmed']      = df['tokens'].apply(stem_tokens)
    df['lemmatized']   = df['tokens'].apply(lemmatize_tokens)
    # Choose representation (lemmatized by default)
    df['final_text']   = df['lemmatized'].apply(lambda toks: ' '.join(toks))
    return df

## 7. POS Tagging

In [7]:
# ...existing code...
import nltk
from collections import Counter

# Ensure POS tagger models are available
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

def add_pos_tags(df: pd.DataFrame) -> pd.DataFrame:
    """Add POS tags for each token list in df['tokens']."""
    df = df.copy()
    if 'tokens' not in df.columns:
        # Fallback: build tokens from clean_text if needed
        df['tokens'] = df['clean_text'].apply(word_tokenize)

    # PTB tags (e.g., NN, VBZ) and Universal tags (e.g., NOUN, VERB)
    df['pos_ptb'] = df['tokens'].apply(lambda toks: nltk.pos_tag(toks))
    df['pos_universal'] = df['tokens'].apply(lambda toks: nltk.pos_tag(toks, tagset='universal'))

    # Counts of Universal POS per doc (useful features)
    df['pos_universal_counts'] = df['pos_universal'].apply(lambda pairs: Counter(tag for _, tag in pairs))
    return df


## TF-IDF & NMF

In [8]:
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import joblib

def _resolve_df_with_final_text():
    glb = globals()
    # Prefer preprocessed dataframes
    if 'df_proc' in glb and isinstance(glb['df_proc'], pd.DataFrame) and 'final_text' in glb['df_proc'].columns:
        return glb['df_proc']
    if 'df' in glb and isinstance(glb['df'], pd.DataFrame) and 'final_text' in glb['df'].columns:
        return glb['df']
    # Fallback: load from `files` using your helpers
    try:
        srcs = files
    except NameError:
        raise ValueError("Provide a DataFrame (df_proc/df) with 'final_text' or define 'files'.")
    csvs = []
    for f in srcs:
        p = Path(f)
        if p.is_dir():
            csvs.extend(sorted(p.glob('*.csv')))
            continue
        if p.suffix.lower() != '.csv':
            cand = p.with_suffix('.csv')
            if cand.exists():
                p = cand
        if p.exists() and p.is_file() and p.suffix.lower() == '.csv':
            csvs.append(p)
    if not csvs:
        raise ValueError("No CSVs resolved from 'files'.")
    dfs = []
    for f in csvs:
        d = load_one(Path(f))
        if d is not None:
            dfs.append(preprocess_df(d))
    if not dfs:
        raise ValueError("No valid dataframes built. Check text column detection.")
    return pd.concat(dfs, ignore_index=True)

# Get corpus
df_nmf = _resolve_df_with_final_text()
corpus = df_nmf['final_text'].fillna('')

# TF-IDF
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    lowercase=False  # already lowercased in cleaning
)
X = tfidf.fit_transform(corpus)

# NMF
n_topics = 10  # tune as needed
nmf = NMF(
    n_components=n_topics,
    init='nndsvd',
    random_state=42,
    max_iter=400,
    alpha_W=0.0,
    l1_ratio=0.0
)
W = nmf.fit_transform(X)  # doc-topic matrix
H = nmf.components_      # topic-term matrix
feat = tfidf.get_feature_names_out()

# Show top terms per topic
topn = 12
for k, comp in enumerate(H):
    top_idx = np.argsort(comp)[::-1][:topn]
    terms = [feat[i] for i in top_idx]
    print(f"Topic {k}: {', '.join(terms)}")

# Attach dominant topic to dataframe
df_nmf['nmf_topic'] = W.argmax(axis=1)
df_nmf['nmf_strength'] = W.max(axis=1)

# Optional: persist artifacts
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')
joblib.dump(nmf, 'nmf_model.joblib')
joblib.dump(W, 'nmf_doc_topic.joblib')
joblib.dump(H, 'nmf_topic_term.joblib')
df_nmf.to_csv('df_with_nmf.csv', index=False)

print("NMF done. Doc-topic shape:", W.shape)

Topic 0: know, never, time, could, thing, say, think, cause, always, want, see, tell
Topic 1: nigga, bitch, shit, fuck, got, like, get, money, hook, hit, as, hoe
Topic 2: baby, girl, wan, let, want, got, know, come, gon, get, tonight, take
Topic 3: man, one, well, little, like, old, said, good, people, two, year, first
Topic 4: eye, life, light, world, soul, see, sky, god, heart, fire, death, blood
Topic 5: love, love love, heart, know love, true, love like, give, say love, love know, fall love, forever, need love
Topic 6: yeah, yeah yeah, yeah know, yeah got, woah, intro, got, hey, outro, know yeah, ayy, huh
Topic 7: away, day, home, night, way, come, long, back, gone, far, stay, time
Topic 8: feel, feel like, like, make feel, feeling, make, wan, know feel, inside, feel feel, real, feel good
Topic 9: ooh, ooh ooh, pre, ooh yeah, yeah ooh, outro, bridge, ooh baby, love ooh, outro ooh, post, oooh
NMF done. Doc-topic shape: (1000000, 10)


## BM25

In [9]:
import sys, subprocess
try:
    from rank_bm25 import BM25Okapi  # or BM25Plus
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "rank-bm25"])
    from rank_bm25 import BM25Okapi

import numpy as np

def _df_for_bm25():
    glb = globals()
    if 'df_proc' in glb and isinstance(glb['df_proc'], pd.DataFrame):
        return glb['df_proc']
    try:
        return _resolve_df_with_final_text()
    except Exception as e:
        raise ValueError("No dataframe available. Run preprocessing first.") from e

def _docs_tokens(df_base: pd.DataFrame):
    if 'final_text' in df_base.columns:
        return df_base['final_text'].fillna('').str.split().tolist()
    if 'tokens' in df_base.columns:
        return df_base['tokens'].tolist()
    if 'clean_text' in df_base.columns:
        return df_base['clean_text'].fillna('').apply(word_tokenize).tolist()
    return df_base['text'].fillna('').apply(lambda s: word_tokenize(basic_clean(s))).tolist()

def _prep_query_tokens(q: str):
    q_clean = basic_clean(q)
    toks = tokenize_filter(q_clean)
    return lemmatize_tokens(toks)

# Build index once
df_bm = _df_for_bm25()
docs_tokens = _docs_tokens(df_bm)
bm25 = BM25Okapi(docs_tokens, k1=1.5, b=0.75)

def bm25_search_rank(df_base: pd.DataFrame, query: str, top_k=10):
    q_toks = _prep_query_tokens(query)
    scores = bm25.get_scores(q_toks)
    top_idx = np.argsort(scores)[::-1][:top_k]
    out = df_base.iloc[top_idx].copy()
    out['bm25_score'] = np.array(scores)[top_idx]
    cols = [c for c in ['bm25_score', 'final_text', 'text', 'source_file'] if c in out.columns]
    return out[cols]

# Example
results = bm25_search_rank(df_bm, "happy joyful excited", top_k=10)
print(results.head(10))

        bm25_score                                         final_text  \
795175   28.339221  fool dream laugh loud dance dance proud knowle...   
124056   19.704714  clown jumping laugh scream jump shout audience...   
868944   18.447535  hark bell sweet silver bell seem say throw car...   
499370   17.943365  come faithful joyful triumphant come come beth...   
604873   17.938690  intro let hello woke feeling great forgot desp...   
908222   17.784529  know sensitive people feel much mood seems coo...   
701438   17.567383  woke smoked grabbed newspaper ignore name sad ...   
907977   17.149233  waoh waoh everything seems sad everything like...   
869094   16.925925  hark bell sweet silver bell seem say throw car...   
691042   16.870400  bring block bell join along let play happy hol...   

                                                     text  \
795175  I'm a fool who dreams\nI laugh to loud\nAnd wh...   
124056  The clowns are jumping in and out\nThey laugh,...   
868944  [Vers