In [None]:
!pip install datasets
!pip install -U sentence-transformers transformers
from datasets import Dataset
import os
import datasets
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
from nltk.stem import WordNetLemmatizer
import nltk
import gc
import torch
# Set WANDB_MODE to "disabled" before any training code runs
# (presumably read by the trainer behind model.fit to turn off Weights &
# Biases logging -- confirm against the installed sentence-transformers).
os.environ["WANDB_MODE"] = "disabled"
# Fetch the WordNet corpus that WordNetLemmatizer needs at lemmatize time.
nltk.download("wordnet")

# Input table. Columns read later in this file: 'lyrics', 'Sexual' and
# 'Sexual_words'. on_bad_lines='skip' drops rows that fail to parse instead of
# raising, so the frame may contain fewer rows than the file has lines.
data = pd.read_csv('merged_unlabeled_all.csv', encoding='utf-8', on_bad_lines='skip')

# Single shared lemmatizer instance used by lemmatize_lyrics().
lemmatizer = WordNetLemmatizer()
def lemmatize_lyrics(text):
    """Return *text* with each whitespace-separated token lemmatized.

    Tokens come from ``str.split()`` and are re-joined with single spaces, so
    any run of whitespace (including newlines) collapses to one space and
    leading/trailing whitespace is dropped.

    The token ``'ass'`` is kept verbatim rather than being handed to the
    lemmatizer (presumably because the lemmatizer would otherwise alter it
    and break keyword matching -- confirm).
    """
    lemmas = []
    for token in text.split():
        if token == 'ass':
            lemmas.append(token)
        else:
            lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Normalise the lyrics column in place. Every cell is coerced with str() first,
# so a missing lyric (NaN) becomes the literal text 'nan' rather than raising.
data['lyrics'] = data['lyrics'].map(
    lambda raw_lyric: lemmatize_lyrics(str(raw_lyric))
)

# Category being labelled, and the column holding its comma-separated keywords.
category = 'Sexual'
keyword_col = 'Sexual_words'

# Each non-null cell is a comma-separated list of keywords/phrases. Flatten all
# cells into one list of trimmed fragments. str() guards against cells that
# pandas parsed as a non-string type.
keywords = data[keyword_col].dropna().tolist()
split_keywords = []
for phrase in keywords:
    split_keywords.extend(kw.strip() for kw in str(phrase).split(','))

# De-duplicate and drop empty fragments. A trailing or doubled comma yields '',
# and the empty string is a substring of every lyric, so keeping it would make
# every 'T' row match. Sorting gives a reproducible keyword order across runs
# (plain set iteration order varies with string-hash randomisation).
all_keywords_cleaned = sorted({kw for kw in split_keywords if kw})

print(f"\n{category} Keywords ({len(all_keywords_cleaned)} total):")
print(", ".join(all_keywords_cleaned))

# Pretrained sentence-embedding model, used here to embed the keywords.
model = SentenceTransformer('all-MiniLM-L6-v2')

# keyword -> embedding tensor, encoded one keyword at a time.
keyword_embeddings = {}
for keyword in all_keywords_cleaned:
    keyword_embeddings[keyword] = model.encode(keyword, convert_to_tensor=True)

# Bootstrapping hyper-parameters.
min_threshold = 0.56  # the similarity cut-off is not lowered below this value
threshold = 0.62      # similarity cut-off used in the first round
window_size = 12      # words taken on each side when cutting a training segment

def _keyword_window(lyric, words, keyword, half_window):
    """Return the word window around the first hit of *keyword*, or ``None``.

    Args:
        lyric: The full lyric text.
        words: ``lyric.split()``, passed in so it is computed once per lyric.
        keyword: Keyword or phrase to look for (substring match).
        half_window: Number of words to keep before the hit; the window is at
            most ``2 * half_window + 1`` words long.

    Returns:
        The space-joined window, or ``None`` when *keyword* is empty or does
        not occur in *lyric*.
    """
    if not keyword:
        # '' is a substring of everything; treat it as "no keyword".
        return None
    char_pos = lyric.find(keyword)
    if char_pos < 0:
        return None

    # str.find() yields a CHARACTER offset; translate it into the index of the
    # word that contains that character before slicing the word list.
    hit = len(lyric[:char_pos].split())
    if char_pos > 0 and not lyric[char_pos - 1].isspace():
        # The match starts mid-word, so the prefix split counted that word.
        hit -= 1

    start = max(0, hit - half_window)
    end = min(len(words), start + half_window * 2 + 1)
    return ' '.join(words[start:end])


# Seed training set: for every row already labelled 'T', cut one window around
# the first occurrence of each keyword found in the lyric. Each window is
# paired with itself at label 1.0.
train_examples = []
for lyric in data.loc[data[category] == 'T', 'lyrics']:
    lyric_words = lyric.split()
    for keyword in all_keywords_cleaned:
        segment = _keyword_window(lyric, lyric_words, keyword, window_size)
        if segment is not None:
            train_examples.append(InputExample(texts=[segment, segment], label=1.0))

# ---------------------------------------------------------------------------
# Bootstrapping loop.
#
# Each round:
#   1. reloads the pretrained model and fine-tunes it on all examples so far;
#   2. slides a fixed-size word window over every still-unlabelled lyric and
#      scores each window against all keyword embeddings;
#   3. marks the row 'M' and adds the window to the training set on the first
#      window whose best similarity exceeds the current threshold;
#   4. lowers the threshold by one step, stopping after the round that ran at
#      min_threshold.
#
# NOTE(review): keyword_embeddings were produced by the model instance that
# existed before this loop, while segments are embedded with the freshly
# fine-tuned one -- confirm that comparing across the two is intended.
# NOTE(review): every training pair is (segment, segment, 1.0); confirm this
# gives the fine-tuning step a useful signal.
# NOTE(review): lyrics shorter than one window are never scored and stay
# unlabelled -- confirm that is intended.
# ---------------------------------------------------------------------------
bootstrapping_active = True
round_number = 1       # was `round`, which shadowed the builtin and never advanced
threshold_step = 0.02
segment_len = window_size * 2 + 1   # 25 words with window_size = 12
segment_stride = 6

if not keyword_embeddings:
    raise ValueError("No keyword embeddings available; cannot score segments.")
# Stack once into a (num_keywords, dim) matrix so a segment is compared with
# every keyword in a single call instead of one call (and one .item()) each.
keyword_matrix = torch.stack(list(keyword_embeddings.values()))

while bootstrapping_active:
    print("=====================================================================================================")
    print("**Current threshold: ", threshold, " | Current Round: ", round_number)
    # Start every round from the pretrained weights and retrain on the
    # (possibly enlarged) example list.
    model = SentenceTransformer('all-MiniLM-L6-v2')

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=3,
        warmup_steps=100,
        show_progress_bar=False
    )

    # Snapshot the unlabelled rows up front; rows labelled during this pass
    # are written back through data.at and are skipped in later rounds.
    unlabeled_lyrics = data.loc[data[category].isna(), 'lyrics']
    for index, lyric in unlabeled_lyrics.items():
        words = lyric.split()

        for i in range(0, len(words) - segment_len + 1, segment_stride):
            segment = ' '.join(words[i:i + segment_len])
            segment_embedding = model.encode(segment, convert_to_tensor=True)

            # Best cosine similarity between this window and any keyword.
            similarity_score = util.pytorch_cos_sim(
                segment_embedding, keyword_matrix
            ).max().item()

            if similarity_score > threshold:
                # NOTE(review): the stored label is 'M' while the message
                # below says 'T' -- confirm which one is intended.
                data.at[index, category] = 'M'
                print("Threshold: ", threshold, " | Similarity Score: ", similarity_score)
                print(f"New 'T' sample in {category} category: Row {index + 1}, Segment: '{segment}'")

                train_examples.append(InputExample(texts=[segment, segment], label=1.0))
                break

    # Release cached GPU memory once per round rather than once per row. The
    # old per-row `del segment_embedding` raised NameError for lyrics shorter
    # than one window.
    gc.collect()
    torch.cuda.empty_cache()

    # Step the threshold down with a small tolerance so accumulated float
    # error (e.g. 0.58 - 0.02 != 0.56 exactly) cannot add or drop a round.
    if threshold - min_threshold > 1e-9:
        threshold = max(min_threshold, threshold - threshold_step)
    else:
        bootstrapping_active = False

    round_number += 1

# Rows that were never promoted in the loop above keep a missing value in the
# category column. To export them as explicit negatives instead, enable:
#     data[category] = data[category].fillna('F')

# Write the labelled table next to the notebook, without the index column.
output_path = 'processed_lyrics_with_labels_Sexual.csv'
data.to_csv(output_path, index=False, encoding='utf-8')
print("Final output saved.")




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Sexual Keywords (24 total):
one night stand, orgasm, ass, sexuality, dick, seductive, climax, virgin, bra, tit, erotic, masturbate, blow job, lewd, breast, sex, make love, asshole, sexy, sensual, panty, tease, vagina, nude
**Current threshold:  0.62  | Current Round:  1
{'train_runtime': 8.8423, 'train_samples_per_second': 334.527, 'train_steps_per_second': 21.035, 'train_loss': 0.0092355102621099, 'epoch': 3.0}
Threshold:  0.62  | Similarity Score:  0.6747095584869385
New 'T' sample in Sexual category: Row 78, Segment: 'love with each other with every passing day we share the thought of knowing someone care just being together making love so tenderly exploding into'
Threshold:  0.62  | Similarity Score:  0.6245061159133911
New 'T' sample in Sexual category: Row 1128, Segment: 'only got one night though we can do it twice though it lit at the night show ooh at the night show at the night'
**Current threshold:  0.6  | Current Round:  1
{'train_runtime': 8.9618, 'train_samples_per_secon