In [1]:
!pip install datasets
!pip install -U sentence-transformers transformers
from datasets import Dataset
import os
import datasets
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
from nltk.stem import WordNetLemmatizer
import nltk
import gc
import torch
os.environ["WANDB_MODE"] = "disabled"
nltk.download("wordnet")

data = pd.read_csv('merged_unlabeled_all.csv', encoding='utf-8', on_bad_lines='skip')

lemmatizer = WordNetLemmatizer()
def lemmatize_lyrics(text):
    return ' '.join(['ass' if word == 'ass' else lemmatizer.lemmatize(word) for word in text.split()])

data['lyrics'] = data['lyrics'].apply(lambda x: lemmatize_lyrics(str(x)))

category = 'Violence'
keyword_col = 'Violence_words'
keywords = data[keyword_col].dropna().tolist()
split_keywords = []
for phrase in keywords:
    split_keywords.extend([kw.strip() for kw in phrase.split(',')])
all_keywords_cleaned = list(set(split_keywords))

print(f"\n{category} Keywords ({len(all_keywords_cleaned)} total):")
print(", ".join(all_keywords_cleaned))

model = SentenceTransformer('all-MiniLM-L6-v2')

keyword_embeddings = {keyword: model.encode(keyword, convert_to_tensor=True) for keyword in all_keywords_cleaned}

threshold = 0.62
min_threshold = 0.56
window_size = 12

train_examples = []
for index, row in data.iterrows():
    lyric = row['lyrics']
    label = row[category]
    if label == 'T':
        for keyword in all_keywords_cleaned:
            if keyword in lyric:
                start = max(0, lyric.find(keyword) - window_size)
                end = min(len(lyric.split()), start + window_size * 2 + 1)
                segment = ' '.join(lyric.split()[start:end])
                train_examples.append(InputExample(texts=[segment, segment], label=1.0))

bootstrapping_active = True
round = 1
while bootstrapping_active:
    print("=====================================================================================================")
    print("**Current threshold: ", threshold, " | Current Round: ", round)
    model = SentenceTransformer('all-MiniLM-L6-v2')

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=3,
        warmup_steps=100,
        show_progress_bar=False
    )

    for index, row in data.iterrows():
        if pd.isna(row[category]):
            lyric = row['lyrics']
            words = lyric.split()
            is_true = False

            for i in range(0, len(words) - 25 + 1, 6):
                segment = ' '.join(words[i:i + 25])
                segment_embedding = model.encode(segment, convert_to_tensor=True)

                similarity_score = max(
                    util.pytorch_cos_sim(segment_embedding, keyword_embedding).item()
                    for keyword_embedding in keyword_embeddings.values()
                )

                if similarity_score > threshold:
                    data.at[index, category] = 'M'
                    print("Threshold: ", threshold, " | Similarity Score: ", similarity_score)
                    print(f"New 'T' sample in {category} category: Row {index + 1}, Segment: '{segment}'")

                    train_examples.append(InputExample(texts=[segment, segment], label=1.0))
                    break

            del segment_embedding
            torch.cuda.empty_cache()

    if threshold > min_threshold:
        threshold -= 0.02
    else:
        bootstrapping_active = False

    # del train_dataloader, train_loss, model
    # torch.cuda.empty_cache()
    # gc.collect()

# data[category].fillna('F', inplace=True)

output_path = 'processed_lyrics_with_labels_Violence.csv'
data.to_csv(output_path, encoding='utf-8', index=False)
print(f"Final output saved.")


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

[nltk_data] Downloading package wordnet to /root/nltk_data...



Violence Keywords (11 total):
attack, homicide, slay, knife, gun, punch, stab, murder, kill, smash, assault


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

**Current threshold:  0.62  | Current Round:  1




{'train_runtime': 3.4378, 'train_samples_per_second': 209.437, 'train_steps_per_second': 13.09, 'train_loss': 0.03277496496836344, 'epoch': 3.0}
Threshold:  0.62  | Similarity Score:  0.6406422853469849
New 'T' sample in Violence category: Row 534, Segment: 'the punch thats the way it go you gotta bend when the wind blow you live you learn you crash and burn it hit or'
**Current threshold:  0.6  | Current Round:  1
{'train_runtime': 2.2225, 'train_samples_per_second': 325.31, 'train_steps_per_second': 21.597, 'train_loss': 0.030416006843249004, 'epoch': 3.0}
**Current threshold:  0.58  | Current Round:  1
{'train_runtime': 2.5428, 'train_samples_per_second': 284.332, 'train_steps_per_second': 18.877, 'train_loss': 0.030416006843249004, 'epoch': 3.0}
Threshold:  0.58  | Similarity Score:  0.5816056728363037
New 'T' sample in Violence category: Row 13, Segment: 'you came up to me and asked me my name you beat me to the punch that time you beat me to the punch you'
**Current threshold:  0