In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora used below (WordNet for lemmatizing, the stop-word lists)
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)

# Shared English stop-word set and the lemmatizer reused by every helper
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Words that must be passed through untouched instead of being lemmatized
special_words = {"ass"}  # Add any other specific words here

def preprocess_text(text):
    """Clean a raw lyric string and lemmatize its words.

    Non-string input yields an empty string. Otherwise ``<...>`` tags are
    replaced by spaces, a fixed set of punctuation marks is deleted,
    whitespace is collapsed, every remaining non-letter becomes a space and
    the text is lowercased. Each resulting token is lemmatized unless it is
    listed in ``special_words``; the tokens are returned joined by single
    spaces.
    """
    if not isinstance(text, str):
        return ""

    # Tags become a space so the words on either side stay separated
    cleaned = re.sub(r'<.*?>', ' ', text)
    # These punctuation marks are removed outright (no space inserted)
    cleaned = re.sub(r'[,\.\!?:()"]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Anything that is still not a letter or whitespace turns into a space
    cleaned = re.sub(r'[^a-zA-Z\s]', ' ', cleaned).lower()

    tokens = []
    for token in cleaned.split():
        if token in special_words:
            # Protected word: keep exactly as written
            tokens.append(token)
        else:
            tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(tokens)

def normalize_word(word):
    """Return ``word`` unchanged if it is in ``special_words``, else its lemma."""
    if word in special_words:
        return word
    return lemmatizer.lemmatize(word)

def preprocess_word_dict(word_dict_df):
    """Build a ``{category: [normalized phrase, ...]}`` mapping from a DataFrame.

    Each column of ``word_dict_df`` is one category and each non-null cell is
    a keyword or multi-word phrase. Every phrase is lowercased, split on
    whitespace, normalized word by word with ``normalize_word`` and re-joined
    with single spaces.

    Cells that are not strings, or that contain only whitespace, are skipped.
    A blank phrase would otherwise become ``''``, and a word-boundary regex
    built from an empty phrase matches almost any text.

    Args:
        word_dict_df: DataFrame whose columns are category names.

    Returns:
        dict mapping each column name to its list of normalized phrases, in
        the order the phrases appear in the column.
    """
    normalized_dict = {}
    for category in word_dict_df.columns:
        normalized_phrases = []
        for phrase in word_dict_df[category].dropna():
            # Mirror preprocess_text: anything that is not text is ignored
            if not isinstance(phrase, str):
                continue
            # Lyrics are lowercased during preprocessing, so the phrases
            # they are compared against must be lowercased as well
            words = phrase.lower().split()
            if not words:
                continue  # whitespace-only cell: nothing to match on
            normalized_phrases.append(
                ' '.join(normalize_word(word) for word in words)
            )
        normalized_dict[category] = normalized_phrases
    return normalized_dict

def _normalize_lyric(text):
    """Normalize every whitespace-separated word of ``text``; non-strings give ''."""
    if not isinstance(text, str):
        return ''
    return ' '.join(normalize_word(word) for word in text.split())


def _matching_phrases(text, patterns):
    """Return the phrases whose compiled pattern is found in ``text``."""
    return [phrase for phrase, pattern in patterns if pattern.search(text)]


def label_lyrics_by_category(
    lyrics_df,
    normalized_dict,
    explicit_categories=('Sexual', 'Violence', 'Substance', 'Language'),
):
    """Flag each lyric with the keyword categories it matches.

    ``lyrics_df`` is modified in place and also returned. The following
    columns are added:

    * ``normalized_lyrics`` - the ``lyrics`` column with every word passed
      through ``normalize_word`` (non-string lyrics become ``''``).
    * ``<category>`` - ``'T'`` if any phrase of that category occurs in the
      normalized lyric as a whole word/phrase, otherwise ``'F'``.
    * ``<category>_words`` - the matching phrases joined by ``', '``.
    * ``Explicit_by_def`` - ``'T'`` if any column named in
      ``explicit_categories`` is ``'T'``, otherwise ``'F'``.

    Args:
        lyrics_df: DataFrame with a ``lyrics`` column.
        normalized_dict: mapping of category name to list of phrases.
        explicit_categories: category columns combined into
            ``Explicit_by_def``. Each must exist in ``lyrics_df`` after
            labeling, otherwise pandas raises ``KeyError``.

    Returns:
        The same ``lyrics_df`` object with the columns above added.
    """
    # The normalized text does not depend on the category, so compute it once
    # up front. This also guarantees the column exists when the dictionary
    # is empty.
    normalized_lyrics = lyrics_df['lyrics'].apply(_normalize_lyric)
    lyrics_df['normalized_lyrics'] = normalized_lyrics

    for category, keywords in normalized_dict.items():
        # Compile each phrase once per category; blank phrases are skipped
        # because an empty pattern between two \b anchors matches any word.
        patterns = [
            (phrase, re.compile(rf'\b{re.escape(phrase)}\b'))
            for phrase in keywords
            if phrase.strip()
        ]
        # One regex pass per row feeds both the flag and the word list
        hits_per_row = [
            _matching_phrases(text, patterns) for text in normalized_lyrics
        ]
        lyrics_df[category] = ['T' if hits else 'F' for hits in hits_per_row]
        lyrics_df[f"{category}_words"] = [', '.join(hits) for hits in hits_per_row]

    # Define Explicit_by_def column: T if any listed category is T, else F
    explicit = pd.Series(False, index=lyrics_df.index)
    for category in explicit_categories:
        explicit = explicit | (lyrics_df[category] == 'T')
    lyrics_df['Explicit_by_def'] = ['T' if flag else 'F' for flag in explicit]

    return lyrics_df

def save_combined_results(lyrics_df, output_path):
    """Write the labeled lyrics to ``output_path`` as a CSV file.

    The helper column ``normalized_lyrics`` is dropped from ``lyrics_df``
    in place before writing. A frame that does not have the column, for
    example one that was already saved once, is written as-is instead of
    raising ``KeyError``. The file is written without the index, encoded as
    UTF-8 with a BOM (``utf-8-sig``).

    Args:
        lyrics_df: DataFrame to save; modified in place.
        output_path: destination passed to ``DataFrame.to_csv``.
    """
    lyrics_df.drop(columns=['normalized_lyrics'], inplace=True, errors='ignore')
    lyrics_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print("File has been generated successfully!")

# Destination of the single combined CSV
output_path = 'labeled_by_dict.csv'

# Read the keyword dictionary and the unlabeled lyrics
word_dict_df = pd.read_csv('expanded_word_dict_2.csv')
lyrics_df = pd.read_csv('clean_unlabeled_utf8.csv')

# Normalize the dictionary phrases, then clean the lyrics column
normalized_dict = preprocess_word_dict(word_dict_df)
cleaned_lyrics = lyrics_df['lyrics'].apply(preprocess_text)
lyrics_df['lyrics'] = cleaned_lyrics

# Add the per-category flags and matched-word columns
lyrics_df = label_lyrics_by_category(lyrics_df, normalized_dict)

# Write everything out in one file
save_combined_results(lyrics_df, output_path)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


File has been generated successfully!
