In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy


In [7]:
# Read the training set (a single CSV file) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='training_backup.csv')

In [None]:
# Shared spaCy pipeline; preprocess_text() below reads this module-level name.
nlp = spacy.load(name="en_core_web_sm")

# Function to preprocess text
def preprocess_text(text):
    """Normalize one piece of text into a space-separated string of lemmas.

    Steps: lowercase, delete every character that is not a-z or whitespace,
    run the module-level spaCy pipeline ``nlp``, then keep the lemma of each
    token that is not a stop word, punctuation, or pure whitespace.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            NaN pandas produces for a missing CSV cell) is treated as empty.

    Returns:
        The kept lemmas joined by single spaces; ``""`` for non-string input.
    """
    # Non-string values have no .lower(); map them to an empty result so that
    # Series.apply over a column with missing values does not crash.
    if not isinstance(text, str):
        return ""

    # Convert text to lowercase
    text = text.lower()

    # Remove numbers and non-alphabetic characters (whitespace is kept)
    text = re.sub(r'[^a-z\s]', '', text)

    doc = nlp(text)
    # Newlines and runs of spaces survive the regex and come back from spaCy
    # as whitespace tokens; they are neither stop words nor punctuation, so
    # they must be filtered explicitly or they leak into the joined output.
    lemmatized = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return " ".join(lemmatized)

In [None]:
# Replace the 'lyrics' column with its cleaned, lemmatized form
# (preprocess_text is called once per row).
data['lyrics'] = data['lyrics'].apply(func=preprocess_text)

In [10]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from typing import List
import torch

def back_translate(texts: List[str], src_lang: str, tgt_lang: str, max_length: int) -> List[str]:
    """Paraphrase texts by translating them to another language and back.

    Each text is translated ``src_lang -> tgt_lang`` and the result is then
    translated ``tgt_lang -> src_lang``, using the
    ``Helsinki-NLP/opus-mt-{a}-{b}`` MarianMT checkpoints for the two
    directions.

    Args:
        texts: Texts written in ``src_lang``. Any iterable of strings works.
        src_lang: Language code of the input, e.g. ``'en'``.
        tgt_lang: Language code of the pivot language, e.g. ``'fr'``.
        max_length: Maximum number of input tokens per translation step.
            Longer inputs are truncated by the tokenizer, so the tail of a
            long text is dropped rather than translated.

    Returns:
        One back-translated string per input text, in the same order. An
        empty input yields ``[]`` without loading any model.
    """
    texts = list(texts)
    if not texts:
        # Nothing to translate: skip loading four pretrained artifacts.
        return []

    forward_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    backward_name = f'Helsinki-NLP/opus-mt-{tgt_lang}-{src_lang}'

    # Initialize tokenizers and models
    tokenizer_tgt = MarianTokenizer.from_pretrained(forward_name)
    model_tgt = MarianMTModel.from_pretrained(forward_name)
    tokenizer_src = MarianTokenizer.from_pretrained(backward_name)
    model_src = MarianMTModel.from_pretrained(backward_name)

    # Function to translate text and handle max_length
    def translate(text, tokenizer, model):
        """Translate a single string with the given tokenizer/model pair."""
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            translated = model.generate(**inputs)
        # One input string was encoded, so only the first sequence is decoded.
        return tokenizer.decode(translated[0], skip_special_tokens=True)

    # Translate to the target language and back, one text at a time
    return [
        translate(translate(text, tokenizer_tgt, model_tgt), tokenizer_src, model_src)
        for text in texts
    ]

# Settings for the augmentation run
MINORITY_MOOD = 4          # label of the class being augmented
MAX_AUGMENT_SAMPLES = 100  # how many minority lyrics to back-translate
MAX_TOKENS = 512           # per-text token limit passed to back_translate

# Load your dataset
df = pd.read_csv('training_backup.csv')

# Select the minority class lyrics; rows with missing lyrics are dropped
# because the tokenizer only accepts strings.
minority_lyrics = df.loc[df['mood'] == MINORITY_MOOD, 'lyrics'].dropna().tolist()

# Perform back-translation (English -> French -> English)
augmented_lyrics = back_translate(minority_lyrics[:MAX_AUGMENT_SAMPLES], 'en', 'fr', MAX_TOKENS)

# Create a new DataFrame with the augmented data
augmented_df = pd.DataFrame({'lyrics': augmented_lyrics, 'mood': MINORITY_MOOD})

# Combine the original dataset with the augmented data; ignore_index avoids
# duplicate index labels between the original and the appended rows.
augmented_dataset = pd.concat([df, augmented_df], ignore_index=True)

# Save the augmented dataset (original rows plus the back-translated rows).
# NOTE(review): the previous version wrote only `augmented_df` here and left
# `augmented_dataset` unused — confirm no downstream step expects this file
# to contain only the synthetic rows.
augmented_dataset.to_csv('augmented_lyrics_mood.csv', index=False)


