# Lyrics Sentiment Analysis Preprocessing

## Project Overview
This notebook implements preprocessing steps for lyrics sentiment analysis using a lexicon-based approach.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import nltk
import re

# Download necessary NLTK resources.
# 'punkt' covers older NLTK releases; recent releases make word_tokenize load
# 'punkt_tab' instead, so both are requested. nltk.download() reports an
# unknown or already-present package without raising, so the extra request is
# harmless on versions that do not know 'punkt_tab'.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt

In [None]:
# Data Loading Function
def load_lyrics_dataset(file_path):
    """
    Read a lyrics CSV file (UTF-8) into a DataFrame.

    A status line is printed in both outcomes: the shape of the loaded
    data on success, or the error message on failure.

    Parameters:
    -----------
    file_path : str
        Location of the CSV file holding the lyrics

    Returns:
    --------
    pandas.DataFrame or None
        The loaded data, or None when anything went wrong while loading
    """
    loaded = None
    try:
        frame = pd.read_csv(file_path, encoding='utf-8')
        print(f"Dataset loaded successfully. Shape: {frame.shape}")
        # Only publish the frame once the whole success path has completed
        loaded = frame
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None
        print(f"Error loading dataset: {err}")
    return loaded

In [None]:
# Text Preprocessing Functions
def clean_text(text):
    """
    Normalise raw text for later tokenisation.

    Steps, in order:
    1. Lowercase the text
    2. Drop every character that is not an ASCII letter or whitespace
       (this removes punctuation, digits and accented letters alike)
    3. Collapse whitespace runs into single spaces and trim both ends

    Parameters:
    -----------
    text : str
        Input text to clean; any non-string value (e.g. NaN) is accepted

    Returns:
    --------
    str
        Cleaned text, or an empty string for non-string input
    """
    if isinstance(text, str):
        lowered = text.lower()
        letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
        return re.sub(r'\s+', ' ', letters_only).strip()

    # Missing or non-text cells are treated as empty lyrics
    return ""

# Stopword sets keyed by language; filled on first use so the NLTK corpus
# file is read once instead of once per call (i.e. once per lyrics row).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stopword set, loading it from NLTK only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(text):
    """
    Remove English stopwords from the text

    The text is tokenized with NLTK's word_tokenize and every token that
    appears in the English stopword list is dropped. Tokens are compared
    exactly as produced by the tokenizer (no case folding is done here).

    Parameters:
    -----------
    text : str
        Input text

    Returns:
    --------
    str
        Remaining tokens joined by single spaces
    """
    stop_words = _english_stopwords()
    return ' '.join(
        word for word in word_tokenize(text) if word not in stop_words
    )

def lemmatize_text(text):
    """
    Reduce each word of the text to its WordNet lemma

    The text is split with NLTK's word_tokenize and each token is passed
    through WordNetLemmatizer.lemmatize with its default arguments.

    Parameters:
    -----------
    text : str
        Input text

    Returns:
    --------
    str
        Lemmatized tokens joined by single spaces
    """
    wnl = WordNetLemmatizer()
    lemmas = map(wnl.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)

In [None]:
# Lexicon-based Sentiment Analysis Function
def get_sentiment(text):
    """
    Label the text's sentiment from its TextBlob polarity score

    A polarity above 0.05 is labelled Positive, a polarity below -0.05 is
    labelled Negative, and anything else is labelled Neutral.

    Parameters:
    -----------
    text : str
        Input text

    Returns:
    --------
    str
        Sentiment label (Positive/Negative/Neutral)
    """
    score = TextBlob(text).sentiment.polarity

    # Guard clauses, checked in the order Positive -> Negative -> Neutral
    if score > 0.05:
        return 'Positive'
    if score < -0.05:
        return 'Negative'
    return 'Neutral'

In [None]:
# Preprocessing Pipeline
def preprocess_lyrics(df, lyrics_column='lyrics'):
    """
    Run the full lyrics pipeline and return the enriched dataframe

    The input dataframe is copied, then four columns are added in order:
    'cleaned_lyrics', 'lyrics_no_stopwords', 'lemmatized_lyrics' and
    'sentiment'. Each column is derived from the previous one, starting
    from the raw lyrics column.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe with lyrics
    lyrics_column : str, optional
        Name of the column containing lyrics

    Returns:
    --------
    pandas.DataFrame
        Copy of the input with the four derived columns
    """
    # Work on a copy so the caller's dataframe is left untouched
    result = df.copy()

    # (new column, column it is derived from, transformation)
    pipeline = (
        ('cleaned_lyrics', lyrics_column, clean_text),
        ('lyrics_no_stopwords', 'cleaned_lyrics', remove_stopwords),
        ('lemmatized_lyrics', 'lyrics_no_stopwords', lemmatize_text),
        ('sentiment', 'lemmatized_lyrics', get_sentiment),
    )
    for target, source, transform in pipeline:
        result[target] = result[source].apply(transform)

    return result

In [None]:
# Visualization Functions
def plot_sentiment_distribution(df):
    """
    Show a pie chart of how often each sentiment label occurs

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataframe with a 'sentiment' column
    """
    # One slice per distinct label, sized by its number of rows
    counts = df['sentiment'].value_counts()
    slice_labels = counts.index

    plt.figure(figsize=(10, 6))
    plt.pie(counts, labels=slice_labels, autopct='%1.1f%%')
    plt.title('Sentiment Distribution in Lyrics')
    plt.show()

def word_frequency_analysis(df, lyrics_column='lemmatized_lyrics', top_n=20):
    """
    Perform basic word frequency analysis

    All entries of the chosen column are joined into one text, tokenized
    with NLTK's word_tokenize and counted with nltk.FreqDist.

    Parameters:
    -----------
    df : pandas.DataFrame
        Preprocessed dataframe
    lyrics_column : str, optional
        Column containing processed lyrics
    top_n : int, optional
        Number of most frequent words to return (default 20)

    Returns:
    --------
    pandas.DataFrame
        Columns 'Word' and 'Frequency', sorted by descending frequency,
        with at most `top_n` rows
    """
    # Missing values (e.g. empty lyrics that became NaN after a CSV round
    # trip) would make str.join raise TypeError, so they are skipped.
    lyrics = df[lyrics_column].dropna().astype(str)

    # Combine all lyrics
    all_lyrics = ' '.join(lyrics)

    # Tokenize and count
    word_freq = nltk.FreqDist(word_tokenize(all_lyrics))

    # Convert to dataframe
    freq_df = pd.DataFrame.from_dict(word_freq, orient='index', columns=['Frequency'])
    freq_df.index.name = 'Word'
    freq_df = freq_df.reset_index().sort_values('Frequency', ascending=False)

    return freq_df.head(top_n)

In [None]:
# Main Execution
def main(file_path='path/to/your/lyrics_dataset.csv',
         output_path='processed_lyrics_dataset.csv'):
    """
    Run the whole workflow: load, preprocess, visualize, report and save

    Parameters:
    -----------
    file_path : str, optional
        CSV file with the raw lyrics. The default is a placeholder;
        replace it with (or pass) the actual path.
    output_path : str, optional
        Where the processed dataset is written as CSV (without the index)
    """
    # Load the dataset
    df = load_lyrics_dataset(file_path)

    # The loader returns None on failure and has already printed the error
    if df is None:
        return

    # Preprocess lyrics
    processed_df = preprocess_lyrics(df)

    # Visualize sentiment distribution
    plot_sentiment_distribution(processed_df)

    # Perform word frequency analysis
    top_words = word_frequency_analysis(processed_df)
    print("Top 20 Most Frequent Words:")
    print(top_words)

    # Save processed dataset
    processed_df.to_csv(output_path, index=False)

# Entry point: run the pipeline only when this file is executed directly
if __name__ == "__main__":
    main()