In [None]:
# Block 1 - Import Libraries
import pandas as pd
import numpy as np
import nltk
import re

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Block 2 - Data Loading Function
def load_and_explore_dataset(file_path, sample_size=1000, random_state=42):
    """Load a lyrics CSV, print a short summary, and return its English rows.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8). The file
            must provide ``language`` and ``tag`` columns.
        sample_size: Maximum number of rows to keep via random sampling.
            Files with fewer rows are used in full; ``None`` disables
            sampling altogether.
        random_state: Seed handed to ``DataFrame.sample`` so the same rows
            are drawn on every run.

    Returns:
        A copy of the sampled rows whose ``language`` equals ``'en'``, or
        ``None`` when loading fails (unreadable file, missing column, ...).
        In that case the error is printed instead of being raised.
    """
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        if sample_size is not None:
            # DataFrame.sample raises ValueError when n exceeds the number of
            # rows (sampling without replacement), so cap the request at the
            # size of the file instead of failing on small datasets.
            df = df.sample(n=min(sample_size, len(df)), random_state=random_state)
        print("Dataset Information:")
        print(f"Total number of entries (limited): {len(df)}")
        print("\nLanguage Distribution:")
        print(df['language'].value_counts())
        print("\nGenre Distribution:")
        print(df['tag'].value_counts().head())
        # .copy() detaches the result from the sampled frame so later column
        # assignments do not operate on a view of it.
        df_english = df[df['language'] == 'en'].copy()
        print(f"\nNumber of English lyrics: {len(df_english)}")
        return df_english
    except Exception as e:
        # Deliberate best-effort: callers receive None and must check for it.
        print(f"Error loading dataset: {e}")
        return None

# Load the dataset for the rest of the notebook.
# NOTE(review): file_path is an absolute, machine-specific location; point it
# at your local copy of the CSV before running.
# load_and_explore_dataset returns None when loading fails, so df_english may
# be None after this cell.
file_path = '/home/deepak/dsciProject/new/song_lyrics.csv'  # Update this path
df_english = load_and_explore_dataset(file_path)

In [None]:
# Block 3 - Text Preprocessing Functions
def clean_lyrics(text):
    """Normalise raw lyrics to lowercase ASCII letters separated by single spaces.

    Non-string input (for example a missing value) yields an empty string.
    Otherwise the text is lowercased and three substitutions run in order:
    bracketed tags such as ``[Chorus]`` are dropped, every character that is
    neither ``a-z`` nor whitespace is dropped, and whitespace runs collapse
    to one space. Leading and trailing whitespace is stripped from the result.
    """
    if isinstance(text, str):
        # Order matters: tags must go before the brackets themselves are
        # removed by the letters-only rule.
        substitutions = (
            (r'\[.*?\]', ''),
            (r'[^a-z\s]', ''),
            (r'\s+', ' '),
        )
        cleaned = text.lower()
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ""

# Stop-word sets keyed by language, filled on first use so the corpus file is
# read once instead of once per lyric.
_STOPWORD_CACHE = {}


def _get_stopwords(language='english'):
    """Return the NLTK stop-word set for `language`, loading it at most once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is split with NLTK's ``word_tokenize``; tokens found in NLTK's
    English stop-word list are discarded and the remaining tokens are joined
    with single spaces.

    Args:
        text: Text to filter. Matching is case-sensitive, so the text should
            already be lowercased.

    Returns:
        The filtered text, or an empty string when `text` is empty or not a
        string (the same convention `clean_lyrics` uses for missing values).
    """
    if not isinstance(text, str) or not text:
        return ""
    stop_words = _get_stopwords('english')
    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_words
    )

In [None]:
# Block 4 - Advanced Preprocessing and Sentiment Analysis
def preprocess_lyrics_dataset(df):
    """Return a copy of `df` with cleaned and stop-word-filtered lyric columns.

    Two columns are added to the copy, each derived from the one before it:
    ``cleaned_lyrics`` is ``clean_lyrics`` applied to ``lyrics``, and
    ``processed_lyrics`` is ``remove_stopwords`` applied to
    ``cleaned_lyrics``. The input frame is left unmodified.
    """
    result = df.copy()
    # (new column, source column, transform) - applied in order because the
    # second step reads the column produced by the first.
    pipeline = (
        ('cleaned_lyrics', 'lyrics', clean_lyrics),
        ('processed_lyrics', 'cleaned_lyrics', remove_stopwords),
    )
    for target, source, transform in pipeline:
        result[target] = result[source].apply(transform)
    return result

def analyze_lyrics_sentiment(text):
    """Score `text` with TextBlob and label it Positive, Negative or Neutral.

    Returns a dict with three keys:
        ``sentiment``    - ``'Positive'`` when polarity is above 0.05,
                           ``'Negative'`` when it is below -0.05, otherwise
                           ``'Neutral'`` (both boundary values are Neutral).
        ``polarity``     - the polarity reported by TextBlob.
        ``subjectivity`` - the subjectivity reported by TextBlob.
    """
    scores = TextBlob(text).sentiment
    polarity = scores.polarity

    label = 'Neutral'
    if polarity > 0.05:
        label = 'Positive'
    elif polarity < -0.05:
        label = 'Negative'

    return dict(
        sentiment=label,
        polarity=polarity,
        subjectivity=scores.subjectivity,
    )

# Run the preprocessing pipeline on the English lyrics, then score each
# processed lyric. analyze_lyrics_sentiment returns one dict per row, so
# sentiment_results is a Series of dicts.
processed_df = preprocess_lyrics_dataset(df_english)
sentiment_results = processed_df['processed_lyrics'].apply(analyze_lyrics_sentiment)
# Unpack the per-row dicts into one column per key.
processed_df['sentiment'] = sentiment_results.apply(lambda x: x['sentiment'])
processed_df['polarity'] = sentiment_results.apply(lambda x: x['polarity'])
processed_df['subjectivity'] = sentiment_results.apply(lambda x: x['subjectivity'])

In [None]:
# Block 5 - Visualization Functions
def visualize_sentiment_distribution(df):
    """Show overall and per-genre sentiment shares side by side in one figure.

    Left panel: pie chart of the ``sentiment`` label counts.
    Right panel: stacked bars giving, for every ``tag`` (genre), the
    proportion of each sentiment label.

    Args:
        df: Frame providing ``sentiment`` and ``tag`` columns.

    Returns:
        None. The figure is displayed with ``plt.show()``. An empty frame
        prints a notice and draws nothing.
    """
    if df.empty:
        print("No data available to visualize.")
        return

    fig, (ax_overall, ax_genre) = plt.subplots(1, 2, figsize=(12, 5))

    sentiment_counts = df['sentiment'].value_counts()
    ax_overall.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%')
    ax_overall.set_title('Sentiment Distribution')

    genre_sentiment = df.groupby('tag')['sentiment'].value_counts(normalize=True).unstack()
    # DataFrame.plot opens a brand-new figure unless it is given an Axes, which
    # would leave the right-hand panel empty; draw on it explicitly.
    genre_sentiment.plot(kind='bar', stacked=True, ax=ax_genre)
    ax_genre.set_title('Sentiment Distribution by Genre')
    ax_genre.set_xlabel('Genre')
    ax_genre.set_ylabel('Proportion')
    # Anchor the legend outside the axes so it does not cover the bars.
    ax_genre.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

    fig.tight_layout()
    plt.show()

def analyze_sentiment_by_features(df):
    """Show sentiment shares by year and by most frequent artists in one figure.

    Left panel: one line per sentiment label giving its proportion among the
    songs of each ``year``.
    Right panel: stacked bars with the sentiment proportions for the ten
    artists that have the most rows in `df`.

    Args:
        df: Frame providing ``sentiment``, ``year`` and ``artist`` columns.

    Returns:
        None. The figure is displayed with ``plt.show()``. An empty frame
        prints a notice and draws nothing.
    """
    if df.empty:
        print("No data available to visualize.")
        return

    fig, (ax_years, ax_artists) = plt.subplots(1, 2, figsize=(12, 5))

    yearly_sentiment = df.groupby('year')['sentiment'].value_counts(normalize=True).unstack()
    # DataFrame.plot opens a brand-new figure unless it is given an Axes, which
    # would leave both panels of this figure empty; draw on them explicitly.
    yearly_sentiment.plot(kind='line', marker='o', ax=ax_years)
    ax_years.set_title('Sentiment Trends Over Years')
    ax_years.set_xlabel('Year')
    ax_years.set_ylabel('Proportion')
    ax_years.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

    top_artists = df['artist'].value_counts().head(10).index
    artist_sentiment = (
        df[df['artist'].isin(top_artists)]
        .groupby('artist')['sentiment']
        .value_counts(normalize=True)
        .unstack()
    )
    artist_sentiment.plot(kind='bar', stacked=True, ax=ax_artists)
    ax_artists.set_title('Sentiment Distribution for Top Artists')
    ax_artists.set_xlabel('Artist')
    ax_artists.set_ylabel('Proportion')
    ax_artists.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

    fig.tight_layout()
    plt.show()

# Render the sentiment figures for the scored lyrics: first the overall and
# per-genre distribution, then the per-year and top-artist breakdowns.
visualize_sentiment_distribution(processed_df)
analyze_sentiment_by_features(processed_df)

In [None]:
# Block 6 - Save Results
# Save the processed dataset (original columns plus the cleaned/processed
# lyrics and sentiment columns added above) as CSV. The path is relative, so
# the file lands in the current working directory; index=False omits the
# DataFrame index from the output.
processed_df.to_csv('processed_genius_lyrics_dataset.csv', index=False)