# Spotify Dataset Sentiment Analysis using Naïve Bayes

This notebook implements sentiment classification using MultinomialNB with comprehensive text preprocessing techniques.

**🚀 Google Colab Ready** - All dependencies will be installed automatically!

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/EhsanulHaqueSiam/spotify-review-sentiment/blob/main/spotify_sentiment_analysis_colab.ipynb)

## 🔧 Setup: Install Dependencies (Colab Only)

**Note**: This cell installs the required packages. Skip it if you are running locally with an existing environment.

In [None]:
# Install required packages for Google Colab
import sys

# Check if running in Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("🔧 Installing packages for Google Colab...")
    
    # Install packages
    !pip install -q kagglehub spacy seaborn
    
    # Download spaCy English model
    !python -m spacy download en_core_web_sm
    
    print("✅ All packages installed successfully!")
else:
    print("📝 Running locally - assuming dependencies are already installed")

print("🎵 Ready to start sentiment analysis!")

## 🔑 Kaggle API Setup (Required for Dataset)

**For Google Colab users**: Use Colab Secrets (recommended), manual credentials, or upload `kaggle.json` file.

In [None]:
import os
import json

# Configure Kaggle API credentials when running in Colab; local runs only print
# a notice. Strategy: try Colab Secrets first; if anything inside that attempt
# raises, print manual-setup hints and then prompt for a kaggle.json upload.
if IN_COLAB:
    print("🔑 Setting up Kaggle API for Google Colab...")
    
    try:
        # Option 1: Use Google Colab Secrets (Recommended)
        from google.colab import userdata
        
        # NOTE: these instructions are printed before the lookup below, so they
        # are shown on every Colab run, including runs where the secrets exist.
        print("📱 Using Google Colab Secrets...")
        print("💡 To set up secrets:")
        print("   1. Click the 🔑 key icon in the left sidebar")
        print("   2. Add two secrets:")
        print("      - Name: KAGGLE_USERNAME, Value: your_kaggle_username")
        print("      - Name: KAGGLE_KEY, Value: your_kaggle_api_key")
        print("   3. Enable notebook access for both secrets")
        
        # Get credentials from secrets
        kaggle_username = userdata.get('KAGGLE_USERNAME')
        kaggle_key = userdata.get('KAGGLE_KEY')
        
        # Export the credentials as environment variables for this process.
        # NOTE(review): presumably kagglehub reads these two variables -- confirm.
        os.environ['KAGGLE_USERNAME'] = kaggle_username
        os.environ['KAGGLE_KEY'] = kaggle_key
        
        print("✅ Kaggle credentials loaded from secrets!")
        
    except Exception as e:
        # Any exception raised above (import, secret lookup, or exporting the
        # values) lands here and switches to the fallback options.
        print(f"⚠️ Secrets method failed: {e}")
        print("\n🔧 Fallback options:")
        
        # Option 2: Manual environment variables.
        # Only instructions are printed; nothing is set unless the commented
        # lines below are edited and uncommented by the user.
        print("\n📝 Option A: Set credentials manually")
        print("Uncomment and fill in the lines below:")
        print("# os.environ['KAGGLE_USERNAME'] = 'your_username'")
        print("# os.environ['KAGGLE_KEY'] = 'your_api_key'")
        
        # Uncomment these lines and add your credentials:
        # os.environ['KAGGLE_USERNAME'] = 'your_username'
        # os.environ['KAGGLE_KEY'] = 'your_api_key'
        
        # Option 3: Upload kaggle.json file.
        # The upload prompt runs unconditionally after the hints above.
        print("\n📁 Option B: Upload kaggle.json file")
        try:
            from google.colab import files
            print("Click to upload your kaggle.json file:")
            uploaded = files.upload()
            
            # The check is on the exact uploaded file name 'kaggle.json'.
            if 'kaggle.json' in uploaded:
                # Copy the file into ~/.kaggle and restrict it to owner read/write.
                # The uploaded copy in the working directory is not removed.
                !mkdir -p ~/.kaggle
                !cp kaggle.json ~/.kaggle/
                !chmod 600 ~/.kaggle/kaggle.json
                print("✅ Kaggle credentials uploaded successfully!")
            else:
                print("❌ kaggle.json not found in uploaded files")
        except Exception as upload_error:
            # Best effort: report the failure and continue without credentials.
            print(f"❌ File upload failed: {upload_error}")
            print("Please use manual credentials setup above.")
        
else:
    print("📝 Running locally - using existing Kaggle credentials")

print("\n💡 Get your Kaggle API credentials from: https://www.kaggle.com/settings/account")

## Task 1: Import Required Libraries

In [None]:
# Import basic libraries for data handling.
# Later cells refer to these through the aliases `pd` (DataFrames, Series)
# and `np` (argsort, random sampling).
import pandas as pd
import numpy as np

print("Basic libraries imported successfully!")

## Task 2: Load and Explore Dataset

In [None]:
# Import kagglehub for dataset loading
import kagglehub
import os

# Download the Spotify dataset from Kaggle and load it into `df`.
# If anything in the download/read path raises, a small hand-written sample
# frame is built instead so the remaining cells can still be executed.
print("Downloading Spotify dataset from Kaggle...")
try:
    # Download the dataset files to a local directory
    path = kagglehub.dataset_download("alexandrakim2201/spotify-dataset")
    print(f"Dataset downloaded to: {path}")

    # os.listdir() returns entries in arbitrary order; sort so that "the first
    # CSV" is the same file on every run. A dedicated name is used so the
    # `files` module imported from google.colab is not shadowed.
    dataset_files = sorted(os.listdir(path))
    print(f"Available files: {dataset_files}")

    # Find the CSV file (should be the main dataset)
    csv_files = [f for f in dataset_files if f.endswith('.csv')]
    if csv_files:
        csv_file = csv_files[0]  # Take the first CSV file
        file_path = os.path.join(path, csv_file)
        print(f"Loading CSV file: {csv_file}")

        # Load the dataset
        df = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    else:
        print("No CSV files found in the dataset")
        # Fallback: try to load any file as CSV
        if dataset_files:
            file_path = os.path.join(path, dataset_files[0])
            df = pd.read_csv(file_path)
            print(f"Loaded {dataset_files[0]} as CSV")
        else:
            raise FileNotFoundError("No files found in dataset")

except Exception as e:
    # Deliberately broad: download, listing and parsing failures all fall
    # back to the demonstration sample below.
    print(f"Error downloading from Kaggle: {e}")
    print("Please ensure you have Kaggle API credentials set up.")

    if IN_COLAB:
        print("\n🔧 For Google Colab:")
        print("1. Upload kaggle.json file in the previous cell")
        print("2. Or set KAGGLE_USERNAME and KAGGLE_KEY environment variables")

    # Create sample data for demonstration if dataset fails to load.
    # Ten rows (five per class) are the minimum that lets the later stratified
    # 80/20 split succeed: with only five rows the test set would hold a single
    # sample, which is fewer than the two classes, and the split would raise.
    print("\n⚠️ Creating sample data for demonstration...")
    sample_data = {
        'Review': [
            'Great music service, love the playlists!',
            'App crashes frequently, very annoying',
            'Amazing sound quality and user interface',
            'Too expensive for what it offers',
            'Perfect for discovering new music',
            'Terrible app, the music keeps stopping',
            'Love the app, the music recommendations are great',
            'Annoying ads and the app crashes too often',
            'Best music app with amazing playlists',
            'Worst update ever, my playlists keep disappearing'
        ],
        'label': [
            'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE',
            'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE'
        ]
    }
    df = pd.DataFrame(sample_data)
    print("Sample dataset created for demonstration purposes")

# Display basic information about the dataset
print("\nDataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()
print("\nDataset description:")
print(df.describe())

## Task 3: Data Preprocessing and Exploration

In [None]:
# Import visualization libraries
import matplotlib.pyplot as plt

# Column configuration shared by the cells below, followed by a quick look at
# the class balance of the target variable.
# NOTE(review): the dataset is stated to have 51,473 rows with a 'Review' text
# column and a 'label' column (POSITIVE/NEGATIVE) -- confirm against `df`.
sentiment_column = 'label'  # Sentiment labels: POSITIVE/NEGATIVE
text_column = 'Review'  # User review text

if sentiment_column not in df.columns:
    # Nothing to plot: tell the user which columns actually exist.
    print("Please update the sentiment_column variable with the correct column name")
    print("Available columns:", df.columns.tolist())
else:
    # Count each label once and reuse the result for both the table and the chart.
    label_counts = df[sentiment_column].value_counts()
    print("Sentiment distribution:")
    print(label_counts)

    # Bar chart of the label counts, drawn on a fresh 8x6 figure.
    plt.figure(figsize=(8, 6))
    label_counts.plot(kind='bar')
    plt.title('Sentiment Distribution')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Task 4: Text Preprocessing - Initialize Tools

In [None]:
# Import text preprocessing libraries
import nltk
import spacy
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Download required NLTK data
print("📚 Downloading NLTK data...")
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
print("✅ NLTK data downloaded")

# Load spaCy model
try:
    nlp = spacy.load('en_core_web_sm')
    print("✅ spaCy English model loaded")
except OSError:
    print("❌ spaCy English model not found")
    if IN_COLAB:
        print("Installing spaCy model...")
        !python -m spacy download en_core_web_sm
        nlp = spacy.load('en_core_web_sm')
        print("✅ spaCy English model installed and loaded")
    else:
        print("Please install spaCy English model: python -m spacy download en_core_web_sm")
        nlp = None

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print("🔧 Preprocessing tools initialized!")

## Task 5: Text Preprocessing Functions

In [None]:
def clean_text(text):
    """Normalise a raw review into lowercase text without URLs, tags or punctuation.

    Missing values (anything ``pd.isna`` reports as NA) become an empty
    string. Every other input is converted with ``str()`` and then, in order:
    lowercased, stripped of URLs, stripped of ``@mentions`` and ``#hashtags``,
    stripped of ASCII punctuation, and whitespace-collapsed.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isna(text):
        return ""

    # Case folding
    lowered = str(text).lower()

    # Drop URLs, then user mentions and hashtags
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    without_tags = re.sub(r'@\w+|#\w+', '', without_urls)

    # Punctuation characters are deleted outright (not replaced by spaces)
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_tags.translate(punctuation_table)

    # Collapse whitespace runs to single spaces and trim both ends
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Keep only tokens that are not in ``stop_words`` and are longer than two characters."""
    kept = []
    for word in tokens:
        # Skip stop words first, then anything of length 1 or 2.
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(word)
    return kept

def stem_tokens(tokens):
    """Return a new list with the module-level ``stemmer`` applied to each token."""
    return list(map(stemmer.stem, tokens))

def lemmatize_tokens(tokens):
    """Return a new list with the module-level ``lemmatizer`` applied to each token."""
    return list(map(lemmatizer.lemmatize, tokens))

def preprocess_text(text, use_stemming=True, use_lemmatization=False):
    """Run the full pipeline on one document and return a space-joined string.

    Steps: ``clean_text`` -> ``tokenize_text`` -> ``remove_stopwords`` ->
    optional normalisation.

    Args:
        text: Raw input document.
        use_stemming: Apply ``stem_tokens``. Takes precedence when both flags
            are true.
        use_lemmatization: Apply ``lemmatize_tokens``; only consulted when
            ``use_stemming`` is false.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = remove_stopwords(tokenize_text(clean_text(text)))

    # Stemming wins over lemmatization; with both flags false the tokens
    # are returned as they are.
    if use_stemming:
        tokens = stem_tokens(tokens)
    elif use_lemmatization:
        tokens = lemmatize_tokens(tokens)

    return ' '.join(tokens)

print("🔧 Text preprocessing functions defined!")

## Task 6: Apply Text Preprocessing

In [None]:
# Apply preprocessing to the text data.
# Adds a 'processed_text' column to `df`, previews a few rows, and drops rows
# whose processed text is empty.
if text_column in df.columns:
    print("🔄 Applying text preprocessing...")

    # Create processed text column
    df['processed_text'] = df[text_column].apply(lambda x: preprocess_text(x, use_stemming=True))

    # Show examples of original vs processed text.
    # str() protects the preview against missing or non-string reviews: a NaN
    # cell is a float and cannot be sliced, even though clean_text() accepts it.
    print("\n📝 Example of text preprocessing:")
    for i in range(min(3, len(df))):
        print(f"\nOriginal: {str(df[text_column].iloc[i])[:100]}...")
        print(f"Processed: {df['processed_text'].iloc[i][:100]}...")

    # Remove empty processed texts.
    # .copy() makes the filtered frame independent, so re-running this cell
    # assigns 'processed_text' on a real frame instead of a slice of one.
    original_shape = df.shape
    df = df[df['processed_text'].str.len() > 0].copy()
    print(f"\n📊 Dataset shape after preprocessing: {df.shape}")
    if original_shape[0] != df.shape[0]:
        print(f"   Removed {original_shape[0] - df.shape[0]} empty texts")
else:
    print("❌ Please update the text_column variable with the correct column name")
    print("Available columns:", df.columns.tolist())

## Task 7: Feature Extraction using TF-IDF

In [None]:
# Import TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer:
#   max_features=5000  -> keep at most the top 5000 features
#   min_df=2           -> ignore terms that appear in fewer than 2 documents
#   max_df=0.95        -> ignore terms that appear in more than 95% of documents
#   ngram_range=(1, 2) -> use unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.95, ngram_range=(1, 2))

# Vectorise the processed text, or explain why that is not possible yet.
if 'processed_text' not in df.columns:
    print("❌ Processed text not available. Please run the preprocessing step first.")
else:
    print("🔄 Creating TF-IDF features...")
    X_tfidf = tfidf_vectorizer.fit_transform(df['processed_text'])

    # Look the vocabulary up once and reuse it for the count and the preview.
    feature_names = tfidf_vectorizer.get_feature_names_out()
    print(f"📊 TF-IDF matrix shape: {X_tfidf.shape}")
    print(f"📈 Number of features: {len(feature_names)}")

    # Preview the first 20 entries of the vocabulary
    print(f"\n🔤 Sample features: {feature_names[:20]}")
    print("✅ TF-IDF vectorization completed!")

## Task 8: Prepare Data for Training

In [None]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Prepare features and target variables.
# The split is done on the processed *text*, and the TF-IDF vocabulary and IDF
# weights are then learned from the training portion only. Fitting the
# vectorizer on the full corpus before splitting would let test documents
# influence the features the model is trained on (data leakage) and inflate
# the reported scores.
if sentiment_column in df.columns and 'processed_text' in df.columns:
    print("🔄 Preparing training and testing data...")
    # Full-corpus matrix from the previous cell, kept under its original name
    # for reference only. Its columns follow the earlier full-corpus fit, not
    # the training-only fit below, and it is not used for training.
    X = X_tfidf
    y = df[sentiment_column]

    # Split the data into training and testing sets. With the same
    # random_state and stratify target, rows land in the same partitions as
    # they would when splitting the pre-computed matrix.
    text_train, text_test, y_train, y_test = train_test_split(
        df['processed_text'], y, test_size=0.2, random_state=42, stratify=y
    )

    # Re-fit the already configured vectorizer on training text only, then
    # project the held-out text onto that vocabulary. After this point
    # `tfidf_vectorizer` describes exactly the features the model is trained on.
    X_train = tfidf_vectorizer.fit_transform(text_train)
    X_test = tfidf_vectorizer.transform(text_test)

    print(f"📊 Training set shape: {X_train.shape}")
    print(f"📊 Testing set shape: {X_test.shape}")
    print(f"\n📈 Training set sentiment distribution:")
    print(y_train.value_counts())
    print(f"\n📈 Testing set sentiment distribution:")
    print(y_test.value_counts())
    print("✅ Data preparation completed!")
else:
    print("❌ Please ensure both sentiment and processed text columns are available")

## Task 9: Train MultinomialNB Model

In [None]:
# Import MultinomialNB classifier
from sklearn.naive_bayes import MultinomialNB

# Fit a Multinomial Naive Bayes classifier on the training matrix.
print("🤖 Training MultinomialNB classifier...")

# alpha=1.0 is Laplace (add-one) smoothing; fit() returns the estimator itself,
# so construction and training can be written as one expression.
nb_classifier = MultinomialNB(alpha=1.0).fit(X_train, y_train)

print("✅ Model training completed!")
print(f"📊 Number of classes: {len(nb_classifier.classes_)}")
print(f"🏷️ Classes: {nb_classifier.classes_}")

## Task 10: Make Predictions

In [None]:
# Score the held-out test set: hard labels plus per-class probabilities.
print("🔮 Making predictions on test set...")
y_pred_proba = nb_classifier.predict_proba(X_test)  # one column per entry of classes_
y_pred = nb_classifier.predict(X_test)

print("✅ Predictions completed!")

# How often each label was predicted
predicted_counts = pd.Series(y_pred).value_counts()
print("\n📊 Predicted sentiment distribution:")
print(predicted_counts)

## Task 11: Model Evaluation - Accuracy and Basic Metrics

In [None]:
# Import evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Overall accuracy plus support-weighted precision / recall / F1 on the test set.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print("📊 === MODEL PERFORMANCE METRICS ===")
print(f"🎯 Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"🎯 Precision (weighted): {precision:.4f}")
print(f"🎯 Recall (weighted): {recall:.4f}")
print(f"🎯 F1-Score (weighted): {f1:.4f}")

# Two-column summary table (metric name, score); also plotted in a later cell.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
metric_scores = [accuracy, precision, recall, f1]
metrics_df = pd.DataFrame({'Metric': metric_names, 'Score': metric_scores})

print("\n📋 Metrics Summary:")
print(metrics_df)

## Task 12: Detailed Classification Report

In [None]:
# Import classification report
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support, first as formatted text.
print("📊 === DETAILED CLASSIFICATION REPORT ===")
text_report = classification_report(y_test, y_pred)
print(text_report)

# The same report as a DataFrame (one row per class plus the aggregate rows),
# which a later cell slices for plotting.
report_dict = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).T

print("\n📋 Classification Report as DataFrame:")
print(report_df.round(4))

## Task 13: Confusion Matrix

In [None]:
# Import confusion matrix and seaborn for visualization
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Generate and visualize the confusion matrix.
# The class order is taken from the fitted classifier and passed explicitly to
# confusion_matrix(), so rows and columns are guaranteed to line up with the
# tick labels and the DataFrame index below. Without `labels=` the matrix order
# is derived independently from the values present in y_test / y_pred.
labels = nb_classifier.classes_
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Heatmap: rows are true labels, columns are predicted labels.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.title('🎯 Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()

# Print confusion matrix as dataframe
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("📊 Confusion Matrix:")
print(cm_df)

## Task 14: Feature Analysis

In [None]:
# For each class, list the features with the highest log-probability under the
# fitted Naive Bayes model, highest first.
feature_names = tfidf_vectorizer.get_feature_names_out()
n_features = 10  # Top 10 features per class

print("🔍 === MOST IMPORTANT FEATURES BY CLASS ===")
# classes_ and feature_log_prob_ are index-aligned, so they are walked together.
for class_label, class_log_probs in zip(nb_classifier.classes_, nb_classifier.feature_log_prob_):
    # argsort is ascending; reverse it and keep the first n_features indices.
    ranked_idx = np.argsort(class_log_probs)[::-1][:n_features]

    print(f"\n🏷️ Class: {class_label}")
    for idx in ranked_idx:
        print(f"   {feature_names[idx]}: {class_log_probs[idx]:.4f}")

## Task 15: Model Performance Visualization

In [None]:
# Four-panel summary figure (2x2 grid) of the model's behaviour on the test set.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_metrics, ax_classes), (ax_pie, ax_compare) = axes

# Top-left: overall metrics as a bar chart on a fixed 0..1 scale, no legend.
metrics_df.plot(x='Metric', y='Score', kind='bar', ax=ax_metrics, color='skyblue')
ax_metrics.set_title('📊 Model Performance Metrics')
ax_metrics.set_ylabel('Score')
ax_metrics.set_ylim(0, 1)
ax_metrics.tick_params(axis='x', rotation=45)
ax_metrics.legend().remove()

# Top-right: per-class precision / recall / F1.
# The slice drops the last three rows and keeps the first three columns of
# report_df, i.e. it excludes the aggregate rows and the support column.
class_metrics = report_df.iloc[:-3, :3]
class_metrics.plot(kind='bar', ax=ax_classes)
ax_classes.set_title('📈 Class-wise Performance')
ax_classes.set_ylabel('Score')
ax_classes.tick_params(axis='x', rotation=45)
ax_classes.legend(['Precision', 'Recall', 'F1-Score'])

# Bottom-left: share of each predicted label.
pred_dist = pd.Series(y_pred).value_counts()
pred_dist.plot(kind='pie', ax=ax_pie, autopct='%1.1f%%')
ax_pie.set_title('🥧 Predicted Sentiment Distribution')
ax_pie.set_ylabel('')

# Bottom-right: true vs predicted label counts side by side
# (the two Series are aligned on the label values).
comparison_df = pd.DataFrame({
    'True': y_test.value_counts(),
    'Predicted': pred_dist
})
comparison_df.plot(kind='bar', ax=ax_compare)
ax_compare.set_title('📊 True vs Predicted Distribution')
ax_compare.set_ylabel('Count')
ax_compare.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Task 16: Sample Predictions Analysis

In [None]:
# Analyze some sample predictions.
# A seeded generator makes the displayed rows reproducible across
# Restart & Run All; 42 matches the random_state used for the train/test split.
sample_rng = np.random.default_rng(42)
# Sampling without replacement cannot draw more rows than the test set holds,
# so cap the sample size for very small test sets.
n_to_show = min(5, X_test.shape[0])
sample_indices = sample_rng.choice(X_test.shape[0], size=n_to_show, replace=False)

print("🔍 === SAMPLE PREDICTIONS ANALYSIS ===")
for i, idx in enumerate(sample_indices):
    # idx is a row position in the test set, hence .iloc on y_test.
    true_label = y_test.iloc[idx]
    pred_label = y_pred[idx]
    pred_proba = y_pred_proba[idx]

    print(f"\n📝 Sample {i+1}:")
    print(f"   True Label: {true_label}")
    print(f"   Predicted Label: {pred_label}")
    print(f"   Prediction Confidence: {max(pred_proba):.4f}")

    # Show probabilities for all classes (columns follow classes_ order)
    for j, class_label in enumerate(nb_classifier.classes_):
        print(f"     P({class_label}): {pred_proba[j]:.4f}")

    correct = '✅' if true_label == pred_label else '❌'
    print(f"   Correct: {correct}")

## Task 17: Model Summary and Conclusions

In [None]:
# Final model summary: dataset sizes, preprocessing recap and headline metrics.
print("🎵 === MODEL SUMMARY ===")
print(f"📊 Dataset: Spotify Sentiment Analysis")
print(f"🤖 Algorithm: Multinomial Naïve Bayes")
print(f"📈 Total samples: {len(df)}")
# X_train / X_test are SciPy sparse matrices: len() raises TypeError on them
# ("sparse matrix length is ambiguous"), so row counts come from .shape[0].
print(f"🏋️ Training samples: {X_train.shape[0]}")
print(f"🧪 Testing samples: {X_test.shape[0]}")
# Report the width of the matrix the classifier was actually trained on.
print(f"🔤 Number of features: {X_train.shape[1]}")
print(f"🏷️ Number of classes: {len(nb_classifier.classes_)}")

print("\n🔧 === PREPROCESSING TECHNIQUES APPLIED ===")
print("✅ Tokenization")
print("✅ Case folding (lowercase conversion)")
print("✅ Punctuation removal")
print("✅ Stop words removal")
print("✅ Stemming")
print("✅ TF-IDF Vectorization")

print("\n🎯 === FINAL PERFORMANCE ===")
print(f"🎯 Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"🎯 Weighted F1-Score: {f1:.4f}")
print(f"🎯 Weighted Precision: {precision:.4f}")
print(f"🎯 Weighted Recall: {recall:.4f}")

# Coarse verdict based on test accuracy: > 0.8 good, > 0.7 moderate, else weak.
print("\n💡 === RECOMMENDATIONS ===")
if accuracy > 0.8:
    print("✅ Model shows good performance")
elif accuracy > 0.7:
    print("⚠️ Model shows moderate performance - consider feature engineering")
else:
    print("⚠️ Model needs improvement - try different preprocessing or algorithms")

print("\n🎉 Analysis completed successfully!")
print("\n🚀 Great job on completing the Spotify sentiment analysis!")

if IN_COLAB:
    print("\n💾 Don't forget to save your results!")
    print("   File → Download → Download .ipynb")