## **1. Environment & Utility Setup (Cell 1)**

##### This cell contains the final, guaranteed PySpark environment setup. It must run successfully first to initialize the Spark engine and related libraries.

In [1]:
# Cell 1: Environment Setup, Installation, and Initialization
# ==============================================================================
# This cell establishes the stable PySpark environment, installing all necessary libraries
# and setting critical environment variables to ensure a successful SparkSession launch
# in the Colab virtual machine.

# --- 1. CRITICAL INSTALLATION (Latest Stable PySpark & Java 8) ---
# Install the latest stable PySpark version (fixes Python 3.12 compatibility issues)
# Install findspark for easy PySpark location, and scikit-learn for modeling.
!pip install -q pyspark findspark scikit-learn

# Install Java 8 (REQUIRED for Spark's underlying Java Virtual Machine - JVM).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# --- 2. PYTHON/SPARK IMPORTS & ENVIRONMENT CONFIGURATION ---
import os
import findspark
from pyspark.sql import SparkSession
import nltk

# **MANDATORY ENVIRONMENT VARIABLES (Combined Patches)**
# 1. Set the JAVA_HOME path (required by Spark).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# 2. Explicitly tell PySpark how to launch the Java process (Fixes PySpark errors).
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

# 3. Use findspark to locate the pip-installed PySpark distribution.
findspark.init()

# --- 3. NLTK SETUP ---
# Download essential NLTK resources for tokenization and stop word removal.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# --- 4. SPARK SESSION START ---
# Creates the SparkSession using all the successfully set environment variables.
spark = SparkSession.builder \
    .appName("MusicRecommendationNLP") \
    .master("local[*]") \
    .getOrCreate()

print("✅ SUCCESS: PySpark Environment is Stable and Ready.")
print(f"   Spark Version: {spark.version}")

✅ SUCCESS: PySpark Environment is Stable and Ready.
   Spark Version: 3.5.1


## **2. Data Ingestion & Initial Inspection (Cell 2)**

##### This cell handles mounting Google Drive, loading the CSV file into a Spark DataFrame, and limiting the data for fast iteration.

In [2]:
# Cell 2: Data Ingestion and Initial Inspection
# ==============================================================================
# Mounts Google Drive, loads the 'songdata.csv' file into a PySpark DataFrame,
# and displays the schema and a sample of the raw data.

import os
from google.colab import drive
import pandas as pd

# 1. GOOGLE DRIVE MOUNT
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')
print("✅ Google Drive mounted.")

# 2. DATA LOADING
drive_file_path = '/content/drive/MyDrive/Projects/Music Recommendation System/songdata.csv'

# Read the CSV into a Spark DataFrame
spark_df = spark.read.csv(
    drive_file_path,
    header=True,           # Use first row as column names
    inferSchema=True,      # Automatically detect column types
    mode="PERMISSIVE"      # Allow Spark to handle malformed rows
)

# Limit the DataFrame to the first 5000 rows for development/speed
df = spark_df.limit(5000)

# Display results
print("\n--- DataFrame Schema and Sample (5000 rows) ---")
df.printSchema()
df.show(5, truncate=False)
print(f"✅ Data loaded successfully. Total Rows for project: {df.count()}")

Mounted at /content/drive
✅ Google Drive mounted.

--- DataFrame Schema and Sample (5000 rows) ---
root
 |-- artist: string (nullable = true)
 |-- song: string (nullable = true)
 |-- link: string (nullable = true)
 |-- text: string (nullable = true)

+--------------------------------------------------+-------------------------+------------------------------------------+-----------------------------------------+
|artist                                            |song                     |link                                      |text                                     |
+--------------------------------------------------+-------------------------+------------------------------------------+-----------------------------------------+
|ABBA                                              |Ahe's My Kind Of Girl    |/a/abba/ahes+my+kind+of+girl_20598417.html|Look at her face, it's a wonderful face  |
|And it means something special to me              |NULL                     |NULL           

## **3. NLP UDF Definition (Cell 3)**

##### This cell isolates the definition of the custom tokenization, stemming, and stop word removal function (the UDF), making the preprocessing pipeline clear.

In [3]:
# Cell 3: Custom NLP User-Defined Function (UDF) Definition
# ==============================================================================
# Defines the Python function to handle tokenization, stop word removal, and
# stemming, and registers it as a Spark UDF for distributed execution.

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Define static sets/stemmer outside the function for efficiency
STOP_WORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()

def safe_tokenize(text):
    """Tokenizes, filters stop words, and stems text for TF-IDF."""
    if not isinstance(text, str) or text.strip() == "":
        return []
    try:
        # 1. Tokenize and convert to lowercase
        tokens = word_tokenize(text.lower())
        # 2. Filter out non-alphabetic tokens and stop words
        tokens = [t for t in tokens if t.isalpha() and t not in STOP_WORDS]
        # 3. Stemming (reducing words to their root form)
        return [STEMMER.stem(t) for t in tokens]
    except Exception:
        # Return empty list on any processing error
        return []

# Register the Python function as a Spark UDF with the correct return type
tokenize_udf = udf(safe_tokenize, ArrayType(StringType()))
print("✅ Custom tokenization UDF registered and ready for use.")

✅ Custom tokenization UDF registered and ready for use.


## **4. Stage 1: String Cleaning (Cell 4)**

##### Standardizes the lyric text by converting to lowercase and removing special characters.

In [4]:
# Cell 4: Stage 1 Preprocessing - String Cleaning
# ==============================================================================
# Applies basic string transformations (lowercasing and punctuation removal)
# to the 'text' column using PySpark SQL functions.

from pyspark.sql.functions import lower, regexp_replace, col

print("Starting Stage 1 cleaning: Lowercasing and Punctuation Removal...")

# Convert 'text' to lowercase and replace all non-word/non-space characters with a space.
df = df.withColumn('text', regexp_replace(lower(col('text')), r'[^\\w\\s]', ' '))

# Display results
df.select('song', 'text').show(5, truncate=False)
print("✅ Stage 1 Preprocessing complete.")

Starting Stage 1 cleaning: Lowercasing and Punctuation Removal...
+-------------------------+-----------------------------------------+
|song                     |text                                     |
+-------------------------+-----------------------------------------+
|Ahe's My Kind Of Girl    |                     s   w               |
|NULL                     |NULL                                     |
|NULL                     |NULL                                     |
|NULL                     |NULL                                     |
| she makes me feel fine  |NULL                                     |
+-------------------------+-----------------------------------------+
only showing top 5 rows

✅ Stage 1 Preprocessing complete.


## **5. Stage 2: Tokenization & Cleanup (Cell 5)**

##### Applies the UDF to the cleaned text and drops unnecessary columns.

In [5]:
# Cell 5: Stage 2 Preprocessing - Tokenization and Cleanup
# ==============================================================================
# Applies the custom NLP UDF to the 'text' column to generate 'tokens' and
# removes the redundant 'link' column.

print("Starting Stage 2 preprocessing: Tokenization and Cleanup...")

# Apply the custom NLP UDF to the cleaned 'text' to create the 'tokens' column.
df = df.withColumn('tokens', tokenize_udf(df['text']))
print("   -> Tokenization, stop word removal, and stemming complete.")

# Drop the 'link' column as it is not used in the recommendation logic.
df = df.drop('link')
print("   -> Dropped redundant 'link' column.")

# Display results
df.select('song', 'text', 'tokens').show(5, truncate=False)
print("✅ Stage 2 Preprocessing and cleanup complete.")

Starting Stage 2 preprocessing: Tokenization and Cleanup...
   -> Tokenization, stop word removal, and stemming complete.
   -> Dropped redundant 'link' column.
+-------------------------+-----------------------------------------+------+
|song                     |text                                     |tokens|
+-------------------------+-----------------------------------------+------+
|Ahe's My Kind Of Girl    |                     s   w               |[]    |
|NULL                     |NULL                                     |[]    |
|NULL                     |NULL                                     |[]    |
|NULL                     |NULL                                     |[]    |
| she makes me feel fine  |NULL                                     |[]    |
+-------------------------+-----------------------------------------+------+
only showing top 5 rows

✅ Stage 2 Preprocessing and cleanup complete.


## **6. Feature Engineering: Critical Filtering (Cell 6)**

##### This is the most critical step for data quality. It filters out corrupted rows that would skew the model, which we identified in the previous outputs.

In [6]:
# Cell 6: Critical Data Filtering and Conversion to Pandas
# ==============================================================================
# This step is crucial for data quality. It filters out corrupted rows and
# converts the small, clean DataFrame into a Pandas DataFrame for scikit-learn.

from pyspark.sql.functions import length

print("Starting critical filtering of PySpark DataFrame...")

# Filtering logic: Keep only rows where 'song' is not null, 'text' is not null,
# and the length of the 'song' title is greater than 3 characters (to exclude fragments).
df_clean = df.filter(
    (df.song.isNotNull()) &
    (df.text.isNotNull()) &
    (length(df.song.cast("string")) > 3)
)

print(f"Original Row Count: {df.count()}")
print(f"Cleaned Row Count: {df_clean.count()}")
print("✅ PySpark DataFrame critically filtered.")


# Convert to Pandas
# Use the now-clean 'df_clean' for conversion to avoid OOM errors later.
pandas_df = df_clean.select('song', 'text').toPandas()
print("✅ Converted necessary columns to CLEAN Pandas DataFrame.")

# Final Pandas-level Cleaning
# Ensure types are correct and fill any lingering NaNs in the text.
pandas_df['song'] = pandas_df['song'].astype(str).str.strip()
pandas_df['text'] = pandas_df['text'].fillna('').astype(str)
print("   -> Pandas DataFrame prepared for vectorization.")

Starting critical filtering of PySpark DataFrame...
Original Row Count: 5000
Cleaned Row Count: 153
✅ PySpark DataFrame critically filtered.
✅ Converted necessary columns to CLEAN Pandas DataFrame.
   -> Pandas DataFrame prepared for vectorization.


## **7. Feature Engineering: TF-IDF Vectorization (Cell 7)**

##### Applies the TF-IDF model to transform the cleaned lyrics text into numerical vectors.

In [7]:
# Cell 7: Feature Engineering - TF-IDF Vectorization
# ==============================================================================
# Transforms the song lyrics ('text' column) into numerical feature vectors
# using the Term Frequency-Inverse Document Frequency (TF-IDF) model.

from sklearn.feature_extraction.text import TfidfVectorizer

print("Starting TF-IDF vectorization...")

# Initialize TF-IDF Vectorizer. Use 'english' stop words for a quick/clean result.
tfidvector = TfidfVectorizer(analyzer='word', stop_words='english')

# Fit the model to the 'text' column and transform the data into a sparse matrix.
matrix = tfidvector.fit_transform(pandas_df['text'])

print("✅ TF-IDF matrix created successfully.")
print(f"Matrix shape (Songs x Unique Words): {matrix.shape}")

Starting TF-IDF vectorization...
✅ TF-IDF matrix created successfully.
Matrix shape (Songs x Unique Words): (153, 3)


## **8. Modeling: Cosine Similarity (Cell 8)**

##### Calculates the similarity matrix, which is the core of the content-based recommendation system.

In [8]:
# Cell 8: Modeling - Cosine Similarity Calculation
# ==============================================================================
# Calculates the cosine similarity score between all pairs of song vectors (rows)
# in the TF-IDF matrix. This matrix quantifies how similar any two songs are.

from sklearn.metrics.pairwise import cosine_similarity

print("Calculating Cosine Similarity Matrix...")

# Calculate the similarity between all pairs of song vectors (rows in the matrix).
similarity = cosine_similarity(matrix)

# Display results
print(f"Similarity matrix shape: {similarity.shape}")
print("✅ Cosine Similarity calculated successfully.")

Calculating Cosine Similarity Matrix...
Similarity matrix shape: (153, 153)
✅ Cosine Similarity calculated successfully.


## **9. Recommendation Function & Execution (Cell 9)**

##### Defines the final recommendation function and runs the system with a test case.

In [9]:
# Cell 9: Recommendation Function and Final Execution
# ==============================================================================
# Defines the logic to retrieve the top N songs based on the similarity matrix
# and executes the system with a sample song.

# The 'similarity' matrix and 'pandas_df' are already available from Cells 6-8.

def recommendation(song_name, top_n=20):
    """
    Retrieves the top N most similar songs for a given song name based on
    the pre-calculated Cosine Similarity matrix.
    """
    # 1. Locate the index of the input song
    try:
        # Find the index of the song in the Pandas DataFrame
        idx = pandas_df[pandas_df['song'] == song_name.strip()].index[0]
    except IndexError:
        print(f"Error: Song '{song_name}' not found in the dataset. Check spelling.")
        return []

    # 2. Get similarity scores for the song and sort them
    # distances is a list of tuples: [(index, score), ...]
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])

    songs = []
    # 3. Retrieve the top N recommended song titles (skipping the first one, which is the song itself)
    for m_id in distances[1:top_n+1]:
        recommended_song = pandas_df.iloc[m_id[0]].song
        songs.append(recommended_song)

    return songs

# === FINAL EXECUTION ===
print("\n--- Final Recommendation Execution ---")

# Use a known song from the dataset for testing.
TARGET_SONG = "Chiquitita"

# Call the function with the target song.
recommendations = recommendation(TARGET_SONG)

if recommendations:
    print(f"\n✅ Top {len(recommendations)} Recommendations for '{TARGET_SONG}':")
    for i, song in enumerate(recommendations):
        print(f"{i+1}. {song}")
else:
    print(f"Recommendation failed or the input song '{TARGET_SONG}' was not found.")

print("\nSystem goal achieved: Recommendation process complete.")


--- Final Recommendation Execution ---

✅ Top 20 Recommendations for 'Chiquitita':
1. Andante, Andante
2. As Good As New
3. Bang
4. Bang-A-Boomerang
5. Burning My Bridges
6. Cassandra
7. Chiquitita
8. Crazy World
9. Crying Over You
10. Dance
11. Dancing Queen
12. Disillusion
13. Does Your Mother Know
14. Dream World
15. Dum Dum Diddle
16. Eagle
17. Every Good Man
18. Fernando
19. Fernando (In Spanish)
20. Free As A Bumble Bee

System goal achieved: Recommendation process complete.
