In [20]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import joblib
import warnings

# Suppress warnings that are safe to ignore in this context
warnings.filterwarnings("ignore", category=UserWarning)

In [21]:
# --- 1. DATA CLEANING AND MODEL TRAINING ---

print("Starting data loading and preparation...")

# Load the dataset; 'dataset.csv' is expected in the current working directory.
try:
    df = pd.read_csv('dataset.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() shuts the kernel
    # down, whereas an exception stops the run and shows a readable traceback.
    raise FileNotFoundError(
        "Error: 'dataset.csv' not found. Please ensure the file is in the correct directory."
    ) from err

# Data cleaning steps
# Rows without an artist, album or track name cannot be searched or displayed.
df = df.dropna(subset=['artists', 'album_name', 'track_name'])
# Drop the leftover index column of a previous CSV export, if it is present.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# Keep the first row per track name.
# NOTE(review): de-duplicating on 'track_name' alone also removes different songs
# that share a title; confirm this is intended.
df = df.drop_duplicates(subset=['track_name'], keep='first')

Starting data loading and preparation...


In [22]:
# CRITICAL FIX FOR KEYERROR: RESET THE INDEX
# The cleaning steps above drop rows, which leaves gaps in the index labels.
# Resetting makes the labels (0, 1, 2, ...) equal to the row positions, so the
# positional neighbor indices returned by KNN identify the same rows as the labels.
df = df.reset_index(drop=True)

In [23]:
# Feature Engineering
# Cast the 'explicit' flag to 0/1. Unlike mapping through {True: 1, False: 0},
# astype never turns an unexpected value into a silent NaN: it either converts
# the value or raises, and re-running the cell on already-converted data is a no-op.
df['explicit'] = df['explicit'].astype('int64')
# Search label "<track> - <artist1>, <artist2>"; the raw 'artists' column separates
# names with ';'. Vectorised literal replacement instead of a per-row lambda.
df['track_search'] = df['track_name'] + " - " + df['artists'].str.replace(";", ", ", regex=False)

In [24]:
# --- 2. MOOD FEATURE SIMULATION ---
# Simulated mood score: valence weighted by 1.5 minus acousticness weighted by 0.5,
# then limited to the closed interval [0.0, 1.0].
df['nlp_mood_score'] = (
    df['valence'].mul(1.5)
    .sub(df['acousticness'].mul(0.5))
    .clip(lower=0.0, upper=1.0)
)

In [25]:
# Columns fed to the scaler and the KNN model, in the order used for training.
FEATURE_COLUMNS = [
    'popularity',
    'duration_ms',
    'explicit',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
    'nlp_mood_score',  # engineered in the mood-simulation step
]
# Feature matrix: every row of df, restricted to the model columns.
features = df.loc[:, FEATURE_COLUMNS]


In [27]:
# --- 3. FEATURE SCALING AND KNN TRAINING ---
# Fit the scaler on the feature matrix, then standardise that same matrix.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)
# Wrap the scaled array in a DataFrame that carries df's index and the feature names.
scaled_df = pd.DataFrame(data=scaled_features, columns=FEATURE_COLUMNS, index=df.index)

In [28]:
# Cosine-distance nearest-neighbour index over the scaled features
# (10 neighbours unless a query asks for a different number).
knn_mood = NearestNeighbors(metric="cosine", n_neighbors=10).fit(scaled_df)

print("Data preparation and KNN model training complete.")

Data preparation and KNN model training complete.


In [29]:
# --- 4. SAVE ARTIFACTS (Optional but Recommended) ---
# Persist the fitted estimators with joblib: the KNN model first, then the scaler.
for artifact, target_path in (
    (knn_mood, 'knn_mood_model_final.joblib'),
    (scaler, 'feature_scaler_final.joblib'),
):
    joblib.dump(artifact, target_path)
# index=True writes the row index as an extra leading column of the CSV.
df.to_csv('processed_songs_mood_final.csv', index=True)
print("Model, Scaler, and DataFrame saved as '_final' versions.")

Model, Scaler, and DataFrame saved as '_final' versions.


In [30]:
def recommend_mood_songs(song_title_artist, df, knn_model, scaler, feature_cols, n_recommendations=5):
    """
    Recommend songs similar to a searched song, based on combined audio and mood features.

    The first row of ``df`` whose ``track_search`` value contains
    ``song_title_artist`` (case-insensitive, literal substring) is the query song.
    Its features are scaled with ``scaler`` and its nearest neighbours are looked up
    in ``knn_model``. The query song itself is removed from the result.

    Args:
        song_title_artist (str): The song name or partial search string. It is matched
            literally, so characters such as ``(``, ``+`` or ``?`` need no escaping.
        df (pd.DataFrame): The processed DataFrame. Its row order must be the row order
            the KNN model was fitted on; the index labels themselves are not used.
        knn_model (NearestNeighbors): The trained KNN model.
        scaler (StandardScaler): The fitted scaler object.
        feature_cols (list): The list of features used for training.
        n_recommendations (int): Number of recommendations to return.

    Returns:
        list: ``track_search`` strings of the recommended songs, nearest first, or a
        one-element list holding an error message when no song matches.
    """

    # 1. Find the query song. regex=False because titles routinely contain regex
    #    metacharacters ("(", "+", "?"), which would otherwise mis-match or raise.
    is_match = df['track_search'].str.contains(
        song_title_artist, case=False, na=False, regex=False
    )

    if not is_match.any():
        return [f"Error: Song '{song_title_artist}' not found in the dataset."]

    # Position (not label) of the first match; KNN indices are positions too.
    query_pos = int(is_match.to_numpy().argmax())
    input_song = df.iloc[query_pos]

    # 2. Scale the query features. A one-row DataFrame keeps the column names, so a
    #    scaler fitted on a DataFrame can validate them instead of emitting a warning.
    raw_features = df.iloc[[query_pos]][list(feature_cols)]
    scaled_input = scaler.transform(raw_features)
    if hasattr(knn_model, "feature_names_in_"):
        # The model was fitted with feature names; pass them along for the same reason.
        scaled_input = pd.DataFrame(scaled_input, columns=knn_model.feature_names_in_)

    # 3. Ask for one extra neighbour (the query normally matches itself), but never
    #    more than the number of fitted samples, which kneighbors would reject.
    n_fitted = getattr(knn_model, "n_samples_fit_", len(df))
    n_query = min(n_recommendations + 1, n_fitted)
    _, indices = knn_model.kneighbors(scaled_input, n_neighbors=n_query)

    # 4. Drop the query by position rather than assuming it is the first hit: with
    #    ties or duplicate feature vectors it can appear later or not at all.
    neighbor_positions = [int(pos) for pos in indices[0] if pos != query_pos]
    neighbor_positions = neighbor_positions[:n_recommendations]
    recommendations_list = df['track_search'].iloc[neighbor_positions].tolist()

    print(f"\n--- Recommendations for: **{input_song['track_search']}** ---")
    return recommendations_list

# --- 6. EXAMPLE EXECUTION ---
# Announce the start of the demo run; the recommendation call itself is in the next cell.

print("\n--- Running Test Recommendation ---")


--- Running Test Recommendation ---


In [31]:
# Demo: ask for seven songs similar to "Hold On" using the model trained above.
recommendations_1 = recommend_mood_songs(
    'Hold On - Chord Overstreet',
    df,
    knn_mood,
    scaler,
    FEATURE_COLUMNS,
    n_recommendations=7,
)

# Print the results as a numbered list starting at 1.
for rank, title in enumerate(recommendations_1, start=1):
    print(f"{rank}. {title}")

print("\nTest complete.")


--- Recommendations for: **Hold On - Chord Overstreet** ---
1. Follow The Sun - Xavier Rudd
2. Thinkin Bout You - Frank Ocean
3. Lay All Your Love On Me - Spotify Singles - Zara Larsson
4. Dead To Me - Slow + Reverb - Whales, Fraxo, Lox Chatterbox
5. Easy On Me - Adele
6. Up Where We Belong - From "An Officer And A Gentleman" - Joe Cocker, Jennifer Warnes
7. Woke Up in Love - Kygo, Gryffin, Calum Scott

Test complete.
