In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE

In [None]:


# Load dataset
df = pd.read_csv("Tamil_Music_Dataset.csv")

# Encode categorical variables. Each fitted encoder is kept so that
# predict_genre can encode raw inputs and decode the predicted genre.
label_encoders = {}
categorical_cols = ["Year", "Artist", "Song", "Genre", "Listener Gender", "Listener Genre Preference", "Artist Genre"]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Add relational parameters (arithmetic combinations of the encoded columns)
df['Artist_Song'] = df['Artist'] * 100 + df['Song']  # Encodes relationship between artist and song
df['Age_GenrePref'] = df['Listener Age'] * 10 + df['Listener Genre Preference']  # Encodes age and preference relationship

# Underscore-named aliases of two existing columns. The original columns stay
# in df as well, so both spellings end up in X; feature_cols / num_cols refer
# to the alias names.
df['Song_Popularity'] = df['Song Popularity']  # Include song popularity as a feature
df['Artist_Genre'] = df['Artist Genre']  # Include artist's dominant genre

df['Artist_Age'] = df['Artist'] * 50 + df['Listener Age']  # Additional relation between artist and listener age
df['Song_Genre_Pop'] = df['Song'] * 10 + df['Song_Popularity']  # Song relation with popularity

df['Combined_Feature'] = df['Artist_Song'] + df['Age_GenrePref'] + df['Song_Genre_Pop']  # Composite feature

# Decide the train/test membership ONCE, before any step that learns from the
# data (clustering, oversampling, scaling). Doing those steps on the full
# dataset and splitting afterwards leaks test information into training and
# puts synthetic SMOTE rows into the test set, which inflates the metrics.
# Positional indices are used so the split does not depend on df's index.
train_pos, test_pos = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df["Genre"],  # keep every genre represented on both sides
)

# Apply clustering to create cluster labels: fit on training rows only, then
# label every row with the fitted model. Clustering runs on the unscaled
# features, which is also what predict_genre passes to kmeans.predict.
feature_cols = ["Year", "Artist", "Song", "Listener Age", "Listener Gender", "Listener Genre Preference", "Artist_Song", "Age_GenrePref", "Song_Popularity", "Artist_Genre", "Artist_Age", "Song_Genre_Pop", "Combined_Feature"]
# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(df.iloc[train_pos][feature_cols])
df['Cluster'] = kmeans.predict(df[feature_cols])

# Features and target. X keeps every non-target column; predict_genre uses
# X.columns to put its single-row input into the same column order.
X = df.drop(columns=["Genre"])
y = df["Genre"]

X_train_raw, y_train_raw = X.iloc[train_pos], y.iloc[train_pos]
X_test, y_test = X.iloc[test_pos].copy(), y.iloc[test_pos]

# Apply SMOTE for class balancing -- on the training rows only, so the test
# set contains nothing but real observations.
# k_neighbors is capped by the smallest training class so that a rare genre
# does not make SMOTE fail; with large enough classes it stays at 5.
# NOTE(review): SMOTE interpolates the label-encoded categorical columns as if
# they were continuous; consider SMOTENC if that matters for this dataset.
smallest_class = int(y_train_raw.value_counts().min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class - 1)))
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Standardize numerical values: statistics come from the training data only
# and are then applied unchanged to the test rows.
scaler = StandardScaler()
num_cols = ['Listener Age', 'Artist_Song', 'Age_GenrePref', 'Song_Popularity', 'Artist_Age', 'Song_Genre_Pop', 'Combined_Feature']
X_train, y_train = X_resampled.copy(), y_resampled
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Hyperparameter tuning for Random Forest
# NOTE(review): the CV folds are drawn from the already-oversampled training
# set, so cross-validated scores are still optimistic; the held-out report
# below is the number to trust.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,  # run the candidate fits in parallel; results are unchanged
)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

# Predictions and evaluation on the untouched, non-synthetic test rows
y_pred = best_rf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Function to predict genre
def predict_genre(year, artist, song, age, gender, genre_pref, song_popularity, artist_genre):
    """Predict the genre for one listener/song combination.

    The raw inputs are encoded and expanded with the same derived columns
    that the training table received, assigned a KMeans cluster, scaled with
    the fitted scaler and finally passed to the tuned random forest.

    Relies on the module-level objects produced by the training code:
    ``categorical_cols``, ``label_encoders``, ``feature_cols``, ``num_cols``,
    ``kmeans``, ``scaler``, ``X`` and ``best_rf``.

    Args:
        year, artist, song, gender, genre_pref, artist_genre: raw category
            values; each is converted with the label encoder fitted on the
            corresponding training column.
        age: listener age, used numerically.
        song_popularity: song popularity, used numerically.

    Returns:
        The predicted genre, decoded back through the "Genre" label encoder.

    Raises:
        ValueError: if a categorical value is rejected by its label encoder
            (for example because it never occurred in the training data).
    """
    input_data = pd.DataFrame(
        [[year, artist, song, age, gender, genre_pref, song_popularity, artist_genre]],
        columns=["Year", "Artist", "Song", "Listener Age", "Listener Gender", "Listener Genre Preference", "Song Popularity", "Artist Genre"],
    )

    # Convert categorical inputs using LabelEncoders. "Genre" is in
    # categorical_cols but is the target, so it is absent here and skipped.
    for col in categorical_cols:
        if col in input_data.columns:
            try:
                input_data[col] = label_encoders[col].transform(input_data[col])
            except ValueError as err:
                # Same exception type as before, but say which input was the problem.
                raise ValueError(
                    f"Unknown value {input_data[col].iloc[0]!r} for column '{col}'; "
                    "it could not be encoded with the fitted label encoder."
                ) from err

    # The training table carries underscore-named aliases of these two columns
    # and feature_cols / num_cols refer to the aliases. Without them the
    # column selections below raise KeyError.
    input_data["Song_Popularity"] = input_data["Song Popularity"]
    input_data["Artist_Genre"] = input_data["Artist Genre"]

    # Compute relational parameters (same formulas as used for training)
    input_data["Artist_Song"] = input_data["Artist"] * 100 + input_data["Song"]
    input_data["Age_GenrePref"] = input_data["Listener Age"] * 10 + input_data["Listener Genre Preference"]
    input_data["Artist_Age"] = input_data["Artist"] * 50 + input_data["Listener Age"]
    input_data["Song_Genre_Pop"] = input_data["Song"] * 10 + input_data["Song_Popularity"]
    input_data["Combined_Feature"] = input_data["Artist_Song"] + input_data["Age_GenrePref"] + input_data["Song_Genre_Pop"]

    # Assign cluster based on KMeans using the same feature columns as training.
    # This must happen before scaling: KMeans was fitted on unscaled values.
    input_data['Cluster'] = kmeans.predict(input_data[feature_cols])

    # Standardize numerical values with the already-fitted scaler
    input_data[num_cols] = scaler.transform(input_data[num_cols])

    # Ensure feature columns (and their order) match the training data
    input_data = input_data[X.columns]

    genre_pred = best_rf.predict(input_data)
    return label_encoders["Genre"].inverse_transform(genre_pred)[0]

# Example prediction: raw (un-encoded) inputs, listed in predict_genre's
# positional order (year, artist, song, age, gender, genre_pref,
# song_popularity, artist_genre).
example_inputs = ("2010s", "A. R. Rahman", "Munbe Vaa", 25, "Male", "Melody", 80, "Melody")
print("Predicted Genre:", predict_genre(*example_inputs))
