In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import pickle



In [4]:

# Load the song dataset from the Excel workbook into a DataFrame.
songs_workbook = 'selected_songs.xlsx'
data = pd.read_excel(songs_workbook)



In [5]:
# The workbook is expected to provide a 'lyrics' column (input text) and a
# 'genre' column (target label); change the keys below if the dataset uses
# different column names.
lyrics = data.loc[:, 'lyrics']
genres = data.loc[:, 'genre']


In [6]:

# Map every genre label to an integer class id for the classifier.
# The fitted encoder is kept so predictions can be decoded back to genre names.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(y=genres)


In [7]:

# Hold out 20% of the songs as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    lyrics,
    encoded_genres,
    test_size=0.2,
    random_state=42,
)


In [8]:

# Turn the raw lyrics into TF-IDF feature matrices.
# English stop words are removed and the vocabulary is capped at 5000 terms
# (the cap is intended as a guard against overfitting).
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
# Train a random forest on the TF-IDF features of the training split.
classifier = RandomForestClassifier(
    n_estimators=550,
    max_depth=300,
    random_state=0,
)
classifier.fit(X_train_tfidf, y_train)

# Persist the fitted model and the fitted vectorizer as pickle files
# (model first, then vectorizer).
for artifact_path, artifact in (
    ('random_forest_model.pkl', classifier),
    ('tfidf_vectorizer.pkl', vectorizer),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [10]:
# Restore the pickled classifier and vectorizer from disk.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load pickle files that come from a trusted source.
with open('random_forest_model.pkl', 'rb') as rf_handle:
    loaded_classifier = pickle.load(rf_handle)

with open('tfidf_vectorizer.pkl', 'rb') as tfidf_handle:
    loaded_vectorizer = pickle.load(tfidf_handle)

print("Model and vectorizer loaded successfully!")


Model and vectorizer loaded successfully!


In [11]:
def predict_genre(lyrics, model, vectorizer, label_encoder):
    """Predict the genre of a single piece of lyrics.

    Args:
        lyrics: The lyrics to classify; passed to the vectorizer as a
            one-element list.
        model: Object exposing ``predict(features)`` (e.g. the fitted
            classifier).
        vectorizer: Object exposing ``transform(documents)`` (e.g. the fitted
            TF-IDF vectorizer).
        label_encoder: Object exposing ``inverse_transform(labels)`` that maps
            encoded labels back to genre names.

    Returns:
        The first (and, for a single input, only) decoded genre.
    """
    # Vectorize -> predict -> decode, then unwrap the single result.
    features = vectorizer.transform([lyrics])
    decoded = label_encoder.inverse_transform(model.predict(features))
    return decoded[0]

# Demo: classify a short lyric snippet with the reloaded model and vectorizer,
# decoding the result with the in-memory label encoder.
new_lyrics = """I'm a prisoner of love, so I'm feeling all alone
I can't let go of this emotion, it's deep in my soul"""

predicted_genre = predict_genre(
    lyrics=new_lyrics,
    model=loaded_classifier,
    vectorizer=loaded_vectorizer,
    label_encoder=label_encoder,
)
print(f"Predicted Genre: {predicted_genre}")


Predicted Genre: prog


In [12]:
# Persist everything needed to serve predictions: the classifier, the TF-IDF
# vectorizer, and the label encoder used to decode predicted class ids back
# into genre names.
# `pickle` is already imported in the notebook's first cell and `LabelEncoder`
# is not used here, so this cell no longer re-imports them.
# The model and vectorizer files were also written right after training; they
# are written again here so that all three artifacts are saved together.
for artifact_path, artifact in (
    ('random_forest_model.pkl', classifier),
    ('tfidf_vectorizer.pkl', vectorizer),
    ('label_encoder.pkl', label_encoder),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model, vectorizer, and label encoder saved successfully!")


Model, vectorizer, and label encoder saved successfully!
