In [None]:
import os
import numpy as np
import pandas as pd
import logging
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import joblib

# Emit INFO-level (and above) records through the root handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the three CSV inputs in a fixed order: the main table first, then the
# by-genre and by-year tables (named after the files they come from).
logger.info("Loading data...")
data, genre_data, year_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/data.csv",
        "dataset/data_by_genres.csv",
        "dataset/data_by_year.csv",
    )
)
logger.info("Data loaded successfully")

# Work on a random subset of each table to keep clustering fast.
# The configured sizes are upper bounds: DataFrame.sample(n=...) without
# replacement raises ValueError when n exceeds the number of rows, so a table
# that is smaller than requested is used in full instead of crashing.
data_sample_size = 1000  # Adjust this number as needed
genre_data_sample_size = 500  # Adjust this number as needed

data_rows_to_sample = min(data_sample_size, len(data))
genre_rows_to_sample = min(genre_data_sample_size, len(genre_data))

logger.info(f"Sampling {data_rows_to_sample} rows from data and {genre_rows_to_sample} rows from genre_data...")
# A fixed random_state makes the selected rows repeatable across runs.
data = data.sample(n=data_rows_to_sample, random_state=42)
genre_data = genre_data.sample(n=genre_rows_to_sample, random_state=42)
logger.info("Data sampling completed")

# Cluster the sampled genre rows: standardize every numeric column, then run
# K-Means with 10 clusters. fit_predict fits the whole pipeline and returns
# one cluster label per row in a single call.
# random_state is pinned (same seed as the sampling step) so the centroid
# initialization, and therefore the resulting labels, are repeatable between
# runs instead of changing every time the cell is executed.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=42))
])
genre_data['cluster'] = cluster_pipeline.fit_predict(genre_data.select_dtypes(include=[np.number]))

# Cluster the sampled songs: standardize every numeric column, then run
# K-Means with 20 clusters.
# random_state is pinned (same seed as the sampling step) because this fitted
# pipeline is persisted to disk below; without a seed each run would save a
# different model and assign different labels to the same rows.
song_cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=20, verbose=False, random_state=42))
])
# Feature matrix: numeric columns only (taken before the label column is added).
X = data.select_dtypes(np.number)
song_cluster_pipeline.fit(X)
# Label each training row with the cluster the fitted pipeline assigns to it.
data['cluster_label'] = song_cluster_pipeline.predict(X)

# Persist the results: the fitted song pipeline goes under models/, and both
# labelled samples are written to dataset/ as CSV without the index column.
os.makedirs('models', exist_ok=True)  # no error if the directory already exists
joblib.dump(song_cluster_pipeline, 'models/song_cluster_pipeline.pkl')

for frame, csv_path in (
    (data, 'dataset/data_sampled.csv'),
    (genre_data, 'dataset/genre_data_sampled.csv'),
):
    frame.to_csv(csv_path, index=False)
logger.info("Model and data saved successfully")


INFO:__main__:Loading data...


INFO:__main__:Data loaded successfully
INFO:__main__:Sampling 1000 rows from data and 500 rows from genre_data...
INFO:__main__:Data sampling completed
