### Libraries

In [5]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder , StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Called Dataset

In [6]:
# Get the current working directory
directorio_actual = os.getcwd()

# Specify the relative path from the current directory
ruta_csv_relativa = os.path.join('..', 'data', '02_intermediate','2.spotifySinOutlier.csv')

# Load the CSV file
spotify = pd.read_csv(ruta_csv_relativa)

# Show the first rows of the DataFrame
spotify.head(10)

FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\02_intermediate\\2.spotifySinOutlier.csv'

## Data scaling

This scaling of variables is to be able to work with the models in the future.

### duration_ms Scaler

In [None]:
# Create a StandardScaler object
scaler = MinMaxScaler()

# Select only the popularity column to scale
spotify['duration_ms_scaled'] = scaler.fit_transform(spotify[['log_duration_ms']])

# Show first rows to check
spotify[['duration_ms', 'duration_ms_scaled']].head()

### population Scaler

In [None]:
# Create a StandardScaler object
scaler = MinMaxScaler()

# Select only the popularity column to scale
spotify['popularity_scaled'] = scaler.fit_transform(spotify[['log_popularity']])

# Show first rows to check
spotify[['popularity', 'popularity_scaled']].head()

### Group feature Scaler

In [None]:
# Select numeric features to scale
features_to_scale = ['danceability', 'energy', 'loudness', 'speechiness', 
                     'acousticness', 'instrumentalness', 'liveness', 
                     'valence', 'tempo']

scaler = MinMaxScaler()
spotify[features_to_scale] = scaler.fit_transform(spotify[features_to_scale])

print('Caracteristicas escaladas: ', features_to_scale)

## Creating new features

### duration_ms a tiempo_ms_seconds

The duration_ms was calculated so that the millisecond values are converted to minutes and seconds

In [None]:
# Convert milliseconds to seconds
spotify['tiempo_ms_seconds'] = spotify['duration_ms'] / 1000

# Function to convert seconds to minutes and seconds format
def segundos_a_formato(segundos):
    minutos = int(segundos // 60)
    segundos_restantes = int(segundos % 60)
    return f"{minutos} minuto{'s' if minutos != 1 else ''} con {segundos_restantes} segundo{'s' if segundos_restantes != 1 else ''}"

# Apply the function to the seconds column
spotify['duration_ms_scaled'] = spotify['tiempo_ms_seconds'].apply(segundos_a_formato)

# Show the first 10 formatted values
print(spotify['duration_ms_scaled'].head(10))

### tranck_genre a Label Encoding

Change the categorical label of track_genre to a numeric value.

In [None]:
# Create the tag encoder
le = LabelEncoder()

# Apply Label Encoding to 'track_genre' column
spotify['track_genre_encoded'] = le.fit_transform(spotify['track_genre'])

# View the first rows to verify the encoding
spotify[['track_genre', 'track_genre_encoded']].head()

### New variable "Intensity"

Having a good correlation thanks to the matrix, we decided to choose to combine energy with danceability

In [None]:
# Create a new "intensity" column
spotify['intensity'] = spotify['energy'] * spotify['danceability']
spotify['intensity'].head(10)

#Target Exploration

In [None]:
# Replace infinite values with NaN throughout the DataFrame
spotify.replace([float('inf'), float('-inf')], pd.NA, inplace=True)
# Set graphics style
sns.set(style="whitegrid")
#1. Distribution of characteristics that affect recommendations (danceability, energy, valence)
plt.figure(figsize=(14,6))
# Subplots for each feature
for i, feature in enumerate(['danceability', 'energy', 'valence'], 1):
    plt.subplot(1, 3, i)
    sns.histplot(spotify[feature].dropna(), bins=30, kde=True, color='teal')
    plt.title(f'Distribución de {feature.capitalize()}')
    plt.xlabel(feature.capitalize())
    plt.ylabel('Frecuencia')
plt.tight_layout()
plt.show()
# 2. Relationship between Gender and characteristics for recommendations
plt.figure(figsize=(14,8))
top_genres = spotify['track_genre'].value_counts().nlargest(10) #Top 10 genres
sns.boxplot(x='track_genre', y='danceability', data=spotify[spotify['track_genre'].isin(top_genres.index)].dropna(), palette='coolwarm')
plt.title('Danceability por Género de Canción')
plt.xticks(rotation=90)
plt.show()
#3. Explicit Song Count
plt.figure(figsize=(6,6))
sns.countplot(x='explicit', data=spotify, palette='Set2')
plt.title('Explicit vs. Non-Explicit Songs')
plt.xlabel('Explícito')
plt.ylabel('Conteo')
plt.show()

### Histogram Chart Explanation
Danceability Distribution: The image represents a bell shape, meaning that most songs are danceable, suggesting that it does not greatly affect this field when recommending songs for the user who varies in tastes. Energy Distribution: The graph represents it with values skewed to the right side, meaning that the vast majority of songs have high energy, meaning that if a user opts for 'x' song, it is most likely that the recommendation should be with similar or equal energy levels. Valence Distribution: A uniform distribution,
suggesting that the mix of songs has a high and low positivity, but where the concentration of these values is in the middle of the graph. In summary, the feeling or atmosphere of the songs is mostly related to these three fields where the variability of the values is almost zero, allowing the
song search for the user to be successful most of the time, because no distribution graphs with atypical values were found, but rather, their results are generally close and therefore, successful. Danceability boxplot by song genre: The following graph shows the
relationships of the music genres, where it is highlighted that each box shows the distribution of the Danceability field by music genre. As can be seen, genres such as 'Alt-Rock' and 'Afrobeat' have more danceable songs than genres such as 'ambient' or 'blues'. This helps us to recommend music genres
for the user based on their favorite songs, in this way we have the closest and the most distant genres. Number of songs with explicit and non-explicit lyrics: As you can see, the number of non-explicit songs is around 100,000 songs, compared to songs that do contain explicit lyrics, where the value is around 8,000 to 10,000 songs. This option is crucial for song recommendations, since if the user chooses songs with explicit lyrics, the number of recommendations drops drastically compared to choosing songs without explicit lyrics, although it also gives the option to choose from both options. This is accompanied by business question 3, which shows us the exact number of songs with explicit lyrics by music genre. Conclusion: The graphs shown give us a broad resolution that covers themes of atmosphere or feelings that the songs deliver, similarity between music genres based on their danceability, and the number of songs with and without explicit lyrics. This will allow us to define in a more exact way the recommendations that we give to the user based on their tastes.

In [None]:
spotify.info()

In [None]:
# We select the numerical features that we want to use for the regression
X = spotify[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
             'instrumentalness', 'liveness', 'valence', 'tempo']]  # Adjusts according to the available numeric columns

# Target column
y = spotify['popularity']

# Standardize the characteristics
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create and train the LASSO model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

# Get the coefficients (feature importance)
importancia = lasso.coef_

# Create a bar chart
caracteristicas = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                   'instrumentalness', 'liveness', 'valence', 'tempo']  # Your number columns
plt.figure(figsize=(10, 6))
plt.barh(caracteristicas, importancia, color='c')
plt.xlabel("Importance of the feature")
plt.ylabel("Characteristics")
plt.title("Feature Importance Using LASSO")
plt.show()

## Explanation
As you can see, using the LASSO method you can determine which are the most important variables of the Dataset (Target). As an observation, you can see the following:

1) The fields 'loudness' and 'danceability' have positive importance compared to the other fields, but in the case of 'loudness' its influence is less compared to the 'danceability' field.
2) Fields such as 'valance', 'liveness', 'instrumentalness', 'acousticness', 'speechiness' and 'energy' have negative importance, which means that for the search of the target field 'popularity', its influence is negative but noticeable.

Conclusion:

As an assessment, we can highlight 3 variables that influence the prediction of song popularity. Fields such as 'danceability' and 'loudness' have a minor but positive influence. On the other hand, the field that has the most influence is 'instrumentalness', which, being of negative impact, the higher the value of this characteristic, the worse its influence when predicting the target field. Therefore, it has been decided that these 3 fields are the most suitable for prediction, since they produce a greater influence compared to the other fields.

# Feature Removal

Se elimino. 
- Unnamed : 0
- artists
- album_name
- track_id
- track_name
- explicit 
- time_signature
- mode

In [None]:
# List of columns you want to delete
columnas_a_eliminar = ['Unnamed: 0', 'artists', 'album_name', 'track_id', 'track_name', 
                       'explicit', 'time_signature', 'mode']

# Remove columns from the DataFrame
spotify = spotify.drop(columns=columnas_a_eliminar)

# Verificar las primeras filas para confirmar que las columnas fueron eliminadas
spotify.head()


## Exploración nuevo DataSet "spotify"

## Correlation Matrix

In [None]:
# Selección de Datos Numéricos
datosNumericos = spotify.select_dtypes(include=[np.number])

# Calcular la matriz de correlación
matriz_correlacion = datosNumericos.corr()

# Mostrar la matriz de correlación
plt.figure(figsize=(8, 6))
sns.heatmap(matriz_correlacion, annot=True, fmt=".2f", cmap="coolwarm", square=True, cbar_kws={'shrink': .8})
plt.title('Matriz de Correlación')
plt.show()

## Información del DataSet

In [None]:
spotify.info()

## Cantidad total de datos

In [None]:
spotify.dtypes.value_counts()

# Futuro

El objetivo de este proyecto es desarrrollar un sistema de recomendación de música. Utilizando como target popularity y ademas se usara K-Means y K-Vecinos más cercanos (K-NN). Hemos utilizado el procedimiento previo para preparar el conjunto de datos. 
- Escalado de variables: La mayoría de las variables numéricas han sido escaladas. Para asegurar que todas las características sean similares.
- Eliminación de variables categóricas: Las variables categóricas se han eliminado o transformado con el método de Label Encoding, para evitar problemas con los algoritmos que no puedan procesar ese tipo de dato.

El sistema de recomendación funcionará agrupando canciones similares (K-Means) o sugiriendo canciones basadas a las características (K-NN). De esta forma se busca que el sistema pueda recomendar canciones que se alineen con el gusto del usuario.
Cabe recalcar que el método para lograr el objetivo puede cambiar al avanzar con el desarrollo y evaluación de los modelos. Nuestra intención es buscar el mejor modelo para nuestro Recomendador. 

# Save DataSet

In [None]:
rute_cvs_save = os.path.join('..','data','03_primary','3.spotify.csv')
                             
spotify.to_csv (rute_cvs_save, index=False)

In [None]:
# Cargar el archivo CSV
spotify = pd.read_csv(ruta_csv_relativa)
# Reemplazar valores infinitos por NaN en todo el DataFrame
spotify.replace([float('inf'), float('-inf')], pd.NA, inplace=True)
# Configurar estilo de gráficos
sns.set(style="whitegrid")
# 1. Distribución de las características que afectan recomendaciones (danceability, energy, valence)
plt.figure(figsize=(14,6))
# Subplots para cada característica
for i, feature in enumerate(['danceability', 'energy', 'valence'], 1):
    plt.subplot(1, 3, i)
    sns.histplot(spotify[feature].dropna(), bins=30, kde=True, color='teal')
    plt.title(f'Distribución de {feature.capitalize()}')
    plt.xlabel(feature.capitalize())
    plt.ylabel('Frecuencia')
plt.tight_layout()
plt.show()
# 2. Relación entre Género y las características para recomendaciones
plt.figure(figsize=(14,8))
top_genres = spotify['track_genre'].value_counts().nlargest(10)  # Top 10 géneros
sns.boxplot(x='track_genre', y='danceability', data=spotify[spotify['track_genre'].isin(top_genres.index)].dropna(), palette='coolwarm')
plt.title('Danceability por Género de Canción')
plt.xticks(rotation=90)
plt.show()
# 3. Conteo de canciones explícitas
plt.figure(figsize=(6,6))
sns.countplot(x='explicit', data=spotify, palette='Set2')
plt.title('Canciones Explícitas vs No Explícitas')
plt.xlabel('Explícito')
plt.ylabel('Conteo')
plt.show()