# LIMPIEZA DE DATOS PARA EL MODELO SELECCIONADO

### Preprocesamiento de Datos NØIZE 

## 1. Importación de Bibliotecas

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

## 2. Carga de Datos

In [3]:
# Read the raw CSV, using its first column as the DataFrame index.
ruta_datos = "../data/raw/dataset.csv"
df = pd.read_csv(ruta_datos, index_col=0)
print("Cargando y limpiando datos...")

Cargando y limpiando datos...


In [4]:
# Quick visual check of the first three rows right after loading.
df.head(n=3)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic


In [34]:
# Report the size of the loaded frame (rows, columns) and preview its first rows.
n_registros, n_caracteristicas = df.shape
print("\nInformación del dataset inicial:")
print("- Total de registros:", n_registros)
print("- Total de características:", n_caracteristicas)
print("\nMuestra de datos (primeras 5 filas):")
print(df.head(5))


Información del dataset inicial:
- Total de registros: 114000
- Total de características: 20

Muestra de datos (primeras 5 filas):
                 track_id                 artists  \
0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2       

## 3. Limpieza Inicial de Datos

In [None]:
# Keep the pre-cleaning row count; the reporting cell subtracts the
# post-cleaning count from it.
registros_iniciales = df.shape[0]

# Both calls mutate df in place: first drop rows containing any missing value,
# then drop rows that exactly repeat an earlier row.
df.dropna(axis=0, how="any", inplace=True)
df.drop_duplicates(keep="first", inplace=True)


In [36]:
# Report how many rows the cleaning step removed and how many remain.
# registros_iniciales was captured before BOTH dropna and drop_duplicates ran,
# so the difference counts rows removed for either reason; the label says so
# instead of attributing every removed row to missing values.
registros_eliminados = registros_iniciales - len(df)
print("\nResultados de limpieza:")
print("- Registros eliminados (nulos y duplicados):", registros_eliminados)
print("- Registros restantes:", len(df))


Resultados de limpieza:
- Registros eliminados por valores nulos: 451
- Registros restantes: 113549


## 4. Preprocesamiento Básico
### Transformaciones básicas en las columnas del dataset.

Convertir la columna 'explicit' a numérica

In [37]:
# Cast the 'explicit' flag to integers (the preview shows boolean values,
# so False -> 0 and True -> 1).
explicit_como_entero = df["explicit"].astype(int)
df["explicit"] = explicit_como_entero

Convertir la duración de milisegundos a minutos

In [38]:
# 60 000 ms per minute: derive the track duration in minutes from duration_ms.
MS_POR_MINUTO = 60000
df["duration_min"] = df["duration_ms"].div(MS_POR_MINUTO)

Codificar los géneros musicales como enteros

In [None]:
# Map each distinct genre label to an integer code. The fitted encoder stays
# in `le`, so the code -> label mapping is available via le.classes_.
le = LabelEncoder()
codigos_genero = le.fit_transform(df["track_genre"])
df["track_genre_encoded"] = codigos_genero

## 5. Ingeniería de Características
### Nuevas características (features) que podrían ser relevantes para el modelo.

In [40]:
# Interaction features: element-wise products of two existing audio columns.
df['energy_loudness'] = df['energy'].mul(df['loudness'])
df['dance_valence'] = df['danceability'].mul(df['valence'])

# Ratio feature. The 0.001 offset on the denominator avoids dividing by zero
# when acousticness is 0 (assumes acousticness is non-negative - TODO confirm).
denominador_acustico = df['acousticness'].add(0.001)
df['speech_to_acoustic'] = df['speechiness'].div(denominador_acustico)

Correlación de las nuevas características con la popularidad

In [41]:
# Correlation of each engineered feature with 'popularity': build the
# correlation matrix over the four columns, take the 'popularity' column and
# drop its trivial self-correlation entry.
new_features = ['energy_loudness', 'dance_valence', 'speech_to_acoustic']
matriz_correlacion = df[new_features + ['popularity']].corr()
correlacion_con_popularidad = matriz_correlacion['popularity'].drop('popularity')
print(correlacion_con_popularidad)

energy_loudness       0.042470
dance_valence        -0.036484
speech_to_acoustic   -0.063771
Name: popularity, dtype: float64


## 6. Eliminación de Columnas No Necesarias

In [42]:
# Columns removed from the processed frame: identifier/text columns, plus the
# two columns that already have derived replacements
# (track_genre -> track_genre_encoded, duration_ms -> duration_min).
columnas_eliminar = [
    "track_id",
    "artists",
    "album_name",
    "track_name",
    "track_genre",
    "duration_ms",
]
# In-place drop along the column axis.
df.drop(labels=columnas_eliminar, axis="columns", inplace=True)

## 7. Guardado de Datos Procesados

In [43]:
# Write the processed frame to CSV without the index column.
# to_csv does not create missing parent directories, so make sure the target
# directory exists first (a fresh checkout may not have data/processed yet).
ruta_salida = "../data/processed/dataset_clean.csv"
os.makedirs(os.path.dirname(ruta_salida), exist_ok=True)
df.to_csv(ruta_salida, index=False)

In [44]:
# Final shape of the processed frame: rows kept and columns remaining.
n_filas, n_columnas = df.shape
print("\nResumen final:")
print("- Total de registros:", n_filas)
print("- Total de características finales:", n_columnas)


Resumen final:
- Total de registros: 113549
- Total de características finales: 19
