<a href="https://colab.research.google.com/github/2020Nina/TP_2_SPOTIFY_recupera/blob/main/Spotify_Corregido_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
path = '/content/drive/MyDrive/Colab Notebooks/SPOTIFY_CANCIONES/'
filename = 'Canciones_Spotify.csv'
fullname = path + filename
df = pd.read_csv(fullname)

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,1,Mask Off,Future
1,1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588,1,Redbone,Childish Gambino
2,2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173,1,Xanny Family,Future
3,3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23,1,Master Of None,Beach House
4,4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904,1,Parallel Lines,Junior Boys


In [12]:
X = df.drop(["target", "song_title", "artist", "Unnamed: 0"], axis=1)
Y = df["target"]

# División en entrenamiento y prueba
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Escalamiento de datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Inicializar clasificadores
knn_classifier = KNeighborsClassifier()
svm_classifier = SVC()
dt_classifier = DecisionTreeClassifier()
nb_classifier = GaussianNB()
rf_classifier = RandomForestClassifier()

# Crear un clasificador de votación
voting_classifier = VotingClassifier(estimators=[
    ('knn', knn_classifier),
    ('svm', svm_classifier),
    ('dt', dt_classifier),
    ('nb', nb_classifier),
    ('rf', rf_classifier)
], voting='hard')

# Lista de clasificadores
clasificadores = [knn_classifier, svm_classifier, dt_classifier, nb_classifier, rf_classifier, voting_classifier]

# Realizar validación cruzada con KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for clf in clasificadores:
    # Obtener las predicciones de la validación cruzada
    Y_pred_cv = cross_val_predict(clf, X_train_scaled, Y_train, cv=kf)

    # Calcular y mostrar la matriz de confusión
    cm = confusion_matrix(Y_train, Y_pred_cv)
    print(f"Clasificador: {clf.__class__.__name__}")
    print("Matriz de Confusión:")
    print(cm)

    # Calcular y mostrar la precisión, recall y F1-score
    precision = precision_score(Y_train, Y_pred_cv)
    recall = recall_score(Y_train, Y_pred_cv)
    f1 = f1_score(Y_train, Y_pred_cv)

    print(f"Precisión: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print("="*40)

Clasificador: KNeighborsClassifier
Matriz de Confusión:
[[594 197]
 [313 509]]
Precisión: 0.7209631728045326
Recall: 0.6192214111922141
F1-score: 0.6662303664921465
Clasificador: SVC
Matriz de Confusión:
[[624 167]
 [257 565]]
Precisión: 0.7718579234972678
Recall: 0.6873479318734793
F1-score: 0.7271557271557273
Clasificador: DecisionTreeClassifier
Matriz de Confusión:
[[534 257]
 [264 558]]
Precisión: 0.6846625766871166
Recall: 0.6788321167883211
F1-score: 0.681734880879658
Clasificador: GaussianNB
Matriz de Confusión:
[[458 333]
 [236 586]]
Precisión: 0.6376496191512514
Recall: 0.7128953771289538
F1-score: 0.6731763354394027
Clasificador: RandomForestClassifier
Matriz de Confusión:
[[602 189]
 [197 625]]
Precisión: 0.7678132678132679
Recall: 0.7603406326034063
F1-score: 0.7640586797066016
Clasificador: VotingClassifier
Matriz de Confusión:
[[619 172]
 [231 591]]
Precisión: 0.7745740498034076
Recall: 0.718978102189781
F1-score: 0.7457413249211357


In [14]:
# Definir los espacios de búsqueda de hiperparámetros para cada clasificador
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Grid Search
grid_search_knn = GridSearchCV(knn_classifier, param_grid_knn, cv=5, scoring='accuracy')
grid_search_svm = GridSearchCV(svm_classifier, param_grid_svm, cv=5, scoring='accuracy')
grid_search_dt = GridSearchCV(dt_classifier, param_grid_dt, cv=5, scoring='accuracy')
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')

# Realizar la búsqueda de hiperparámetros
grid_search_knn.fit(X_train_scaled, Y_train)
grid_search_svm.fit(X_train_scaled, Y_train)
grid_search_dt.fit(X_train_scaled, Y_train)
grid_search_rf.fit(X_train_scaled, Y_train)

# Imprimir los mejores parámetros para cada modelo
print("Mejores parámetros para KNeighborsClassifier:", grid_search_knn.best_params_)
print("Mejores parámetros para SVC:", grid_search_svm.best_params_)
print("Mejores parámetros para DecisionTreeClassifier:", grid_search_dt.best_params_)
print("Mejores parámetros para RandomForestClassifier:", grid_search_rf.best_params_)

# Random Search
random_search_knn = RandomizedSearchCV(knn_classifier, param_distributions=param_grid_knn, n_iter=100, cv=5, scoring='accuracy', random_state=42)
random_search_svm = RandomizedSearchCV(svm_classifier, param_distributions=param_grid_svm, n_iter=100, cv=5, scoring='accuracy', random_state=42)
random_search_dt = RandomizedSearchCV(dt_classifier, param_distributions=param_grid_dt, n_iter=100, cv=5, scoring='accuracy', random_state=42)
random_search_rf = RandomizedSearchCV(rf_classifier, param_distributions=param_grid_rf, n_iter=100, cv=5, scoring='accuracy', random_state=42)

# Realizar la búsqueda de hiperparámetros
random_search_knn.fit(X_train_scaled, Y_train)
random_search_svm.fit(X_train_scaled, Y_train)
random_search_dt.fit(X_train_scaled, Y_train)
random_search_rf.fit(X_train_scaled, Y_train)

# Imprimir los mejores parámetros para cada modelo
print("Mejores parámetros para KNeighborsClassifier (Random Search):", random_search_knn.best_params_)
print("Mejores parámetros para SVC (Random Search):", random_search_svm.best_params_)
print("Mejores parámetros para DecisionTreeClassifier (Random Search):", random_search_dt.best_params_)
print("Mejores parámetros para RandomForestClassifier (Random Search):", random_search_rf.best_params_)

Mejores parámetros para KNeighborsClassifier: {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
Mejores parámetros para SVC: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Mejores parámetros para DecisionTreeClassifier: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
Mejores parámetros para RandomForestClassifier: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}




Mejores parámetros para KNeighborsClassifier (Random Search): {'weights': 'distance', 'p': 1, 'n_neighbors': 7}
Mejores parámetros para SVC (Random Search): {'kernel': 'rbf', 'gamma': 'scale', 'C': 1}
Mejores parámetros para DecisionTreeClassifier (Random Search): {'splitter': 'best', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10, 'criterion': 'gini'}
Mejores parámetros para RandomForestClassifier (Random Search): {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}


In [15]:

def train_and_evaluate(model, model_name, X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    print("="*40)
    print(f"\nModelo: {model_name}")
    print("Matriz de Confusión:")
    print(confusion_matrix(Y_test, Y_pred))
    print("Precisión:", accuracy_score(Y_test, Y_pred))
    print("Recall:", recall_score(Y_test, Y_pred))
    print("F1-score:", f1_score(Y_test, Y_pred))

# Modelos con los mejores parámetros obtenidos de Grid Search
knn_model_grid = KNeighborsClassifier(**grid_search_knn.best_params_)
svm_model_grid = SVC(**grid_search_svm.best_params_)
dt_model_grid = DecisionTreeClassifier(**grid_search_dt.best_params_)
rf_model_grid = RandomForestClassifier(**grid_search_rf.best_params_)

# Modelos con los mejores parámetros obtenidos de Random Search
knn_model_random = KNeighborsClassifier(**random_search_knn.best_params_)
svm_model_random = SVC(**random_search_svm.best_params_)
dt_model_random = DecisionTreeClassifier(**random_search_dt.best_params_)
rf_model_random = RandomForestClassifier(**random_search_rf.best_params_)

# Inicializar y entrenar modelos con los mejores parámetros obtenidos de Grid Search
train_and_evaluate(knn_model_grid, 'KNeighborsClassifier (Grid)', X_train_scaled, Y_train, X_test_scaled, Y_test)
train_and_evaluate(svm_model_grid, 'SVC (Grid)', X_train_scaled, Y_train, X_test_scaled, Y_test)
train_and_evaluate(dt_model_grid, 'DecisionTreeClassifier (Grid)', X_train_scaled, Y_train, X_test_scaled, Y_test)
train_and_evaluate(rf_model_grid, 'RandomForestClassifier (Grid)', X_train_scaled, Y_train, X_test_scaled, Y_test)

# Inicializar y entrenar modelos con los mejores parámetros obtenidos de Random Search
train_and_evaluate(knn_model_random, 'KNeighborsClassifier (Random)', X_train_scaled, Y_train, X_test_scaled, Y_test)
train_and_evaluate(svm_model_random, 'SVC (Random)', X_train_scaled, Y_train, X_test_scaled, Y_test)
train_and_evaluate(dt_model_random, 'DecisionTreeClassifier (Random)', X_train_scaled, Y_train, X_test_scaled, Y_test)
train_and_evaluate(rf_model_random, 'RandomForestClassifier (Random)', X_train_scaled, Y_train, X_test_scaled, Y_test)

# Modelos de votación
voting_model_grid = VotingClassifier(estimators=[
    ('knn', knn_model_grid),
    ('svm', svm_model_grid),
    ('dt', dt_model_grid),
    ('rf', rf_model_grid)
], voting='hard')

voting_model_random = VotingClassifier(estimators=[
    ('knn', knn_model_random),
    ('svm', svm_model_random),
    ('dt', dt_model_random),
    ('rf', rf_model_random)
], voting='hard')

# Entrenar modelos de votación
train_and_evaluate(voting_model_grid, 'VotingClassifier (Grid)', X_train_scaled, Y_train, X_test_scaled, Y_test)
train_and_evaluate(voting_model_random, 'VotingClassifier (Random)', X_train_scaled, Y_train, X_test_scaled, Y_test)


Modelo: KNeighborsClassifier (Grid)
Matriz de Confusión:
[[172  34]
 [ 76 122]]
Precisión: 0.7277227722772277
Recall: 0.6161616161616161
F1-score: 0.6892655367231638

Modelo: SVC (Grid)
Matriz de Confusión:
[[164  42]
 [ 64 134]]
Precisión: 0.7376237623762376
Recall: 0.6767676767676768
F1-score: 0.7165775401069518

Modelo: DecisionTreeClassifier (Grid)
Matriz de Confusión:
[[153  53]
 [ 56 142]]
Precisión: 0.7301980198019802
Recall: 0.7171717171717171
F1-score: 0.7226463104325699

Modelo: RandomForestClassifier (Grid)
Matriz de Confusión:
[[156  50]
 [ 40 158]]
Precisión: 0.7772277227722773
Recall: 0.797979797979798
F1-score: 0.7783251231527094

Modelo: KNeighborsClassifier (Random)
Matriz de Confusión:
[[172  34]
 [ 76 122]]
Precisión: 0.7277227722772277
Recall: 0.6161616161616161
F1-score: 0.6892655367231638

Modelo: SVC (Random)
Matriz de Confusión:
[[164  42]
 [ 64 134]]
Precisión: 0.7376237623762376
Recall: 0.6767676767676768
F1-score: 0.7165775401069518

Modelo: DecisionTreeClas