## Importemos las cosas importantes!

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Carguemos los datos!
df = pd.read_csv("data.csv")

¡Ahora, saquemos de los datos que no nos importa (¡pero dejamos el identificador!) y lo filtraremos por año también porque crearemos un clasificador de los años 80!

In [3]:
drop_atributos = ["name", "artists", "release_date"]
df.drop(drop_atributos, inplace=True, axis=1)
df = df[(1980<=df.year) & (df.year<=1989)]
y = df["year"]; # nuestra variable a clasificar
X = df[[name for name in df.columns if name not in ["year","id"]]] # el conjunto de datos

Ahora vamos a entrenar a nuestro clasificador con el `DecisionTreeClassifier` con todos los atributos y veremos que sale por el momento!

In [4]:
#Ahora vamos a entrenar a nuestro clasificador con todos los atributos
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=37, stratify=y)

clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train, y_train)    ## Entrenamos con features X_train y clases y_train

y_pred = clf.predict(X_test)   ## Predecimos con nuevos datos (los de test X_test)

print("Accuracy en test set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))## Evaluamos la predicción comparando y_test con y_pred



Accuracy en test set: 0.1435
              precision    recall  f1-score   support

        1980       0.15      0.15      0.15       600
        1981       0.13      0.13      0.13       600
        1982       0.16      0.16      0.16       600
        1983       0.14      0.14      0.14       600
        1984       0.12      0.13      0.13       600
        1985       0.14      0.13      0.14       600
        1986       0.16      0.16      0.16       600
        1987       0.16      0.16      0.16       600
        1988       0.16      0.16      0.16       600
        1989       0.11      0.11      0.11       600

    accuracy                           0.14      6000
   macro avg       0.14      0.14      0.14      6000
weighted avg       0.14      0.14      0.14      6000



Podemos ver que no da un resultado tan bueno, obteniendo que los avg son de 0.14 para todas las metricas

Ahora, sabemos de la exploracion que `energy`, `acousticness` y `popularity` tienen una correlacion con los años, entonces veamos que sucede si solo entrenamos al clasificador con estos parametros 

In [5]:
#metodo para normalizar
def f(x):
    return x/(max(x)-min(x))
X["popularity"]=f(X["popularity"])

X_new = X[["energy","acousticness","popularity"]]
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=.40, random_state=37, stratify=y)



clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train, y_train)    ## Entrenamos con features X_train y clases y_train

y_pred = clf.predict(X_test)   ## Predecimos con nuevos datos (los de test X_test)

print("Accuracy en test set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred)) ## Evaluamos la predicción comparando y_test con y_pred

Accuracy en test set: 0.115375
              precision    recall  f1-score   support

        1980       0.13      0.12      0.13       800
        1981       0.10      0.11      0.11       800
        1982       0.12      0.13      0.13       800
        1983       0.12      0.12      0.12       800
        1984       0.10      0.10      0.10       800
        1985       0.11      0.12      0.11       800
        1986       0.14      0.12      0.13       800
        1987       0.11      0.12      0.11       800
        1988       0.12      0.12      0.12       800
        1989       0.10      0.10      0.10       800

    accuracy                           0.12      8000
   macro avg       0.12      0.12      0.12      8000
weighted avg       0.12      0.12      0.12      8000



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["popularity"]=f(X["popularity"])


Podemos ver que en general la clasificación no muestra buenos resultados, por lo que veremos si es posible obtener mejores resultados a partir de clasificar, con columnas que tengan mayor correlación.

En particular con las propiedades de acousticness y popularity.

In [6]:
columnas = ['acousticness','popularity']
X_new = X[columnas]
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=.40, random_state=37, stratify=y)

clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train, y_train)    ## Entrenamos con features X_train y clases y_train

y_pred = clf.predict(X_test)   ## Predecimos con nuevos datos (los de test X_test)

print("Accuracy en test set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred)) ## Evaluamos la predicción comparando y_test con y_pred

Accuracy en test set: 0.112125
              precision    recall  f1-score   support

        1980       0.11      0.12      0.12       800
        1981       0.10      0.11      0.11       800
        1982       0.10      0.11      0.10       800
        1983       0.13      0.12      0.12       800
        1984       0.10      0.10      0.10       800
        1985       0.12      0.12      0.12       800
        1986       0.12      0.11      0.12       800
        1987       0.12      0.12      0.12       800
        1988       0.11      0.10      0.11       800
        1989       0.11      0.10      0.11       800

    accuracy                           0.11      8000
   macro avg       0.11      0.11      0.11      8000
weighted avg       0.11      0.11      0.11      8000



A continuación veremos los resultados de los parámetros de validación, pero con resultados en promedio.

In [7]:
from sklearn.model_selection import cross_validate

scoring = ['precision_macro', 'recall_macro', 'accuracy', 'f1_macro']
cv_results = cross_validate(
    clf, X, y, cv=10, scoring=scoring, return_train_score=True)

print('Promedio Precision:', np.mean(cv_results['test_precision_macro']))
print('Promedio Recall:', np.mean(cv_results['test_recall_macro']))
print('Promedio F1-score:', np.mean(cv_results['test_f1_macro']))
print('Promedio Accucary:', np.mean(cv_results['test_accuracy']))

Promedio Precision: 0.14564017444746372
Promedio Recall: 0.14579999999999999
Promedio F1-score: 0.14461047992421291
Promedio Accucary: 0.14579999999999999


In [8]:
from sklearn.model_selection import cross_validate

scoring = ['precision_macro', 'recall_macro', 'accuracy', 'f1_macro']
cv_results = cross_validate(
    clf, X_new, y, cv=10, scoring=scoring, return_train_score=True)

print('Promedio Precision:', np.mean(cv_results['test_precision_macro']))
print('Promedio Recall:', np.mean(cv_results['test_recall_macro']))
print('Promedio F1-score:', np.mean(cv_results['test_f1_macro']))
print('Promedio Accucary:', np.mean(cv_results['test_accuracy']))

Promedio Precision: 0.10353676356034973
Promedio Recall: 0.10195000000000001
Promedio F1-score: 0.1012522774557529
Promedio Accucary: 0.10194999999999999


In [9]:
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
def run_classifier(clf, X, y, num_tests=100):
    metrics = {'f1-score': [], 'precision': [], 'recall': []}
    
    for _ in range(num_tests):
        #dividiendo los datos de entrenamiento y validación
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30)
        
        ### INICIO COMPLETAR ACÁ 
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        
        #### TIP: en base a los set de entrenamiento, genere la variable 'predictions' 
        #### que contiene las predicciones del modelo
        
       
        
        ### FIN COMPLETAR ACÁ
        
        metrics['y_pred'] = predictions
        metrics['f1-score'].append(f1_score(y_test, predictions, average='micro')) 
        metrics['recall'].append(recall_score(y_test, predictions, average='micro'))
        metrics['precision'].append(precision_score(y_test, predictions, average='micro'))
    return metrics

A continuación utilizaremos otros clasificador, para ver si podemos obtener mejores resultados con nuestro database.

In [18]:
from sklearn.naive_bayes import GaussianNB  # naive bayes
from sklearn.neighbors import KNeighborsClassifier #kNN
from sklearn.svm import SVC  # support vector machine

c0 = ("Gaussian Naive Bayes", GaussianNB())
c1 = ("KNN", KNeighborsClassifier(n_neighbors=10))
#c2 = ("Support Vector Machines", SVC())

classifiers = [c0, c1,]# c2]

results = {}

for name, clf in classifiers:
    metrics = run_classifier(clf, X_new, y)   # hay que implementarla en el bloque anterior.
    results[name] = metrics
    print("----------------")
    print("Resultados para clasificador: ", name) 
    print("Precision promedio:", np.array(metrics['precision']).mean())
    print("Recall promedio:", np.array(metrics['recall']).mean())
    print("F1-score promedio:", np.array(metrics['f1-score']).mean())
    print("")
    print("----------------\n\n")  

----------------
Resultados para clasificador:  Gaussian Naive Bayes
Precision promedio: 0.11316166666666665
Recall promedio: 0.11316166666666665
F1-score promedio: 0.11316166666666665

----------------


----------------
Resultados para clasificador:  KNN
Precision promedio: 0.10951666666666666
Recall promedio: 0.10951666666666666
F1-score promedio: 0.10951666666666665

----------------


