In [17]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np

urlTrain  = 'https://raw.githubusercontent.com/CyberJuan55/Proyecto-IA/master/DataSet/SpotifyFeatures_train.csv'
urlFull = 'https://raw.githubusercontent.com/CyberJuan55/Proyecto-IA/master/DataSet/SpotifyFeatures.csv'
# el train que nos dieron los profes
#df_train_original= pd.read_csv('/content/SpotifyFeatures_train.csv')
df_train_original = pd.read_csv(urlTrain)
df_train = df_train_original.copy()
# el dataset nuestro
df_full_original = pd.read_csv(urlFull)
df_full = df_full_original.copy()


# Exploracion de datos

In [18]:
df_train.dtypes

Unnamed: 0            int64
genre                object
artist_name          object
track_name           object
track_id             object
popularity            int64
acousticness        float64
danceability        float64
duration_ms           int64
energy              float64
instrumentalness    float64
key                  object
liveness            float64
loudness            float64
mode                 object
speechiness         float64
tempo               float64
time_signature       object
valence             float64
dtype: object

In [19]:
df_train['genre'].value_counts()

Comedy              7771
Soundtrack          7758
Jazz                7634
Indie               7603
Children’s Music    7554
Pop                 7532
Electronic          7500
Rock                7455
Hip-Hop             7422
Folk                7406
Classical           7395
Rap                 7367
Alternative         7342
Soul                7280
R&B                 7242
World               7211
Blues               7187
Ska                 7154
Reggaeton           7139
Anime               7131
Reggae              7019
Dance               6950
Country             6861
Opera               6612
Movie               6261
Children's Music    4298
A Capella             96
Name: genre, dtype: int64

Vemos que la columna Children esta repetida

# Variables Categoricas

In [20]:
# Variable categoricas
s = (df_train.dtypes == 'object')
object_cols = list(s[s].index)
print(object_cols)

['genre', 'artist_name', 'track_name', 'track_id', 'key', 'mode', 'time_signature']


In [21]:
list_of_timeSignature = df_train['time_signature'].unique()
print(list_of_timeSignature)

['4/4' '3/4' '5/4' '1/4' '0/4']


In [22]:
df_train['time_signature'].value_counts()

4/4    160645
3/4     19247
5/4      4198
1/4      2083
0/4         7
Name: time_signature, dtype: int64

Decidimos no utilizar Time_signature como feature por varios motivos


*   En musica no existe compaces de 1/4 ni de 0/4 entre ambos suman 1656 valores lo que nos parecio un numero considerable
*   En una cancion puede haber multiples cambios de compaces, en la musica progresiva es algo muy comun

*   La mayoria de la musica se encuentra en 4/4 podemos ver que hay 156851 valores que considerablemente mayor a lo suma de todas las otras categorias esto introduceria un gran sesgo en nuestro modelo 










In [23]:
# variables categoricas con baja cardinalidad
categorical_cols = [cname for cname in df_train.columns if
                    df_train[cname].nunique() < 14 and 
                    df_train[cname].dtype == "object"]
print(categorical_cols)

['key', 'mode', 'time_signature']


Quedan descartadas las variables categoricas: 

*   Time_signature por los motivos comentados anteriormente
*   track_id: no hay ninguna razon para utilizarla
*   el nombre del artista: son demasiadas variables de momento las sacamos pero es cierto que es un feature interesente 
*   track_name: todos sus valores son distintos 







In [24]:
list_of_modes = df_train['mode'].unique()
print(list_of_modes)

['Major' 'Minor']


In [25]:
list_of_keys = df_train['key'].unique()
print(list_of_keys)


['B' 'A#' 'C#' 'C' 'F#' 'E' 'A' 'D' 'G' 'F' 'G#' 'D#']


### Preprocesado de datos

In [26]:
# Funcion para el preprocesamiento de datos


def preprocesado(df):

    

    if df.duplicated().sum() == 0:
        print('no hay datos duplicados')
    else:
        df.drop_duplicates


    if 'Unnamed: 0' in df.columns:
        df = df.drop(['Unnamed: 0'], axis=1)
    else:
        print('no esta')  
    df = df.drop_duplicates(subset=['track_name','artist_name'],keep=False)
    df = df.drop(['time_signature','track_id','artist_name','track_name'], axis=1)
    df['genre']= df['genre'].replace(["Children’s Music"],"Children's Music")
    indexNamesChildren = df[ df['genre'] == "Children's Music" ].index
    df.drop(indexNamesChildren , inplace=True)
    
    cols_with_missing = [col for col in df.columns if df[col].isnull().any()] 
    faltanDatos = True
    if len(cols_with_missing)==0:
        faltanDatos= False
        print('no faltan valores') 
    if faltanDatos == True:
        print(cols_with_missing)
        df.dropna(axis=0, inplace=True)

    X = df.drop(columns=['genre'])
    y = df['genre']
    

    return X,y 


In [27]:
df_train,y = preprocesado(df_train)

no hay datos duplicados
no faltan valores


In [28]:
df_train_num = df_train.drop(["key",'mode'], axis=1)

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])
cat_attribs1 = ["key"]
cat_attribs2 = ["mode"]
num_attribs = list(df_train_num)

full_pipeline = ColumnTransformer([
        ('numerical', num_pipeline, num_attribs),                          
        ("cat", OneHotEncoder(drop='first'), cat_attribs1),
        ("cat1", OrdinalEncoder(), cat_attribs2)
    ])


In [30]:
df_train_processed = full_pipeline.fit_transform(df_train)

In [31]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
X_train_full, X_valid_full, y_train, y_valid = train_test_split(df_train_processed, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=42)

In [32]:
lg=LogisticRegression(max_iter=2000, random_state=1, penalty = 'l2', C = 0.01) #onevsrestclassifier
lg.fit(X_train_full, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
y_train_pred = lg.predict(X_valid_full)


In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_valid, y_train_pred))

0.4665081081081081


In [35]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=30, random_state=42)
rfc.fit(X_train_full, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [37]:
predict = rfc.predict(X_valid_full)
print(accuracy_score(y_valid, predict))

0.5494054054054054


In [40]:
# Numero de arboles
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 8)]
# Numero de features considerado al dividir
max_features = ['auto', 'sqrt']
# numero maximo de niveles
max_depth = [2,4]
# Numero minimo de niveles por nodo
min_samples_split = [2, 5]
# Numero minimo de niveles por hoja
min_samples_leaf = [1, 2]
# metodo de seleccion por arbol 
bootstrap = [True, False]

In [41]:
print(n_estimators)

[10, 20, 30, 40, 50, 60, 70, 80]


In [42]:
# Grilla
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [43]:
from sklearn.model_selection import GridSearchCV
rfc_Grid = GridSearchCV(estimator = rfc, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4)

In [None]:
rfc_Grid.fit(X_train_full, y_train)


Fitting 3 folds for each of 256 candidates, totalling 768 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   47.8s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  4.0min
