# Pair Programming Métricas

En el pair programming anterior creastéis vuestro primer modelo de machine learning usando la regresion Lineal. Es el momento, que con vuestros datos evaluéis si es bueno haciendo predicciones. Los objetivo de este pairprogramming son:

In [11]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd
# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
#  Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
#  Crossvalidation
# ------------------------------------------------------------------------------
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics

In [12]:
#abrimos los csv donde hemos guardado los df de entrenamiento, test y resultados de los residuos:
df_train= pd.read_csv('datos/datos_train.csv')

df_test = pd.read_csv('datos/datos_test.csv')

resultados = pd.read_csv('datos/resultados-RL.csv')

In [13]:

df = pd.read_csv('datos/diamonds_def.csv', index_col=0)

### 1. Calculéis las métricas para vuestro modelo


In [14]:
#vamos a comprobar primero qué análisis podemos utilizar
metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'matthews_corrcoef', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weig

In [15]:
#separamos la VR del df:
X = df.drop("est_carat", axis = 1)
y = df["est_carat"]

In [16]:
#Comprobamos que la variable respuesta ha sido eliminida
X.head()

Unnamed: 0,est_depth,est_table,est_price,est_lenght_mm,est_width_mm,est_depth_mm,cut_encoded,clarity_encoded,color_encoded
1,-1.597233,1.661056,-0.986397,-1.646799,-1.705394,-1.780396,5,3,2
2,0.042616,3.519383,-0.986075,-1.503737,-1.498268,-1.780396,4,1,2
3,0.534571,0.26731,-0.98382,-1.369617,-1.35418,-1.316852,2,4,2
4,1.272504,0.26731,-0.983498,-1.244438,-1.246114,-1.143024,4,2,6
5,0.862541,-0.197272,-0.983175,-1.602092,-1.597328,-1.534138,2,3,7


In [17]:
y.head()

1   -1.268160
2   -1.224362
3   -1.092965
4   -1.049167
5   -1.202462
Name: est_carat, dtype: float64

In [18]:
#Separamos los datos de entrenamiento y los datos de prueba 
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [19]:
#creamos el método con la regresión lineal
lr = LinearRegression()
lr.fit(x_train, y_train)

In [20]:
# hacemos las predicciones para el modelo:
y_predict_train = lr.predict(x_train) 
y_predict_test = lr.predict(x_test)

In [21]:
#almacenamos en un df para observar las diferencias entre lo observado y lo esperado:
df_train = pd.DataFrame({'Real': y_train, 'Predicted': y_predict_train, 'Set': ['Train']*len(y_train)})
df_test  = pd.DataFrame({'Real': y_test,  'Predicted': y_predict_test,  'Set': ['Test']*len(y_test)})
resultados = pd.concat([df_train,df_test], axis = 0)
resultados.head()

Unnamed: 0,Real,Predicted,Set
8427,0.045802,0.246916,Train
47965,-0.633079,-0.564571,Train
49194,-0.479783,-0.373481,Train
26207,1.951048,1.563968,Train
23010,1.994846,1.700329,Train


In [22]:
#exploramos los residuos:
resultados['residuos'] = resultados['Real'] - resultados['Predicted']
resultados.head()

Unnamed: 0,Real,Predicted,Set,residuos
8427,0.045802,0.246916,Train,-0.201114
47965,-0.633079,-0.564571,Train,-0.068507
49194,-0.479783,-0.373481,Train,-0.106302
26207,1.951048,1.563968,Train,0.38708
23010,1.994846,1.700329,Train,0.294517


In [23]:
cv_scores = cross_validate(
                estimator = LinearRegression(),
                X         = X,
                y         = y,
                scoring   = ('r2', 'neg_root_mean_squared_error'), 
                cv        = 10) #nº de veces que realizamos la comprobación

#volcamos a un df para facilitar su legibilidad:
cv_scores = pd.DataFrame(cv_scores)
cv_scores #el valor no es realmente negativo, sino positivo

Unnamed: 0,fit_time,score_time,test_r2,test_neg_root_mean_squared_error
0,0.017345,0.001667,0.883891,-0.155599
1,0.008562,0.00113,0.936674,-0.13079
2,0.01012,0.001358,0.94498,-0.142904
3,0.008801,0.001285,0.923105,-0.219361
4,0.006562,0.001006,0.660501,-0.657631
5,0.007917,0.001078,0.849918,-0.444632
6,0.007545,0.001026,0.046646,-0.124515
7,0.010895,0.001055,0.786826,-0.093611
8,0.008042,0.000974,0.735276,-0.123779
9,0.00683,0.001745,0.763501,-0.159169


In [24]:
print(f'la media del R2 es = {round(cv_scores["test_r2"].mean(),2)}')

#usamos 'abs' para que nos devuelva el valor absoluto, es decir, que no aparezca negativo
print(f'la media del RMSE es = {round(abs(cv_scores["test_neg_root_mean_squared_error"].mean()),2)}')

la media del R2 es = 0.75
la media del RMSE es = 0.23


In [25]:
resultados_metricas = {'MAE': [mean_absolute_error(y_test, y_predict_test), mean_absolute_error(y_train, y_predict_train)],
                'MSE': [mean_squared_error(y_test, y_predict_test), mean_squared_error(y_train, y_predict_train)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_predict_test)), np.sqrt(mean_squared_error(y_train, y_predict_train))],
                'R2':  [r2_score(y_test, y_predict_test), r2_score(y_train, y_predict_train)],
                 "set": ["test", "train"], 
                 "modelo": ["Linear Regresion", "LinearRegression"]}

df_resultados = pd.DataFrame(resultados_metricas)

df_resultados

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.151985,0.074438,0.272832,0.925224,test,Linear Regresion
1,0.149743,0.071913,0.268167,0.928142,train,LinearRegression


In [30]:
# Para el conjunto de entrenamiento las méticas han sido
print('En TRAIN:')
print('- El valor de r2 score es',r2_score(y_train,y_predict_train))
print('- MAE =',mean_absolute_error(y_train,y_predict_train))
print('- MSE =',mean_squared_error(y_train,y_predict_train))
print('- RMSE =',np.sqrt(mean_squared_error(y_train,y_predict_train)))
# Para el conjunto de test las métricas han sido
print('EN TEST:')
print('- El valor de r2 score es',r2_score(y_test,y_predict_test))
print('- El MAE es',mean_absolute_error(y_test,y_predict_test))
print('- El MSE es',mean_squared_error(y_test,y_predict_test))
print('- EL RMSE es ',np.sqrt(mean_squared_error(y_test,y_predict_test)))

En TRAIN:
- El valor de r2 score es 0.928142480548804
- MAE = 0.14974321765071372
- MSE = 0.07191349454357572
- RMSE = 0.26816691545299864
EN TEST:
- El valor de r2 score es 0.9252242032727056
- El MAE es 0.15198477314797226
- El MSE es 0.07443755695049943
- EL RMSE es  0.2728324704841772


### 2. Discutid los resultados de las métricas y extraed conclusiones.


Observamos que los resultados obtenidos en R² son de 92%, significa que que todas nuestras variables predictoras explican un 92% de la variación que encontramos en la variable respuesta, y el 8% restante  de la variación no está explicado por ninguna de las variables que tenemos.

### 3. Guardad los resultados de las métricas en un csv para usarlo más adelante.

In [29]:
# vamos a guardar este csv para usarlo en próximas lecciones

df_resultados.to_csv("datos/diamonds_rdos_metricas.csv")