In [1]:
import os
from math import sqrt
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, metrics
from sklearn import preprocessing
from sklearn.dummy import DummyRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Ejemplo tonto KFold CV

Init signature: KFold(n_splits=5, *, shuffle=False, random_state=None)


Provides train/test indices to split data in train/test sets. Split
dataset into k consecutive folds (without shuffling by default).

Each fold is then used once as a validation while the k - 1 remaining
folds form the training set.


Parameters
----------
- **n_splits** : int, default=5
    Number of folds. Must be at least 2.




- **shuffle** : bool, default=False
    Whether to shuffle the data before splitting into batches.
    Note that the samples within each split will not be shuffled.
    
    

- **random_state** : int, RandomState instance or None, default=None
    When `shuffle` is True, `random_state` affects the ordering of the
    indices, which controls the randomness of each fold. Otherwise, this
    parameter has no effect.
    Pass an int for reproducible output across multiple function calls.

In [2]:
# Toy data: ten labelled items, used only to show which indices KFold assigns to each fold.
X = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]

# shuffle=True picks the test folds at random instead of reading the data in order.
# random_state pins that shuffle so the folds are identical on every run,
# matching the seed used by the other KFold calls in this notebook.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# split() is lazy: printing it shows a generator object, not the folds themselves.
bolsas = kf.split(X)
print(bolsas)

<generator object _BaseKFold.split at 0x00000223F62CC9E0>


In [3]:
# Print the train/test indices of every fold, numbering the folds from 1.
# bolsas is a one-shot generator: this loop consumes it, so re-running the cell
# without rebuilding bolsas prints nothing.
for k, (train, test) in enumerate(bolsas, start=1):
    print("Iteracion", k, ":")
    # Wrap the array in a 1-tuple so %-formatting always treats it as a single value.
    print(" - Entrenamiento: %s" % (train,))
    print(" - Test: %s" % (test,))

Iteracion 1 :
 - Entrenamiento: [0 1 3 4 6 7 8 9]
 - Test: [2 5]
Iteracion 2 :
 - Entrenamiento: [1 2 3 4 5 6 7 9]
 - Test: [0 8]
Iteracion 3 :
 - Entrenamiento: [0 1 2 5 6 7 8 9]
 - Test: [3 4]
Iteracion 4 :
 - Entrenamiento: [0 2 3 4 5 6 7 8]
 - Test: [1 9]
Iteracion 5 :
 - Entrenamiento: [0 1 2 3 4 5 8 9]
 - Test: [6 7]


# REGRESIÓN
## cargando el dataset

In [4]:
# Location of the Boston Housing CSV. The original local path stays as the default,
# so existing runs are unaffected. On another machine, set the BOSTON_HOUSING_CSV
# environment variable to the file's location instead of editing this cell.
path = os.environ.get(
    'BOSTON_HOUSING_CSV',
    'C:/Users/plane/OneDrive/Escritorio/COMPUTING SCIENCE/MASTER Inteligencia artificial/05_Aprendizaje_supervisado/data/BostonHousing.csv',
)

data = pd.read_csv(path, sep=',')

# Preview the first rows as the cell's rich output.
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## separando variables dependientes e independientes

In [5]:
# 'medv' is the dependent variable; every remaining column is a predictor.
y = data['medv']
X = data.drop(columns='medv')

# Feature-matrix shape first, then target shape.
print(X.shape, y.shape, sep='\n')

(506, 13)
(506,)


## Hold out split

In [6]:
# Hold-out split: 20% of the rows are reserved for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Shapes in order: train features, train target, test features, test target.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(404, 13)
(404,)
(102, 13)
(102,)


## escalando los datos

In [7]:
# Scale the features. The scaler is fitted on the training split only and then
# applied unchanged to the test split, so the test data never influences the
# preprocessing.
scl = StandardScaler()
X_train = scl.fit_transform(X_train)
# Transform the held-out features (not the training ones) so X_test stays
# row-aligned with y_test.
X_test = scl.transform(X_test)



## Cross validate

Cross validate devuelve un diccionario con:

- fit_time: el tiempo que tarda en entrenar cada uno de los modelos. key = fit_time, value = un array con el valor de cada fold
- score_time: el tiempo que tarda en calcular las métricas que le indiquemos. key = score_time, value = un array con el valor de cada fold
- test_score: los resultados de las métricas que le hemos dado en el argumento scoring. key = test_<nombre de la métrica>, value = un array con los valores de esa métrica para cada fold


- **ACEPTA VARIAS MÉTRICAS**
- **DEVUELVE UN DICCIONARIO** con las keys nombre de la metrica y value un array con los valores obtenidos de cada fold

In [8]:
# Linear regression estimator with a fitted intercept; the cross-validation cells reuse it.
reg = linear_model.LinearRegression(fit_intercept=True)

In [9]:
# 5-fold cross-validation of the regressor on the training split, scoring two
# metrics per fold. The result is a dict holding 'fit_time', 'score_time' and
# one 'test_<name>' array per entry of the scoring mapping.
results = cross_validate(
    reg,
    X_train,
    y_train,
    scoring={'r2': 'r2', 'RMSE': 'neg_root_mean_squared_error'},
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)
print(type(results))

<class 'dict'>


In [10]:
# Print the across-fold mean of every entry in the cross_validate dict (timings
# included). 'test_RMSE' was produced by the 'neg_root_mean_squared_error'
# scorer, so its mean is reported with a negative sign.
for metric in results:
    result = results[metric]
    print(f'Los resultados obtenidos para la métrica  {metric}, en promedio es: \t--->\t {result.mean()}')

Los resultados obtenidos para la métrica  fit_time, en promedio es: 	--->	 0.0015951633453369141
Los resultados obtenidos para la métrica  score_time, en promedio es: 	--->	 0.0009975433349609375
Los resultados obtenidos para la métrica  test_r2, en promedio es: 	--->	 0.7184784187192004
Los resultados obtenidos para la métrica  test_RMSE, en promedio es: 	--->	 -4.866279636267083


## Cross_val_score

Cross_val_score es un método más simple que el anterior, pues únicamente devuelve un array con los scores de la métrica que le hemos dado.

- **SOLO ACEPTA UNA MÉTRICA**
- **DEVUELVE UN ARRAY** con los valores de cada fold

In [11]:
# Same 5-fold scheme as before, but scoring a single metric (r2); the call
# returns an array with one score per fold.
res = cross_val_score(
    reg,
    X_train,
    y_train,
    scoring='r2',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)

print(type(res))

<class 'numpy.ndarray'>


In [12]:
# One r2 score per cross-validation fold (five folds were requested).
print(res)

[0.6724828  0.76007223 0.58945002 0.75917498 0.81121207]


## Cross_val_predict

Esta función devuelve un array con todas las predicciones realizadas en cada fold sobre su conjunto de datos de test; devolverá tantas predicciones como instancias le pasemos.



In [13]:
# Collect the cross-validated predictions for the training split: one
# prediction is returned for every row passed in.
predictions = cross_val_predict(
    reg,                                                  # the model
    X_train,                                              # training features
    y_train,                                              # training target
    cv=KFold(n_splits=5, shuffle=True, random_state=42),  # fold definition
    method='predict',                                     # estimator method to call; predict_proba, decision_function, etc. can be requested instead
)

print(type(predictions))

<class 'numpy.ndarray'>


In [14]:
# Target shape first, prediction shape second: there should be one prediction per training row.
print(y_train.shape, predictions.shape, sep='\n')

(404,)
(404,)


In [15]:
print(predictions[:10])             # the first 10 predictions

[11.00535883 19.26968635 23.31876459 11.57976829 18.39189108 24.62194264
 21.06343467 23.78549695  7.81642776 20.23608684]


# CLASIFICACIÓN

## cargando el dataset

In [16]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score

In [17]:
# Load the iris dataset from sklearn.datasets; the next cell reads its .data and .target.
iris = datasets.load_iris()

In [18]:
# Feature matrix and class labels of the iris dataset.
X, y = iris.data, iris.target

# Feature-matrix shape first, then label-vector shape.
print(X.shape, y.shape, sep='\n')

(150, 4)
(150,)


## separando test y train

In [19]:
# Hold-out split for the classification data: 20% test, shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## escalando los datos

In [20]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to both the training and the test split.
scl = StandardScaler().fit(X_train)

X_train, X_test = scl.transform(X_train), scl.transform(X_test)

## el modelo

In [31]:
from sklearn.svm import SVC

In [32]:
# RBF-kernel support vector classifier; the classification cross-validation cells reuse it.
clas = SVC(kernel='rbf', C=0.1, gamma='auto')

## Cross_val_score

In [37]:
# 5-fold cross-validation of the classifier on the training split, scored with
# macro-averaged F1; n_jobs=-1 is passed so the folds can be evaluated in parallel.
clasification_results = cross_val_score(
    clas,
    X_train,
    y_train,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='f1_macro',
    n_jobs=-1,
)

# Result type first, then the per-fold scores.
print(type(clasification_results), clasification_results, sep='\n')

<class 'numpy.ndarray'>
[1.         0.88854489 0.77777778 0.76388889 0.96023392]


## cross_validate

In [42]:
# Multi-metric cross-validation of the classifier: macro F1 and accuracy per fold,
# returned in a dict alongside the fit/score timings.
clas_validate = cross_validate(
    clas,
    X_train,
    y_train,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring={'f1': 'f1_macro', 'accuracy': 'accuracy'},
    n_jobs=-1,
)

print(type(clas_validate))
# Across-fold mean of every entry in the result dict (timings included).
for metric in clas_validate:
    result = clas_validate[metric]
    print(f'Los resultados obtenidos para la métrica  {metric}, en promedio es: \t \t ---> \t {result.mean()}')

<class 'dict'>
Los resultados obtenidos para la métrica  fit_time, en promedio es: 	 	 ---> 	 0.001990222930908203
Los resultados obtenidos para la métrica  score_time, en promedio es: 	 	 ---> 	 0.002991056442260742
Los resultados obtenidos para la métrica  test_f1, en promedio es: 	 	 ---> 	 0.8780890952872376
Los resultados obtenidos para la métrica  test_accuracy, en promedio es: 	 	 ---> 	 0.8833333333333334


## cross_val_predict

In [44]:
# Cross-validated class predictions for the training split, using the same
# seeded 5-fold scheme as the scoring cells.
y_pred = cross_val_predict(
    estimator=clas,
    X=X_train,
    y=y_train,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)
print(y_pred)  # the predictions

[0 0 1 0 0 1 1 0 0 0 2 1 1 0 0 1 1 2 1 2 1 2 1 0 2 1 0 0 0 1 1 0 0 0 1 0 1
 2 0 1 2 0 1 1 1 1 2 1 0 1 2 0 0 1 2 0 2 0 0 1 1 2 1 1 2 1 0 0 1 2 0 0 0 1
 2 0 2 2 0 1 1 1 2 2 0 2 1 2 1 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 2 2 2 1 2 1
 1 1 1 0 1 1 0 1 2]
