# Escalado de variables

## Cómo importar las librerías


In [145]:
import numpy as np
import pandas as pd

# Establecer opciones de impresión para NumPy
np.set_printoptions(precision=3, suppress=True, formatter={'float_kind': '{:0.3f}'.format})

## Importar el data set


In [146]:
dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values


## Codificar datos categóricos

In [148]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])
labelEncoder_X = LabelEncoder()
x[:, 0] = labelEncoder_X.fit_transform(x[:, 0])
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
)
x = np.array(ct.fit_transform(x), dtype=np.float16)
labelEncoder_y = LabelEncoder()
y = labelEncoder_y.fit_transform(y)


display(x)

array([[0.000, 1.000, 0.000, 0.000, 44.000, inf],
       [1.000, 0.000, 0.000, 1.000, 27.000, 48000.000],
       [1.000, 0.000, 1.000, 0.000, 30.000, 54016.000],
       [1.000, 0.000, 0.000, 1.000, 38.000, 60992.000],
       [1.000, 0.000, 1.000, 0.000, 40.000, 63776.000],
       [0.000, 1.000, 0.000, 0.000, 35.000, 57984.000],
       [1.000, 0.000, 0.000, 1.000, 38.781, 52000.000],
       [0.000, 1.000, 0.000, 0.000, 48.000, inf],
       [1.000, 0.000, 1.000, 0.000, 50.000, inf],
       [0.000, 1.000, 0.000, 0.000, 37.000, inf]], dtype=float16)

## Escalado de variables

In [149]:
from sklearn.preprocessing import StandardScaler
subset = x[:, 3:5]
scaler = StandardScaler()
scaler.fit(subset)
scaled_subset = scaler.transform(subset)
x[:, 3:5] = scaled_subset

print("original:")
display(x)


original:


array([[0.000, 1.000, 0.000, -0.655, 0.759, inf],
       [1.000, 0.000, 0.000, 1.528, -1.712, 48000.000],
       [1.000, 0.000, 1.000, -0.655, -1.276, 54016.000],
       [1.000, 0.000, 0.000, 1.528, -0.113, 60992.000],
       [1.000, 0.000, 1.000, -0.655, 0.177, 63776.000],
       [0.000, 1.000, 0.000, -0.655, -0.549, 57984.000],
       [1.000, 0.000, 0.000, 1.528, 0.000, 52000.000],
       [0.000, 1.000, 0.000, -0.655, 1.340, inf],
       [1.000, 0.000, 1.000, -0.655, 1.630, inf],
       [0.000, 1.000, 0.000, -0.655, -0.258, inf]], dtype=float16)

## Dividir el data set 
En conjunto de entrenamiento y conjunto de testing

In [150]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
display(x_train)
display(x_test)

array([[1.000, 0.000, 1.000, -0.655, 0.177, 63776.000],
       [0.000, 1.000, 0.000, -0.655, -0.258, inf],
       [1.000, 0.000, 0.000, 1.528, -1.712, 48000.000],
       [1.000, 0.000, 0.000, 1.528, 0.000, 52000.000],
       [0.000, 1.000, 0.000, -0.655, 1.340, inf],
       [1.000, 0.000, 0.000, 1.528, -0.113, 60992.000],
       [0.000, 1.000, 0.000, -0.655, 0.759, inf],
       [0.000, 1.000, 0.000, -0.655, -0.549, 57984.000]], dtype=float16)

array([[1.000, 0.000, 1.000, -0.655, -1.276, 54016.000],
       [1.000, 0.000, 1.000, -0.655, 1.630, inf]], dtype=float16)