In [1]:
pip install pandas scikit-learn



Cargamos las librerías necesarias

In [2]:
import pandas as pd
import sklearn as skl

Cargamos el fichero del dataset

In [3]:
# Location of the Stars.csv file in the tutorial's GitHub repository.
url = "https://raw.githubusercontent.com/AprendeConEjemplos/aprendizaje-automatico-con-scikit-learn/main/04_Preprocesamiento/Stars.csv"

# Download the CSV into a DataFrame and show its truncated text representation.
dataframe = pd.read_csv(filepath_or_buffer=url)
print(dataframe)

     Temperature              L          R    A_M  Color Spectral_Class  Type
0           3068       0.002400     0.1700  16.12    Red              M     0
1           3042       0.000500     0.1542  16.60    Red              M     0
2           2600       0.000300     0.1020  18.70    Red              M     0
3           2800       0.000200     0.1600  16.65    Red              M     0
4           1939       0.000138     0.1030  20.06    Red              M     0
..           ...            ...        ...    ...    ...            ...   ...
235        38940  374830.000000  1356.0000  -9.93   Blue              O     5
236        30839  834042.000000  1194.0000 -10.63   Blue              O     5
237         8829  537493.000000  1423.0000 -10.73  White              A     5
238         9235  404940.000000  1112.0000 -11.23  White              A     5
239        37882  294903.000000  1783.0000  -7.80   Blue              O     5

[240 rows x 7 columns]


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print(dataframe.describe())

        Temperature              L            R         A_M        Type
count    240.000000     240.000000   240.000000  240.000000  240.000000
mean   10497.462500  107188.361635   237.157781    4.382396    2.500000
std     9552.425037  179432.244940   517.155763   10.532512    1.711394
min     1939.000000       0.000080     0.008400  -11.920000    0.000000
25%     3344.250000       0.000865     0.102750   -6.232500    1.000000
50%     5776.000000       0.070500     0.762500    8.313000    2.500000
75%    15055.500000  198050.000000    42.750000   13.697500    4.000000
max    40000.000000  849420.000000  1948.500000   20.060000    5.000000


In [5]:
# Separate the target column "Type" from the remaining feature columns.
# `label` is an independent copy; `dataset` is a new frame without "Type".
label = dataframe["Type"].copy()
dataset = dataframe.drop(columns=["Type"])

In [6]:
# Three alternative ways of handling null values in "Temperature".
# Each option builds a new DataFrame and leaves `dataframe` untouched.
dataframe_op1 = dataframe.dropna(subset=["Temperature"])    # Option 1: drop the instances that have a null value
dataframe_op2 = dataframe.drop("Temperature", axis=1)       # Option 2: drop the attribute that contains null values
mean_temp = dataframe["Temperature"].mean()
# Option 3: replace the nulls with the column mean. Filling through the frame
# (instead of through the single column) keeps every other attribute, so the
# result is a DataFrame like options 1 and 2 rather than a lone Series.
dataframe_op3 = dataframe.fillna({"Temperature": mean_temp})

Iniciamos el preprocesamiento de los atributos con valores de texto

In [7]:
# Keep each text attribute as a one-column DataFrame (list selector), which is
# the 2-D input shape the scikit-learn encoders used below are given.
color_cat = dataframe.loc[:, ['Color']]
spectral_cat = dataframe.loc[:, ['Spectral_Class']]

# Show the first ten rows of both attributes, one frame after the other.
print(color_cat.head(10), spectral_cat.head(10), sep="\n")

  Color
0   Red
1   Red
2   Red
3   Red
4   Red
5   Red
6   Red
7   Red
8   Red
9   Red
  Spectral_Class
0              M
1              M
2              M
3              M
4              M
5              M
6              M
7              M
8              M
9              M


Importamos la funcionalidad de Scikit-learn

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Learn the set of colour labels, then replace each label by its numeric code.
ordinal_encoder = OrdinalEncoder().fit(color_cat)
color_cat_encoded = ordinal_encoder.transform(color_cat)

# Show the codes of the first ten instances.
print(color_cat_encoded[:10])

[[8.]
 [8.]
 [8.]
 [8.]
 [8.]
 [8.]
 [8.]
 [8.]
 [8.]
 [8.]]


In [9]:
# Categories learned by the encoder; a label's position in this array is the
# numeric code assigned to it by the ordinal encoding.
print(ordinal_encoder.categories_)

[array(['Blue', 'Blue White', 'Blue white', 'Blue-White', 'Blue-white',
       'Orange', 'Orange-Red', 'Pale yellow orange', 'Red', 'White',
       'White-Yellow', 'Whitish', 'Yellowish', 'Yellowish White', 'white',
       'yellow-white', 'yellowish'], dtype=object)]


Importamos lo necesario para realizar el One Hot Encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the colour attribute; the result is a sparse matrix.
one_hot_encoder = OneHotEncoder()
color_cat_one_hot = one_hot_encoder.fit_transform(color_cat)
print(color_cat_one_hot)  # sparse listing of the non-zero (row, column) entries

# Densify once and reuse the array for both the shape and the full print.
color_cat_dense = color_cat_one_hot.toarray()
print(color_cat_dense.shape)
print(color_cat_dense)

  (0, 8)	1.0
  (1, 8)	1.0
  (2, 8)	1.0
  (3, 8)	1.0
  (4, 8)	1.0
  (5, 8)	1.0
  (6, 8)	1.0
  (7, 8)	1.0
  (8, 8)	1.0
  (9, 8)	1.0
  (10, 8)	1.0
  (11, 8)	1.0
  (12, 8)	1.0
  (13, 8)	1.0
  (14, 8)	1.0
  (15, 8)	1.0
  (16, 8)	1.0
  (17, 8)	1.0
  (18, 8)	1.0
  (19, 8)	1.0
  (20, 1)	1.0
  (21, 9)	1.0
  (22, 9)	1.0
  (23, 9)	1.0
  (24, 1)	1.0
  :	:
  (215, 0)	1.0
  (216, 4)	1.0
  (217, 4)	1.0
  (218, 4)	1.0
  (219, 15)	1.0
  (220, 0)	1.0
  (221, 0)	1.0
  (222, 0)	1.0
  (223, 0)	1.0
  (224, 0)	1.0
  (225, 0)	1.0
  (226, 0)	1.0
  (227, 0)	1.0
  (228, 0)	1.0
  (229, 0)	1.0
  (230, 4)	1.0
  (231, 0)	1.0
  (232, 0)	1.0
  (233, 4)	1.0
  (234, 4)	1.0
  (235, 0)	1.0
  (236, 0)	1.0
  (237, 9)	1.0
  (238, 9)	1.0
  (239, 0)	1.0
(240, 17)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


Ejemplos de normalización de valores de atributos


In [11]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the "L" column with MinMaxScaler's default settings.
min_max_scaler = MinMaxScaler()
l_values = dataframe[['L']]
# fit() returns the fitted scaler itself, not the data, so fit_transform() is
# used to make `scaled_values` actually hold the rescaled column.
scaled_values = min_max_scaler.fit_transform(l_values)
print(scaled_values[0:10])

[[2.73127546e-09]
 [4.94455040e-10]
 [2.59000259e-10]
 [1.41272869e-10]
 [6.82818865e-11]
 [6.71046126e-10]
 [7.65228038e-10]
 [3.76727649e-10]
 [7.18137082e-10]
 [1.17727390e-10]]


Ejemplo de estandarización

In [12]:
from sklearn.preprocessing import StandardScaler

# Standardize the "L" column with StandardScaler's default settings.
standard_scaler = StandardScaler()
l_values = dataframe[['L']]
# fit() returns the fitted scaler itself, not the data, so fit_transform() is
# used to make `scaled_values` actually hold the standardized column.
scaled_values = standard_scaler.fit_transform(l_values)
print(scaled_values[0:10])

[[-0.59862356]
 [-0.59862357]
 [-0.59862357]
 [-0.59862357]
 [-0.59862357]
 [-0.59862357]
 [-0.59862357]
 [-0.59862357]
 [-0.59862357]
 [-0.59862357]]


Creación del pipeline completo

In [13]:
from sklearn.compose import ColumnTransformer

# Columns routed to each preprocessing step.
num_attrs = ["Temperature", "L", "R"]
text_attrs = ["Color", "Spectral_Class"]
# NOTE(review): "A_M" appears in neither list, so with ColumnTransformer's
# default remainder="drop" it is left out of the output - confirm intended.

# Standardize the numeric columns and one-hot encode the text columns.
pipeline = ColumnTransformer(
    transformers=[
        ("numeric", StandardScaler(), num_attrs),
        ("text", OneHotEncoder(), text_attrs),
    ]
)

# NOTE(review): the transformer is fitted on every row of `dataset`, i.e.
# before any train/test partition - confirm this is acceptable for the demo.
preprocessed_dataset = pipeline.fit_transform(dataset)

Creamos los conjuntos de entrenamiento y test

In [14]:
from sklearn.model_selection import train_test_split

# Partition the preprocessed data into a training set and a test set (20%).
# The fixed random_state makes the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_dataset,
    label,
    test_size=0.2,
    random_state=42,
)

Entrenamos y mostramos el resultado

In [15]:
from sklearn.svm import SVC

# Create the SVM classifier with its default settings (SVC() defaults to an
# RBF kernel, not a linear one) and train it; fit() returns the estimator.
classifier = SVC().fit(X_train, y_train)

# Print the model's accuracy on the test set.
print(classifier.score(X_test, y_test))

0.8333333333333334
