# Autoencoder para reducción de dimensión del dataset
Se buscará reducir el número de columnas del dataset: de 1000 columnas a 100.

In [1]:
# Pandas - Numpy (import data, manipulation)
import pandas as pd
import numpy as np

# Scikit-learn (train|test split, scaler)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# TensorFlow - Keras (Model, metrics, transformations and layers)
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.metrics import RootMeanSquaredError,MeanAbsoluteError
from tensorflow.keras.models import Model
from keras.utils.np_utils import to_categorical   

In [2]:
# Load the tab-separated dataset produced by the data-cleaning section.
data_path = "Data/data_final.txt"
data = pd.read_csv(data_path, sep="\t")
data.head()

Unnamed: 0,name,spect_b,0.45,0.452002002002002,0.45400400400400404,0.456006006006006,0.458008008008008,0.46001001001001,0.46201201201201203,0.46401401401401404,...,2.431981981981982,2.433983983983984,2.4359859859859863,2.437987987987988,2.43998998998999,2.441991991991992,2.443993993993994,2.4459959959959963,2.447997997997998,2.45
0,Abehiroshi,V,0.9765,0.873727,0.802019,0.908693,0.992507,0.892887,0.806158,0.822114,...,1.453882,1.428696,1.286868,1.095002,0.999979,1.100082,1.281601,1.382604,1.370983,1.3453
1,Senrikyu,B,1.1423,1.104633,1.062175,0.996224,0.949799,1.010879,1.060699,1.024171,...,0.991043,0.997508,1.00128,1.001278,1.000005,0.99957,0.999831,1.000092,1.000091,1.0
2,Robinson,Sr,0.8445,0.770504,0.786651,0.801385,0.780085,0.812111,0.815159,0.828218,...,1.696828,1.687543,1.699685,1.727674,1.741618,1.724284,1.714307,1.760465,1.81774,1.7467
3,Paris,T,0.9238,0.935045,0.928104,0.933731,0.943692,0.937315,0.953128,0.96719,...,2.056997,1.990262,1.979909,2.040457,1.958246,1.594457,1.15588,0.92645,0.944271,1.0
4,Farinella,D,0.8795,0.953828,0.956625,0.849646,0.784283,0.867913,0.938608,0.909334,...,1.48228,1.065135,1.07722,1.503595,1.945946,2.089067,2.025298,1.945494,1.926199,1.9289


In [3]:
# Name of the column holding the class label of each row.
target = ["spect_b"]
# Distinct class labels, in order of first appearance in the dataset.
target_values = data[target]["spect_b"].unique()
# One integer code per distinct label: 0 .. len(target_values) - 1.
target_numeric = np.arange(len(target_values))
# Lookup table from class label to its integer code.
dic_target = dict(zip(target_values, target_numeric))

In [4]:
# Append the integer-coded class label as the last column.
data["spect_b_numeric"] = [dic_target[label] for label in data["spect_b"]]
# Remove the text columns so that only the numeric class column remains (at the end).
data = data.drop(columns=["name", "spect_b"])
data.head()

Unnamed: 0,0.45,0.452002002002002,0.45400400400400404,0.456006006006006,0.458008008008008,0.46001001001001,0.46201201201201203,0.46401401401401404,0.46601601601601605,0.468018018018018,...,2.433983983983984,2.4359859859859863,2.437987987987988,2.43998998998999,2.441991991991992,2.443993993993994,2.4459959959959963,2.447997997997998,2.45,spect_b_numeric
0,0.9765,0.873727,0.802019,0.908693,0.992507,0.892887,0.806158,0.822114,0.82498,0.798456,...,1.428696,1.286868,1.095002,0.999979,1.100082,1.281601,1.382604,1.370983,1.3453,0
1,1.1423,1.104633,1.062175,0.996224,0.949799,1.010879,1.060699,1.024171,1.0308,1.106784,...,0.997508,1.00128,1.001278,1.000005,0.99957,0.999831,1.000092,1.000091,1.0,1
2,0.8445,0.770504,0.786651,0.801385,0.780085,0.812111,0.815159,0.828218,0.823612,0.821009,...,1.687543,1.699685,1.727674,1.741618,1.724284,1.714307,1.760465,1.81774,1.7467,2
3,0.9238,0.935045,0.928104,0.933731,0.943692,0.937315,0.953128,0.96719,0.969832,0.959735,...,1.990262,1.979909,2.040457,1.958246,1.594457,1.15588,0.92645,0.944271,1.0,3
4,0.8795,0.953828,0.956625,0.849646,0.784283,0.867913,0.938608,0.909334,0.893468,0.906256,...,1.065135,1.07722,1.503595,1.945946,2.089067,2.025298,1.945494,1.926199,1.9289,4


In [5]:
data.shape # 160 asteroids: 1000 wavelength columns plus one numeric category column.

(160, 1001)

In [6]:
# Number of samples per category; the counts are strongly imbalanced.
data["spect_b_numeric"].value_counts()

7     40
18    19
19    13
8     12
17     8
1      8
20     7
11     6
3      6
5      5
14     5
21     4
6      4
15     4
2      3
9      3
0      2
16     2
13     2
12     2
10     2
4      2
22     1
Name: spect_b_numeric, dtype: int64

In [7]:
# Keep only the asteroids that belong to the six selected category codes.
# The class letters in the trailing comments are the original author's annotations.
category_code = data.spect_b_numeric
keep_rows = (
    (category_code == 7)     # "S"
    | (category_code == 18)  # "Ch"
    | (category_code == 19)  # "X"
    | (category_code == 8)   # "C"
    | (category_code == 1)   # "B"
    | (category_code == 17)  # "Sl"
)
df_filter = data[keep_rows]
df_filter # Restricting to these categories limits the class imbalance

Unnamed: 0,0.45,0.452002002002002,0.45400400400400404,0.456006006006006,0.458008008008008,0.46001001001001,0.46201201201201203,0.46401401401401404,0.46601601601601605,0.468018018018018,...,2.433983983983984,2.4359859859859863,2.437987987987988,2.43998998998999,2.441991991991992,2.443993993993994,2.4459959959959963,2.447997997997998,2.45,spect_b_numeric
1,1.1423,1.104633,1.062175,0.996224,0.949799,1.010879,1.060699,1.024171,1.030800,1.106784,...,0.997508,1.001280,1.001278,1.000005,0.999570,0.999831,1.000092,1.000091,1.0000,1
9,0.8460,0.806046,0.806720,0.844336,0.859028,0.837936,0.842385,0.856531,0.857605,0.849033,...,1.530926,1.507761,1.479249,1.483332,1.535000,1.552095,1.433121,1.200224,1.0000,7
10,0.8148,0.783991,0.832539,0.851162,0.814888,0.831912,0.846885,0.857243,0.858247,0.855224,...,1.645403,1.720694,1.774373,1.672133,1.350816,1.047267,1.061254,1.384984,1.7011,7
11,0.9271,0.914190,0.932000,0.948292,0.950620,0.976259,0.973121,0.990699,1.013024,0.985030,...,1.165133,1.106337,1.033984,0.999985,1.039089,1.112172,1.162763,1.166991,1.1338,8
12,0.8547,0.852304,0.850983,0.857229,0.858517,0.859017,0.865424,0.871978,0.881150,0.889311,...,1.359801,1.347493,1.334237,1.344745,1.385816,1.388723,1.271191,1.081631,1.0000,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,0.8446,0.852892,0.855081,0.855628,0.860038,0.866210,0.864740,0.869041,0.876262,0.881840,...,1.311641,1.247583,1.123434,1.000504,0.935129,0.953418,1.070645,1.228319,1.2937,7
154,0.8858,0.905882,0.861419,0.850516,0.886861,0.880098,0.863066,0.879971,0.890077,0.882779,...,1.065867,0.958425,0.941561,0.999554,1.108919,1.227369,1.308741,1.327765,1.2802,7
155,0.8504,0.854994,0.850766,0.856934,0.871354,0.867807,0.877338,0.884674,0.893962,0.901906,...,1.300946,1.305340,1.307126,1.306905,1.305947,1.307715,1.315945,1.327674,1.3332,7
158,0.9835,0.974798,0.991299,0.998091,0.984631,0.985900,0.981809,0.981263,0.990585,0.994514,...,0.878581,0.877612,0.874644,0.872900,0.874630,0.877601,0.878572,0.877348,0.8768,1


Debido al sesgo en el número de muestras por categoría (varias categorías tienen menos de 8 muestras), se optó por seleccionar únicamente las 6 categorías con mayor cantidad de muestras.

In [8]:
df_filter.shape # The filter reduces the data from 160 to 100 asteroids.

(100, 1001)

In [9]:
# The last column of the filtered frame holds the numeric category code.
label_column = df_filter.to_numpy()[:, -1]
y = label_column.astype(np.int32)
np.unique(y) # distinct category codes that survived the filter

array([ 1,  7,  8, 17, 18, 19])

Es necesario convertir estos números a valores consecutivos entre 0 y 5 (una etiqueta por cada una de las 6 clases) para realizar el one-hot encoding.

In [10]:
# Relabel the category codes as consecutive integers 0..n_classes-1, which is
# what the one-hot encoding expects.
#
# np.unique returns the sorted distinct codes and, with return_inverse=True,
# the position of every element of y inside that sorted array; that position
# is the new label. For the codes 1, 7, 8, 17, 18, 19 this gives
# 1->0, 7->1, 8->2, 17->3, 18->4, 19->5.
#
# A chain of hard-coded np.where replacements would give the same mapping on a
# first run, but it is not idempotent: re-running the cell would map the
# already-relabelled class 1 to 0 and merge two classes. This version can be
# re-run safely and does not depend on the specific code values.
y = np.unique(y, return_inverse=True)[1].astype(np.int32)

In [11]:
# One-hot encode the class labels for the 6 classes.
y_one_hot = to_categorical(y, num_classes=6)
# Persist the encoded labels so they can be reused later.
pd.DataFrame(data=y_one_hot).to_csv("Data/y_data.csv", index=False)

In [12]:
# Every column except the last one (the numeric category) is a wavelength feature.
all_values = df_filter.to_numpy()
X = all_values[:, :-1]
# Persist the feature matrix so it can be reused later.
pd.DataFrame(data=X).to_csv("Data/X_data.csv", index=False)

In [13]:
# Hold out 20% of the samples for testing; random_state=500 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_one_hot,
    test_size=0.2,
    random_state=500,
)
# Report the shape of every split.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(80, 1000)
(80, 6)
(20, 1000)
(20, 6)


In [14]:
# Persist the one-hot label splits for later use.
for csv_path, labels in (("Data/y_train.csv", y_train), ("Data/y_test.csv", y_test)):
    pd.DataFrame(labels).to_csv(csv_path, index=False)

Se utiliza StandardScaler porque, según las referencias consultadas, es un escalador adecuado para problemas de clasificación; por ejemplo, fue el utilizado en el proyecto de MNIST.

In [15]:
# Standardize the predictors (zero mean, unit variance per column).
scaler = StandardScaler()
# Learn the per-column mean and standard deviation from the training set only,
# then scale the training set with them.
X_train_scaled = scaler.fit_transform(X_train)
# Scale the test set with the statistics learned from the training set.
# Calling fit_transform here would refit the scaler on the test data, so train
# and test would be scaled differently and test information would leak into
# the preprocessing.
X_test_scaled = scaler.transform(X_test)

In [16]:
# Hyperparameters read as module-level globals by trainAutoencoder.
encoding_dim = 100 # Width of the bottleneck layer, i.e. the number of columns kept after the reduction.
batch_size = 32 # Batch size passed to fit().
epochs = 100 # Number of epochs passed to fit().
Metrics = [RootMeanSquaredError(name='rms'), MeanAbsoluteError(name='mae')] # Default metrics tracked while training the autoencoder.


def trainAutoencoder(Xtrain, metrics = Metrics):
    """Train a dense autoencoder on ``Xtrain`` and return it with its encoder half.

    Architecture: input -> Dense(2 * encoding_dim, relu)
    -> Dense(encoding_dim, relu) [bottleneck] -> Dense(2 * encoding_dim, relu)
    -> Dense(n_features, linear). The network is trained to reproduce its own
    input using SGD and mean squared error.

    ``encoding_dim``, ``batch_size`` and ``epochs`` are read from module-level
    globals at call time.

    Parameters
    ----------
    Xtrain : array whose last axis holds the features
        Training data; used both as input and as reconstruction target.
    metrics : Keras metric or list/tuple of Keras metrics
        Metrics tracked during training.

    Returns
    -------
    (autoencoder, encoder)
        ``autoencoder`` is the full trained model. ``encoder`` shares the
        trained layers and maps an input to its bottleneck representation.
    """
    n_features = Xtrain.shape[-1]  # number of input columns
    input_ = Input(shape=(n_features,))
    encoded = Dense(units=encoding_dim*2, activation="relu")(input_)
    bottleneck = Dense(units=encoding_dim, activation="relu")(encoded)
    decoded = Dense(units=encoding_dim*2, activation="relu")(bottleneck)
    # Linear output: the reconstruction target is the input itself, which after
    # standardization contains negative values and values above 1. A sigmoid
    # output is confined to (0, 1) and could never reach those targets.
    output = Dense(units=n_features, activation="linear")(decoded)

    # Accept a single metric or a list of metrics and pass compile() a flat
    # list rather than a list nested inside another list.
    if isinstance(metrics, (list, tuple)):
        metric_list = list(metrics)
    else:
        metric_list = [metrics]

    autoencoder = Model(inputs=input_, outputs=output)
    autoencoder.compile(optimizer='sgd', loss='mean_squared_error', metrics=metric_list)
    # Input and target are the same array: the model learns to reconstruct X.
    autoencoder.fit(Xtrain, Xtrain, batch_size=batch_size, epochs=epochs, verbose=0)

    # Sub-model that stops at the bottleneck; it reuses the layers trained above.
    encoder = Model(inputs=input_, outputs=bottleneck)
    return autoencoder, encoder


In [17]:
# Train on the scaled training set; keep only the encoder model (named `encoded` here).
_, encoded = trainAutoencoder(X_train_scaled)
# Reduced representation of the test and training sets.
data_test_final = encoded.predict(X_test_scaled)
data_train_final = encoded.predict(X_train_scaled)
# Persist both reduced sets (test first, then train).
for reduced, csv_path in (
    (data_test_final, "Data/X_test_autoencoded1.csv"),
    (data_train_final, "Data/X_data_autoencoded1.csv"),
):
    pd.DataFrame(reduced).to_csv(csv_path, index=False)

In [18]:
# Print the shape of each reduced set (test first, then train) to check the column count after encoding.
for reduced in (data_test_final, data_train_final):
    print(pd.DataFrame(reduced).shape)

(20, 100)
(80, 100)
