<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Keras Clasificación

# Redes neuronales de una sola capa oculta (ANN)

In [None]:
import os
import platform

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

from  sklearn import  datasets

## 1 - Perceptrón & Clasificación binaria

In [None]:
# Toy dataset with four rows: two feature columns ("Age", "cholesterol")
# and a binary "drug" column.
drug_rows = [
    (0, 0, 0),
    (1, 1, 1),
    (2, 1, 1),
    (0, 0, 0),
]
df_drugs = pd.DataFrame(drug_rows, columns=["Age", "cholesterol", "drug"])
df_drugs

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale both feature columns to the [0, 1] range on a copy of the data.
# MinMaxScaler scales every column independently, so a single fit over both
# features yields the same values as one fit per column, while leaving
# `scaler` fitted on all the features instead of only on the last one.
feature_columns = ['Age', 'cholesterol']
scaler = MinMaxScaler()
df_norm = df_drugs.copy()
# Assign with `df[...] = ...` rather than `.loc[:, col] = ...`: the integer
# columns are replaced by float columns instead of having floats written in
# place into an int column, which recent pandas versions deprecate.
df_norm[feature_columns] = scaler.fit_transform(df_norm[feature_columns])
df_norm.head()

In [None]:
# Feature matrix: every column except the label. Target vector: the label column.
label_column = 'drug'
X = df_norm.drop(columns=label_column).to_numpy()
y = df_norm[label_column].to_numpy()

In [None]:
from tensorflow.keras.layers import Dense

# Perceptron: a single sigmoid unit that receives the two input features.
perceptron_layer = Dense(units=1, activation='sigmoid', input_shape=(2,))
model = Sequential([perceptron_layer])
model.summary()

In [None]:
# Configure training (Adam with a 0.5 learning rate, binary cross-entropy
# loss, accuracy as the reported metric) and fit for 10 epochs.
adam = tf.keras.optimizers.Adam(learning_rate=0.5)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

history = model.fit(x=X, y=y, epochs=10)

In [None]:
# Inspect the weights associated with the trained model
model.get_weights()

In [None]:
# Evaluate the model on (X, y) and keep its accuracy.
# NOTE(review): index 1 is expected to be the accuracy because the model is
# compiled with a single metric after the loss; confirm if metrics change.
evaluation = model.evaluate(X, y)
accuracy = evaluation[1]

## 2 - Red neuronal (ANN) & clasificación multicategórica

### `Penguins dataset`:
El dataset **`Penguins`** es una alternativa al clásico dataset **`iris`** y se utiliza para clasificación multicategórica (3 especies de pingüinos). Cada especie se caracteriza por su tamaño, como podrá ver en el dataset.<br> [Dataset source](https://www.kaggle.com/parulpandey/penguin-dataset-the-new-iris/data)

In [None]:
# Download the penguins dataset only when it is not already present in the
# working directory. The download is done from Python, so the same code runs
# on every operating system without relying on curl or wget being installed,
# and an HTTP error raises an exception instead of being saved as the CSV.
import urllib.request

PENGUINS_CSV = 'penguins_dataset.csv'
PENGUINS_URL = 'https://raw.githubusercontent.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/main/datasets/penguins_dataset.csv'

if not os.path.exists(PENGUINS_CSV):
    urllib.request.urlretrieve(PENGUINS_URL, PENGUINS_CSV)

In [None]:
# Load the penguins CSV into a DataFrame and preview its first rows
df2 = pd.read_csv("penguins_dataset.csv")
df2.head()

In [None]:
# Analyse missing values: take the `describe()` summary and append two rows
# with the count and the percentage of NaN entries per column.
missing_mask = df2.isna()
des = df2.describe()
des.loc['Nan'] = missing_mask.sum()
des.loc['%Nan'] = missing_mask.mean() * 100
des

In [None]:
# Keep the species label plus the four body measurements, drop every row that
# has a missing value and renumber the index from zero.
selected_columns = [
    "Species",
    "Culmen Length (mm)",
    "Culmen Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)",
]
df2_clean = df2[selected_columns].dropna().reset_index(drop=True)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Encode the species names as integer class ids in a new "target" column and
# drop the original text column.
le = LabelEncoder()
df2_norm = df2_clean.copy()
df2_norm["target"] = le.fit_transform(df2_norm['Species'])
df2_norm = df2_norm.drop(["Species"], axis=1)

# Standardise the four measurements. StandardScaler scales every column
# independently, so a single fit over all of them yields the same values as
# one fit per column, while leaving `scaler2` fitted on every feature instead
# of only on the last one.
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split done later in the notebook, so test-set statistics take
# part in the scaling; consider fitting on the training portion only.
penguin_features = [
    'Culmen Length (mm)',
    'Culmen Depth (mm)',
    'Flipper Length (mm)',
    'Body Mass (g)',
]
scaler2 = StandardScaler()
df2_norm[penguin_features] = scaler2.fit_transform(df2_norm[penguin_features])

df2_norm.head()

In [None]:
# Feature matrix: every column except the class id. Labels: the class ids
# converted with `to_categorical`.
target_column = "target"
X2 = df2_norm.drop(columns=target_column).to_numpy()
y2 = to_categorical(df2_norm[target_column].to_numpy())

In [None]:
# Dimensions of the feature matrix (rows, columns)
X2.shape

In [None]:
# Dimensions of the label matrix produced by `to_categorical` (rows, columns)
y2.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. A constant `random_state` makes the
# dataset split the same way on every run, so experiments can be repeated.
split_options = dict(test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, **split_options)

In [None]:
from tensorflow.keras.layers import Dense

# Small network: 4 input features -> 3 sigmoid hidden units -> 3 softmax outputs.
hidden_layer = Dense(units=3, activation='sigmoid', input_shape=(4,))
output_layer = Dense(units=3, activation='softmax')
model2 = Sequential([hidden_layer, output_layer])
model2.summary()

In [None]:
# Configure training (Adam with default settings, categorical cross-entropy
# loss, accuracy as the reported metric).
model2.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(),
)

# Fit for 100 epochs, setting aside 20% of the training data for validation.
fit_options = dict(validation_split=0.2, epochs=100)
history2 = model2.fit(X2_train, y2_train, **fit_options)

In [None]:
# Plot the training and validation accuracy recorded for every epoch.
train_accuracy = history2.history['accuracy']
valid_accuracy = history2.history['val_accuracy']
epoch_count = range(1, len(train_accuracy) + 1)
for curve, curve_label in ((train_accuracy, 'train'), (valid_accuracy, 'valid')):
    sns.lineplot(x=epoch_count, y=curve, label=curve_label)
plt.show()