# Descripción del dataset original

## Carga del dataset

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier

In [None]:
# One-off conversion of the original ARFF file to CSV (kept for reference, not executed).
# NOTE(review): this writes '../eeg-eye-state.csv' while the loading cell reads
# './eeg-eye-state.csv' — confirm which location is intended.
#from scipy.io.arff import loadarff
# raw_data = loadarff('../eeg-eye-state.arff')
# dataset = pd.DataFrame(raw_data[0])
# dataset.to_csv('../eeg-eye-state.csv')
# # + remove the first (index) column

In [None]:
# Load the EEG eye-state CSV (path is relative to the working directory).
dataset = pd.read_csv('./eeg-eye-state.csv')

In [None]:
# Display the loaded DataFrame.
dataset

## Descripción de las variables

In [None]:
# Summary statistics of the dataset columns, as computed by DataFrame.describe().
dataset.describe()

In [None]:
# Column labels of the dataset (for a DataFrame, keys() is the column index).
dataset.columns

In [None]:
# Legend placement below the axes follows:
# https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot

# describe() statistics of every column except the target ('Class').
feature_columns = dataset.columns[dataset.columns != 'Class']
description = dataset.describe()[feature_columns]

fig, ax = plt.subplots()
ax.set_yscale('log')
# One line per column; the x axis holds the describe() statistics.
ax.plot(description, label=description.columns)

# Shrink the axes to 90% of their height and raise them by 10% so the
# legend fits underneath.
bbox = ax.get_position()
raised_bottom = bbox.y0 + bbox.height * 0.1
reduced_height = bbox.height * 0.9
ax.set_position([bbox.x0, raised_bottom, bbox.width, reduced_height])
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
    ncol=5,
    fancybox=True,
    shadow=True,
)
plt.show()

In [None]:
# Pairwise plots of every column of the dataset, coloured by 'Class'.
sns.set_theme(style="ticks")
sns.pairplot(dataset, hue="Class")
# Saved under its own file name: the outlier-filtered pairplot further down
# writes to './resultados/pairplot_variables.svg', so sharing that path would
# overwrite this figure as soon as that cell runs.
plt.savefig('./resultados/pairplot_todas_las_variables.svg', format='svg',
    dpi=400, bbox_inches='tight')

In [None]:
# Pairplot of a few attributes after discarding rows that contain an outlier.
sns.set_theme(style="ticks")

atrs = ['V3', 'V6', 'V9']

# A row is an outlier when any selected attribute lies more than three
# standard deviations away from that attribute's mean. The mask is built
# directly from the attribute values, so it does not depend on which values
# the 'Class' column happens to contain.
selected = dataset[atrs]
center, spread = selected.mean(), selected.std()
outliers = ((selected > center + 3 * spread) | (selected < center - 3 * spread)).any(axis=1)

sns.pairplot(dataset.loc[~outliers, [*atrs, 'Class']], hue="Class")
plt.savefig('./resultados/pairplot_variables.svg', format='svg',
    dpi=400, bbox_inches='tight')

In [None]:
# Pairwise correlation matrix of the columns (pandas' default method).
dataset.corr()

## Importancia de las variables

In [None]:
# https://stackabuse.com/get-feature-importances-for-random-forests-with-python-and-scikit-learn/

In [None]:
# Features: every column except 'Class'. Target: 'Class' cast to int.
x = dataset.drop(columns='Class')
y = dataset['Class'].astype(int)

In [None]:
# Fit a random forest on all features to obtain feature importances.
# random_state is fixed so that the importances are reproducible across
# kernel restarts; without it every run produces a different forest.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
_ = rf.fit(x, y)  # ~15 sec.

In [None]:
# Bar chart: one bar per input feature, height = importance from the fitted forest.
fig, ax = plt.subplots()
ax.bar(x=rf.feature_names_in_, height=rf.feature_importances_)
plt.show()

In [None]:
# Absolute value: what matters is how strong the correlation is, whether positive or negative.
correlation = dataset.corr().abs()
# Heatmap of the absolute correlation matrix.
sns.heatmap(correlation, cmap='coolwarm')

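Nos fijamos en las correlaciones más cercanas a 1. Como se ha tomado el valor absoluto, las correlaciones negativas fuertes (cercanas a -1) también aparecen como valores cercanos a 1.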

In [None]:
# Absolute-correlation sub-matrix restricted to V1, V9 and V13 (rows and columns).
group = ['V1', 'V9', 'V13']
correlation.loc[correlation.columns.isin(group), group]

In [None]:
# Absolute-correlation sub-matrix restricted to V4 and V7 (rows and columns).
group = ['V4', 'V7']
correlation.loc[correlation.columns.isin(group), group]

In [None]:
# Absolute-correlation sub-matrix restricted to V6 and V14 (rows and columns).
group = ['V6', 'V14']
correlation.loc[correlation.columns.isin(group), group]

Vemos que estos pares tienen una correlación de prácticamente 1:
- V1-V9
- V9-V13
- V1-V13
- V6-V14
- V4-V7

Por lo tanto podríamos prescindir de todos los atributos excepto uno de cada grupo:
- V1-V9-V13
- V4-V7
- V6-V14

Eliminamos V7, V9, V13 y V14.

In [None]:
# Copy of the dataset without V7, V9, V13 and V14.
# errors='ignore' keeps the original behaviour of silently skipping any of
# these names that is not present in the columns.
redundant_columns = ['V7', 'V9', 'V13', 'V14']
new_dataset = dataset.drop(columns=redundant_columns, errors='ignore')
new_dataset

In [None]:
# Features of the reduced dataset (everything but 'Class') and the int-cast target.
new_x = new_dataset.drop(columns='Class')
new_y = new_dataset['Class'].astype(int)

In [None]:
# Fit a random forest on the reduced feature set to obtain feature importances.
# random_state is fixed so that the importances are reproducible across
# kernel restarts; without it every run produces a different forest.
new_rf = RandomForestClassifier(n_estimators=300, random_state=42)
_ = new_rf.fit(new_x, new_y)  # ~15 sec.

In [None]:
# Bar chart: one bar per remaining feature, height = importance from the refitted forest.
fig, ax = plt.subplots()
ax.bar(x=new_rf.feature_names_in_, height=new_rf.feature_importances_)
plt.show()

In [None]:
# Legend placement below the axes follows:
# https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot

# describe() statistics of every column of the reduced dataset except 'Class'.
new_feature_columns = new_dataset.columns[new_dataset.columns != 'Class']
new_description = new_dataset.describe()[new_feature_columns]

fig, ax = plt.subplots()
ax.set_yscale('log')
# One line per column; the x axis holds the describe() statistics.
ax.plot(new_description, label=new_description.columns)

# Shrink the axes to 90% of their height and raise them by 10% so the
# legend fits underneath.
bbox = ax.get_position()
raised_bottom = bbox.y0 + bbox.height * 0.1
reduced_height = bbox.height * 0.9
ax.set_position([bbox.x0, raised_bottom, bbox.width, reduced_height])
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
    ncol=5,
    fancybox=True,
    shadow=True,
)
plt.show()

In [None]:
# Absolute correlation matrix of the reduced dataset, shown as a heatmap.
new_correlation = new_dataset.corr().abs()
sns.heatmap(new_correlation, cmap='coolwarm')

---

In [None]:
# Per-class kernel density estimates of the selected features, one panel per feature.
features = ['V6', 'V9', ]
fmin, fmax = 4000, 4800    # rows are kept only when features[0] lies strictly inside this range
limin, limax = 4000, 4800  # x-axis limits applied to every panel

class_values = [0, 1]
class_colors = ['#1e64fa', '#fa641e']

fsize = 19
plots = (1, len(features))
feature = dataset[features[0]]

img, _s = plt.subplots(*plots, figsize=(13, 6))
img.tight_layout()
clase = dataset['Class']
# NOTE(review): the row filter is built from features[0] only and is reused
# for every panel — confirm this is intended rather than a per-feature filter.
datapresent = (feature > fmin) & (feature < fmax)

for i, f in enumerate(features):
    plt.subplot(*plots, i + 1)
    # One density curve per class value, each drawn in its own colour.
    for cv, color in zip(class_values, class_colors):
        sns.kdeplot(
            dataset[datapresent & (clase == cv)][[f]],
            palette=[color], label=str(cv), legend=(i + 1 == len(features)), )
    # The title only depends on the feature, so it is set once per panel.
    plt.title(f, fontsize=fsize)
    plt.ylabel('')
    plt.xlim(limin, limax)

plt.legend(fontsize=fsize)
# The filter range is part of the file name: the zoomed density cell below
# saves to './resultados/comparacion_densidad_<features>.svg' with the same
# features, so sharing that path would overwrite this figure.
plt.savefig(
    f'./resultados/comparacion_densidad_{"_".join(features)}_{fmin}_{fmax}.svg',
    format='svg', dpi=400, bbox_inches='tight')

#plt.show()


In [None]:
# Per-class kernel density estimates of the selected features, one panel per
# feature, using a narrower row filter and no explicit x-axis limits.
features = ['V6', 'V9']
# Rows are kept only when features[0] lies strictly inside (fmin, fmax).
fmin, fmax = 4500, 4800

class_values = [0, 1]
class_colors = ['#1e64fa', '#fa641e']

fsize = 19
# Subplot grid: a single row with one column per feature.
plots = (1, len(features))
feature = dataset[features[0]]

img, _s = plt.subplots(*plots, figsize=(13, 6))
img.tight_layout()
clase = dataset['Class']
# NOTE(review): the row filter is built from features[0] only and is reused
# for every panel — confirm this is intended rather than a per-feature filter.
datapresent = (feature > fmin) & (feature < fmax)

for i, f in enumerate(features):
    plt.subplot(*plots, i + 1)
    # One density curve per class value, each drawn in its own colour.
    for cv, color in zip(class_values, class_colors):
        sns.kdeplot(
            dataset[datapresent & (clase == cv)][[f]],
            palette=[color], label=str(cv), legend=(i + 1 == len(features)), )
        plt.title(f, fontsize=fsize)
    plt.ylabel('')

# Legend is (re)drawn on the last active panel with the larger font size.
plt.legend(fontsize=fsize)
# Output file name is derived from the feature names joined by '_'.
plt.savefig(
    f'./resultados/comparacion_densidad_{"_".join(features)}.svg',
    format='svg', dpi=400, bbox_inches='tight')

#plt.show()


---