# Análisis Exploratorio de Datos (EDA)

## Planteamiento del problema y recopilación de datos

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import json
import numpy as np
from numpy._core.defchararray import upper

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle
from sklearn.feature_selection import f_classif, SelectKBest



# Read the CSV file
# NOTE(review): the path is an empty string in this template; set it to the dataset location before running.
df = pd.read_csv('', sep=',') # This CSV file uses commas as separators
# Quick look at the first rows and at the column names.
print(df.head())
print(df.columns)

hello


## Exploración y limpieza de datos

Comprobamos las dimensiones del dataframe y si el número de valores no nulos de cada columna concuerda con el total de filas. Además, vemos el tipo de dato de cada columna, lo que nos permite separar las variables categóricas de las numéricas.

In [None]:
# Report the dataframe dimensions as (rows, columns).
print(f"Dimensiones del dataframe: {df.shape}")
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

Procedemos a contabilizar los nulos y únicos: 

In [None]:
# Count the missing values and the distinct values of every column.
print(f"Valores null por columna: \n{df.isnull().sum()}")
print(f"Valores unicos por columna: \n{df.nunique()}")

#### Resumen de cada columna:

### Eliminamos las columnas que no nos aportan datos relevantes

Inicialmente comprobamos que no haya posibles duplicados en datos que generen conflictos

In [None]:
# Number of rows that are exact duplicates once the column passed to drop() is ignored.
# NOTE(review): the column name is an empty placeholder (presumably an identifier column); fill it in before running.
print(df.drop("", axis = 1).duplicated().sum())


# Working copy without the listed columns; df itself is left untouched (inplace=False).
total_dataf = df.drop([''], axis=1, inplace=False)
print(total_dataf.shape)
print(total_dataf.columns)
total_dataf.head()

## Análisis de variables univariante

### Generamos gráficos con los valores categóricos

In [None]:
fig, axis = plt.subplots(2, 2, figsize = (14, 8))

# Draw one count plot per categorical column on a 2x2 grid.
# NOTE(review): the x column names are empty placeholders; fill them in before running.
sns.countplot(ax = axis[0, 0], data = total_dataf, x = "")
# Rotate the x tick labels of the first panel so long category names stay readable.
axis[0,0].tick_params(axis="x", rotation=70)
sns.countplot(ax = axis[0, 1], data = total_dataf, x = "").set(ylabel = None)
sns.countplot(ax = axis[1, 0], data = total_dataf, x = "").set(ylabel = None)

# Only three panels are used, so the empty fourth one is removed.
fig.delaxes(axis[1, 1])

# Adjust the layout and render the figure once.
plt.tight_layout()
plt.show()

### Generamos gráficos con los valores numéricos

In [None]:
fig, axis = plt.subplots(4, 4, figsize = (18, 12), gridspec_kw={'height_ratios': [4, 1, 4, 1]})

# Build a multi-panel figure: each histogram sits directly above the box plot of the same column.
# NOTE(review): the grid is 4x4 but only four axes are drawn here, so the other panels stay empty
# until more columns are added. The x column names are empty placeholders.
sns.histplot(ax = axis[0, 0], data = total_dataf, x = "").set(xlabel = None)
sns.boxplot(ax = axis[1, 0], data = total_dataf, x = "")

sns.histplot(ax = axis[0, 1], data = total_dataf, x = "").set(xlabel = None, ylabel = None)
sns.boxplot(ax = axis[1, 1], data = total_dataf, x = "")


# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()

## Análisis de variables multivariante

### Análisis numérico-numérico

Tomaremos el dato "y" como variable objetivo

In [None]:
fig, axis = plt.subplots(4, 4, figsize = (23, 8), gridspec_kw={'height_ratios': [3, 1, 3, 1]})

# Build a multi-panel figure: a regression scatter plot of each predictor against "y",
# with the 2x2 correlation matrix of that pair drawn underneath.
# NOTE(review): the grid is 4x4 but only four axes are drawn here; predictor names are empty placeholders.
sns.regplot(ax = axis[0, 0], data = total_dataf, x = "", y = "y")
sns.heatmap(total_dataf[["y", ""]].corr(), annot = True, fmt = ".2f", ax = axis[1, 0], cbar = False)

sns.regplot(ax = axis[0, 1], data = total_dataf, x = "", y = "y").set(ylabel=None)
# NOTE(review): unlike the first heatmap, this one does not pass cbar = False; confirm whether that is intended.
sns.heatmap(total_dataf[["y", ""]].corr(), annot = True, fmt = ".2f", ax = axis[1, 1])


# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()

### Combinaciones entre variables numéricas

Vamos a explorar la relación entre: 

In [None]:
fig, axis = plt.subplots(2, 3, figsize = (20, 8), gridspec_kw={'height_ratios': [4, 1]})

# Regression scatter plot for one pair of numeric columns, with their correlation matrix underneath.
# NOTE(review): the grid is 2x3 but only the first column of panels is drawn; column names are empty placeholders.
sns.regplot(ax = axis[0, 0], data = total_dataf, x = "", y = "")
sns.heatmap(total_dataf[["", ""]].corr(), annot = True, fmt = ".2f", ax = axis[1, 0], cbar = False)


# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()

### Combinaciones post-mapa de calor

### Análisis categórico-categórico

No se puede realizar un primer análisis con la variable objetivo, ya que no sería lógico factorizar "y": al ser numérica, generaría una cantidad enorme de categorías. Por lo tanto, seguiremos con el análisis.

### Combinaciones de la clase con varias predictoras

In [None]:
fig, axis = plt.subplots(figsize = (10, 5), ncols = 1)

# Grouped bar plot of y by x, split by hue. No ax is passed, so seaborn draws on the
# current axes, which is the single one created above.
# NOTE(review): x, y and hue are empty placeholders; fill them in before running.
sns.barplot(data = total_dataf, x = "", y = "", hue = "")
# Rotate the x tick labels so long category names do not overlap.
axis.tick_params(axis="x", rotation=50)

plt.tight_layout()

plt.show()

## Análisis de correlaciones (completo)

### Factorizar las variables categóricas

In [None]:
# Categorical columns to encode as integer codes.
# NOTE(review): the list holds an empty placeholder; fill in the real column names before running.
lista_a_factorizar = [""]

for var in lista_a_factorizar:
  # Each column gets its own JSON file holding the category -> code mapping.
  url_transformado = "../data/processed/" + var + "_transformation_rules.json"
  var_n = var+"_n"

  # pd.factorize numbers the categories in order of first appearance (missing values get -1).
  total_dataf[var_n] = pd.factorize(total_dataf[var])[0]

  # One row per distinct (category, code) pair. tolist() converts numpy scalars into
  # plain Python values, so json.dump also works when the column is numeric.
  mapping_rows = total_dataf[[var_n, var]].drop_duplicates()
  transformation_rules = dict(zip(mapping_rows[var].tolist(), mapping_rows[var_n].tolist()))

  with open(url_transformado, "w") as f:
    json.dump(transformation_rules, f)

#### Mapa de calor

In [None]:
# Columns to include in the correlation matrix.
# NOTE(review): the list holds an empty placeholder; fill in the numeric (and factorized) column names.
cols_num = [""]
fig, ax = plt.subplots(figsize=(10,7))
# Annotated heatmap of the pairwise Pearson correlations.
sns.heatmap(total_dataf[cols_num].corr(method="pearson"), annot=True, fmt=".2f", cmap="viridis", ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise relationship plots over the columns of total_dataf.
sns.pairplot(data = total_dataf)
plt.tight_layout()
plt.show()

## Feature engineering

### Análisis de outliers

In [None]:
# Columns kept for the rest of the pipeline.
# NOTE(review): the list holds an empty placeholder; fill in the final column names (target included).
FINAL_COLS = [""]
total_dataf = total_dataf[FINAL_COLS]
# Summary statistics, used to spot candidate outliers.
total_dataf.describe()

In [None]:
# Keep two independent copies of the dataset: one that retains the outliers
# and one on which the outlier replacement is applied.

total_data_CON_outliers = total_dataf.copy()
total_data_SIN_outliers = total_dataf.copy()

# Columns whose outliers will be replaced.
# NOTE(review): the list holds an empty placeholder; fill in the real column names.
outliers_cols = [""]

def replace_outliers(column, df):
  """Cap the outliers of ``df[column]`` at the 1.5 * IQR limits.

  The limits are computed from ``df`` itself, not from any module-level frame,
  and both are rounded to two decimals. If the lower limit comes out negative
  it is replaced by the observed minimum of the column, so nothing is capped
  from below in that case.

  Args:
    column: Name of the numeric column to process.
    df: DataFrame containing ``column``. It is not modified.

  Returns:
    A tuple ``(capped_df, [lower_limit, upper_limit])``. ``capped_df`` is a
    copy of ``df`` with ``column`` capped to the limits; missing values are
    left as missing. The limits are plain Python floats so they can be
    written with ``json.dump``.
  """
  col_stats = df[column].describe()
  col_iqr = col_stats["75%"] - col_stats["25%"]
  upper_limit = round(float(col_stats["75%"] + 1.5 * col_iqr), 2)
  lower_limit = round(float(col_stats["25%"] - 1.5 * col_iqr), 2)

  # Series.min() skips missing values; float() avoids returning a numpy integer,
  # which json cannot serialize.
  if lower_limit < 0:
    lower_limit = float(df[column].min())

  # Work on a copy so the caller's frame is untouched. clip() caps both tails
  # in one pass and keeps NaN as NaN.
  capped = df.copy()
  capped[column] = capped[column].clip(lower = lower_limit, upper = upper_limit)
  return capped, [lower_limit, upper_limit]

# Process every listed column in turn, feeding the result of one call into the
# next, and remember the [lower, upper] limits applied to each column.
outliers_dict = {}
for column in outliers_cols:
  total_data_SIN_outliers, limits = replace_outliers(column, total_data_SIN_outliers)
  outliers_dict[column] = limits

# Store the limits as JSON next to the processed data.
with open("../data/processed/outliers_dict.json", "w") as f:
  f.write(json.dumps(outliers_dict))

### Análisis de valores faltantes

In [None]:
# Missing values per column, highest count first, for both dataset variants.
print(total_data_CON_outliers.isnull().sum().sort_values(ascending=False))
total_data_SIN_outliers.isnull().sum().sort_values(ascending=False)

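Estrategias habituales para imputar los valores faltantes: la media (`mean()`), la mediana (`median()`) o la moda (`mode()`) de la columna.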

### Inferencia de nuevas características

### Escalado de valores

In [None]:
# Predictor columns and target column.
# NOTE(review): both are empty placeholders; fill them in before running.
predictoras = [""]
target = ""

# Feature matrices for the two dataset variants; the target is taken from the variant with outliers.
X_CON = total_data_CON_outliers.drop(target, axis = 1)[predictoras]
X_SIN = total_data_SIN_outliers.drop(target, axis = 1)[predictoras]
y = total_data_CON_outliers[target]

# Both calls use the same test_size and random_state. y_train / y_test are produced only by the
# first call and are reused for the SIN variant.
# NOTE(review): this assumes both calls select the same rows, which holds as long as X_CON and
# X_SIN have the same number of rows; confirm if rows are ever dropped from only one variant.
X_train_CON_outliers, X_test_CON_outliers, y_train, y_test = train_test_split(X_CON, y, test_size = 0.2, random_state = 10)
X_train_SIN_outliers, X_test_SIN_outliers = train_test_split(X_SIN, test_size = 0.2, random_state = 10)

In [None]:
def _fit_and_apply(scaler, train_df, test_df):
  """Fit ``scaler`` on ``train_df`` and return both splits transformed as DataFrames.

  The scaler is fitted on the training split only and then applied to both
  splits. Each result keeps the row index of its input and uses
  ``predictoras`` as column names.
  """
  scaler.fit(train_df)
  scaled_train = pd.DataFrame(scaler.transform(train_df), index = train_df.index, columns = predictoras)
  scaled_test = pd.DataFrame(scaler.transform(test_df), index = test_df.index, columns = predictoras)
  return scaled_train, scaled_test


# Standardization with StandardScaler, with and without outliers.
norm_CON_outliers = StandardScaler()
X_train_CON_outliers_norm, X_test_CON_outliers_norm = _fit_and_apply(
  norm_CON_outliers, X_train_CON_outliers, X_test_CON_outliers)

norm_SIN_outliers = StandardScaler()
X_train_SIN_outliers_norm, X_test_SIN_outliers_norm = _fit_and_apply(
  norm_SIN_outliers, X_train_SIN_outliers, X_test_SIN_outliers)

# Min-max scaling with MinMaxScaler, with and without outliers.
scaler_CON_outliers = MinMaxScaler()
X_train_CON_outliers_scal, X_test_CON_outliers_scal = _fit_and_apply(
  scaler_CON_outliers, X_train_CON_outliers, X_test_CON_outliers)

scaler_SIN_outliers = MinMaxScaler()
X_train_SIN_outliers_scal, X_test_SIN_outliers_scal = _fit_and_apply(
  scaler_SIN_outliers, X_train_SIN_outliers, X_test_SIN_outliers)

# Save every resulting dataset as ../data/processed/<name>.xlsx, without the index column.
processed_datasets = {
  "X_train_CON_outliers": X_train_CON_outliers,
  "X_train_CON_outliers_norm": X_train_CON_outliers_norm,
  "X_train_CON_outliers_scal": X_train_CON_outliers_scal,
  "X_train_SIN_outliers": X_train_SIN_outliers,
  "X_train_SIN_outliers_norm": X_train_SIN_outliers_norm,
  "X_train_SIN_outliers_scal": X_train_SIN_outliers_scal,
  "X_test_CON_outliers": X_test_CON_outliers,
  "X_test_CON_outliers_norm": X_test_CON_outliers_norm,
  "X_test_CON_outliers_scal": X_test_CON_outliers_scal,
  "X_test_SIN_outliers": X_test_SIN_outliers,
  "X_test_SIN_outliers_norm": X_test_SIN_outliers_norm,
  "X_test_SIN_outliers_scal": X_test_SIN_outliers_scal,
  "y_train": y_train,
  "y_test": y_test,
}
for dataset_name, dataset in processed_datasets.items():
  dataset.to_excel("../data/processed/" + dataset_name + ".xlsx", index = False)

# Pickle the fitted scalers as ../models/<name>.pkl so the same transformations can be reapplied later.
fitted_scalers = {
  "norm_CON_outliers": norm_CON_outliers,
  "norm_SIN_outliers": norm_SIN_outliers,
  "scaler_CON_outliers": scaler_CON_outliers,
  "scaler_SIN_outliers": scaler_SIN_outliers,
}
for scaler_name, fitted_scaler in fitted_scalers.items():
  with open("../models/" + scaler_name + ".pkl", "wb") as file:
    pickle.dump(fitted_scaler, file)

## Feature Selection

In [None]:
# Feature selection is run on the unscaled variant that keeps the outliers.
X_train = X_train_CON_outliers.copy()
X_test = X_test_CON_outliers.copy()

# Keep the 5 best-scoring features, or all of them when fewer than 5 predictors exist.
# NOTE(review): f_classif is an ANOVA F-test meant for a categorical target; if the target
# is continuous, f_regression is the matching score function. Confirm against the target.
selection_model = SelectKBest(f_classif, k = min(5, X_train.shape[1]))
selection_model.fit(X_train, y_train)

# Boolean mask of the selected columns.
ix = selection_model.get_support()
# Keep the original row index (as the scaled frames do) so rows stay aligned with y_train / y_test.
X_train_sel = pd.DataFrame(selection_model.transform(X_train), index = X_train.index, columns = X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), index = X_test.index, columns = X_test.columns.values[ix])

X_train_sel.head()