# Proyecto 1 - Regresión Lineal

In [None]:
# Importar las bibliotecas necesarias

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import linear_regression as lr	# Implementacion propia de regresion lineal

## a) Limpieza del dataset

In [None]:
# Load the raw Ames Housing data (tab-separated text file)
df = pd.read_csv('../data/AmesHousing.txt', sep='\t')

# Step 1: keep only the rows whose 'Sale Condition' is exactly 'Normal'
df = df.loc[df['Sale Condition'].eq('Normal')]

In [None]:

# Step 2: drop properties with more than 1500 sq ft of above-ground living area
df = df.loc[df['Gr Liv Area'].le(1500)]

In [None]:
# Step 3: draw a random sample of at most 200 rows (fewer if the filtered frame is smaller)
sample_size = min(len(df), 200)
# Fixed random_state so the same rows are drawn on every run
df_cleaned = df.sample(random_state=42, n=sample_size)

df_cleaned

In [None]:
# Persist the sampled rows as comma-separated text, without the index column
df_cleaned.to_csv(path_or_buf='../data/ames_housing_cleaned.txt', index=False)

## b) Normalización de los datos

In [None]:
# Reload the cleaned sample written by the previous section
df = pd.read_csv('../data/ames_housing_cleaned.txt')

# Numeric columns to standardise. 'PID' and 'Order' are skipped, and so is
# 'Fireplaces', which is later compared against 0 to build the FireYN flag.
# 'SalePrice' is numeric and not skipped, so the target is standardised as well.
skip_cols = ['PID', 'Order', 'Fireplaces']
numeric_cols = df.select_dtypes(include=['number']).columns
to_normalize = [name for name in numeric_cols if name not in skip_cols]

# Z-score each column: (value - mean) / population standard deviation.
# NOTE(review): the statistics come from the whole sample, before the
# train/test split performed later in the notebook.
for col in to_normalize:
    column = df[col]
    col_mean = column.mean()
    col_std = column.std(ddof=0)
    # A constant column has zero spread; set it to 0 instead of dividing by zero
    df[col] = 0 if col_std == 0 else (column - col_mean) / col_std

df

## c) División Train/Test (80%-20%)

In [None]:
# Shuffle every row (fixed seed for reproducibility) so the split below does
# not depend on the order of the file, then renumber the index from 0
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# The first 80% of the shuffled rows form the training set, the rest the test set
cutoff = int(0.8 * len(df))
train_df, test_df = df.iloc[:cutoff], df.iloc[cutoff:]

# Separate the predictors (X) from the target column 'SalePrice' (y)
X_train, y_train = train_df.drop(columns='SalePrice'), train_df['SalePrice']
X_test, y_test = test_df.drop(columns='SalePrice'), test_df['SalePrice']

## d) Modelos
### Modelo 0 - Total Basement + Gr Liv Area

In [None]:
# Model 0 uses only the two baseline predictors
model0_cols = ['Total Bsmt SF', 'Gr Liv Area']

# Plain NumPy matrices for the training and test partitions
X0_raw_train = X_train[model0_cols].to_numpy()
X0_raw_test = X_test[model0_cols].to_numpy()

# y_train and y_test are reused unchanged as the targets

### Modelo 1 - Total Basement + Gr Liv Area + Lot Area + Garage Cars + FireYN

In [None]:
# Copies of the feature frames with a binary 'FireYN' column appended:
# 0 when 'Fireplaces' equals 0, 1 for any other value.
# assign() returns a new frame, so X_train / X_test are left untouched.
X_train_fireyn = X_train.assign(FireYN=X_train['Fireplaces'].ne(0).astype('int64'))
X_test_fireyn = X_test.assign(FireYN=X_test['Fireplaces'].ne(0).astype('int64'))

In [None]:
# Model 1 = the two baseline predictors plus lot area, garage capacity and the FireYN flag
model1_cols = ['Total Bsmt SF', 'Gr Liv Area', 'Lot Area', 'Garage Cars', 'FireYN']

# Plain NumPy matrices for the training and test partitions
X1_raw_train = X_train_fireyn[model1_cols].to_numpy()
X1_raw_test = X_test_fireyn[model1_cols].to_numpy()

# y_train and y_test are reused unchanged as the targets

### Modelo 2 - Total Basement + Gr Liv Area + 3 atributos

In [None]:
# Rank candidate attributes by the strength of their correlation with SalePrice.
# Only the training split is used, so the held-out test rows cannot influence
# which features are selected.
numeric_df = train_df.select_dtypes(include=[np.number])

# Columns that must not be candidates: the target itself, the attributes
# already used by Models 0 and 1, and the row identifiers ('Order', 'PID').
# errors='ignore' tolerates any of these being absent from the numeric columns.
excluded_cols = [
    'SalePrice', 'Total Bsmt SF', 'Gr Liv Area', 'Lot Area', 'Garage Cars',
    'Fireplaces', 'Order', 'PID',
]
correlations = numeric_df.corr()['SalePrice'].drop(excluded_cols, errors='ignore')

# Keep the 3 attributes with the largest absolute correlation
top3 = correlations.abs().sort_values(ascending=False).head(3)
print(top3)

In [None]:
# Model 2 - 'Total Bsmt SF' and 'Gr Liv Area' plus the 3 attributes selected by correlation

# Names of the 3 selected attributes and the full feature list of the model
top3_features = top3.index.tolist()
model2_features = ['Total Bsmt SF', 'Gr Liv Area'] + top3_features

# Show the attributes used by Model 2
print("Atributos usados en Modelo 2:", model2_features)

# Keep only rows with no missing value in any feature the model actually uses.
# The mask is built from the model's own feature list rather than a hard-coded
# column name, so it stays correct whichever attributes were selected and also
# covers the two baseline columns.
X2_train_mask = X_train[model2_features].notna().all(axis=1)
X2_test_mask = X_test[model2_features].notna().all(axis=1)

# Feature matrices and the matching targets, filtered with the same masks
X2_raw_train = X_train.loc[X2_train_mask, model2_features].values
y_train_clean = y_train[X2_train_mask]

X2_raw_test = X_test.loc[X2_test_mask, model2_features].values
y_test_clean = y_test[X2_test_mask]

## e) Evaluación de resultados
### Modelo 0

In [None]:
# Gradient-descent hyperparameters
alpha = 0.0001
num_iters = 50000  # number of iterations to run

# Start from an all-zero weight vector, one entry per feature column
theta = np.zeros(X0_raw_train.shape[1])

# Fit Model 0; the call's result is unpacked as (weights, intercept, cost history)
theta_final, bias_final, cost_history = lr.gradient_descent(
    X0_raw_train, y_train, theta, alpha, num_iters
)

# Predictions of the fitted linear model on both partitions
train_predictions = np.dot(X0_raw_train, theta_final) + bias_final
test_predictions = np.dot(X0_raw_test, theta_final) + bias_final

# Signed errors (prediction - y); every metric below is derived from them
train_errors = train_predictions - y_train
test_errors = test_predictions - y_test

# Evaluation metrics, printed with 5 decimals
# Bias: Avg(prediction - y)
metric_bias = np.mean(train_errors)
metric_bias_test = np.mean(test_errors)
print(f"Bias (Train): {metric_bias:.5f}, Bias (Test): {metric_bias_test:.5f}")

# Maximum Deviation: Max|y - prediction|
metric_max_deviation = np.max(np.abs(train_errors))
metric_max_deviation_test = np.max(np.abs(test_errors))
print(f"Maximum Deviation (Train): {metric_max_deviation:.5f}, Maximum Deviation (Test): {metric_max_deviation_test:.5f}")

# Mean Absolute Deviation: Avg|y - prediction|
metric_mean_absolute_deviation = np.mean(np.abs(train_errors))
metric_mean_absolute_deviation_test = np.mean(np.abs(test_errors))
print(f"Mean Absolute Deviation (Train): {metric_mean_absolute_deviation:.5f}, Mean Absolute Deviation (Test): {metric_mean_absolute_deviation_test:.5f}")

# Mean Squared Error: Avg(y - prediction)^2
metric_mean_squared_error = np.mean(train_errors ** 2)
metric_mean_squared_error_test = np.mean(test_errors ** 2)
print(f"Mean Squared Error (Train): {metric_mean_squared_error:.5f}, Mean Squared Error (Test): {metric_mean_squared_error_test:.5f}")



### Modelo 1

In [None]:
# Gradient-descent hyperparameters
alpha = 0.0001
num_iters = 50000  # number of iterations to run

# Start from an all-zero weight vector, one entry per feature column
theta = np.zeros(X1_raw_train.shape[1])

# Fit Model 1; the call's result is unpacked as (weights, intercept, cost history)
theta_final, bias_final, cost_history = lr.gradient_descent(
    X1_raw_train, y_train, theta, alpha, num_iters
)

# Predictions of the fitted linear model on both partitions
train_predictions = np.dot(X1_raw_train, theta_final) + bias_final
test_predictions = np.dot(X1_raw_test, theta_final) + bias_final

# Signed errors (prediction - y); every metric below is derived from them
train_errors = train_predictions - y_train
test_errors = test_predictions - y_test

# Evaluation metrics, printed with 5 decimals
# Bias: Avg(prediction - y)
metric_bias = np.mean(train_errors)
metric_bias_test = np.mean(test_errors)
print(f"Bias (Train): {metric_bias:.5f}, Bias (Test): {metric_bias_test:.5f}")

# Maximum Deviation: Max|y - prediction|
metric_max_deviation = np.max(np.abs(train_errors))
metric_max_deviation_test = np.max(np.abs(test_errors))
print(f"Maximum Deviation (Train): {metric_max_deviation:.5f}, Maximum Deviation (Test): {metric_max_deviation_test:.5f}")

# Mean Absolute Deviation: Avg|y - prediction|
metric_mean_absolute_deviation = np.mean(np.abs(train_errors))
metric_mean_absolute_deviation_test = np.mean(np.abs(test_errors))
print(f"Mean Absolute Deviation (Train): {metric_mean_absolute_deviation:.5f}, Mean Absolute Deviation (Test): {metric_mean_absolute_deviation_test:.5f}")

# Mean Squared Error: Avg(y - prediction)^2
metric_mean_squared_error = np.mean(train_errors ** 2)
metric_mean_squared_error_test = np.mean(test_errors ** 2)
print(f"Mean Squared Error (Train): {metric_mean_squared_error:.5f}, Mean Squared Error (Test): {metric_mean_squared_error_test:.5f}")



### Modelo 2

In [None]:
# Gradient-descent hyperparameters
alpha = 0.0001
num_iters = 50000  # number of iterations to run

# Start from an all-zero weight vector, one entry per feature column
theta = np.zeros(X2_raw_train.shape[1])

# Fit Model 2 on the rows that survived the missing-value filter;
# the call's result is unpacked as (weights, intercept, cost history)
theta_final, bias_final, cost_history = lr.gradient_descent(
    X2_raw_train, y_train_clean, theta, alpha, num_iters
)

# Predictions of the fitted linear model on both partitions
train_predictions = np.dot(X2_raw_train, theta_final) + bias_final
test_predictions = np.dot(X2_raw_test, theta_final) + bias_final

# Signed errors (prediction - y); every metric below is derived from them
train_errors = train_predictions - y_train_clean
test_errors = test_predictions - y_test_clean

# Evaluation metrics, printed with 5 decimals
# Bias: Avg(prediction - y)
metric_bias = np.mean(train_errors)
metric_bias_test = np.mean(test_errors)
print(f"Bias (Train): {metric_bias:.5f}, Bias (Test): {metric_bias_test:.5f}")

# Maximum Deviation: Max|y - prediction|
metric_max_deviation = np.max(np.abs(train_errors))
metric_max_deviation_test = np.max(np.abs(test_errors))
print(f"Maximum Deviation (Train): {metric_max_deviation:.5f}, Maximum Deviation (Test): {metric_max_deviation_test:.5f}")

# Mean Absolute Deviation: Avg|y - prediction|
metric_mean_absolute_deviation = np.mean(np.abs(train_errors))
metric_mean_absolute_deviation_test = np.mean(np.abs(test_errors))
print(f"Mean Absolute Deviation (Train): {metric_mean_absolute_deviation:.5f}, Mean Absolute Deviation (Test): {metric_mean_absolute_deviation_test:.5f}")

# Mean Squared Error: Avg(y - prediction)^2
metric_mean_squared_error = np.mean(train_errors ** 2)
metric_mean_squared_error_test = np.mean(test_errors ** 2)
print(f"Mean Squared Error (Train): {metric_mean_squared_error:.5f}, Mean Squared Error (Test): {metric_mean_squared_error_test:.5f}")



### Comparación entre modelos

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Metric names, in the order they are laid out along the x axis
metrics = [
    "Bias",
    "Maximum Deviation",
    "Mean Absolute Deviation",
    "Mean Squared Error"
]

model_labels = ['Modelo 0', 'Modelo 1', 'Modelo 2']


def _fit_and_score(X_tr, y_tr, X_te, y_te, alpha=0.0001, num_iters=50000):
    """Train one model and return ``(train_metrics, test_metrics)``.

    Each returned list follows the order of ``metrics``: bias, maximum
    deviation, mean absolute deviation, mean squared error. The default
    hyperparameters repeat the ones used in the per-model evaluation cells.
    """
    theta_fit, bias_fit, _ = lr.gradient_descent(
        X_tr, y_tr, np.zeros(X_tr.shape[1]), alpha, num_iters
    )
    scores = []
    for X_part, y_part in ((X_tr, y_tr), (X_te, y_te)):
        # Signed errors (prediction - y) as a plain float array
        errors = np.asarray(X_part.dot(theta_fit) + bias_fit - y_part, dtype=float)
        scores.append([
            float(np.mean(errors)),
            float(np.max(np.abs(errors))),
            float(np.mean(np.abs(errors))),
            float(np.mean(errors ** 2)),
        ])
    return scores[0], scores[1]


def _plot_metric_bars(values, title):
    """Draw one grouped bar chart: a group per metric, a bar per model."""
    fig, ax = plt.subplots(figsize=(10, 6))
    for i, model in enumerate(model_labels):
        # Offset (i - 1) * width centres the three bars on each tick
        ax.bar(x + (i - 1) * width, [values[j][i] for j in range(len(metrics))], width, label=model)

    ax.set_ylabel('Valor')
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(metrics, rotation=15)
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()


# Compute the metrics from the models themselves instead of plotting
# hand-typed placeholder numbers. Each model is refitted here because the
# evaluation cells overwrite their results from one model to the next.
model_data = [
    (X0_raw_train, y_train, X0_raw_test, y_test),
    (X1_raw_train, y_train, X1_raw_test, y_test),
    (X2_raw_train, y_train_clean, X2_raw_test, y_test_clean),
]
model_scores = [_fit_and_score(*data) for data in model_data]

# Transpose to one row per metric, each row: [Modelo 0, Modelo 1, Modelo 2]
train_values = [list(row) for row in zip(*(train for train, _ in model_scores))]
test_values = [list(row) for row in zip(*(test for _, test in model_scores))]

x = np.arange(len(metrics))  # tick position of each metric
width = 0.22  # bar width

# --- Training chart ---
_plot_metric_bars(train_values, 'Comparación de métricas de entrenamiento entre modelos')

# --- Test chart ---
_plot_metric_bars(test_values, 'Comparación de métricas de prueba entre modelos')