# In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.model import train_model, predict
from src.preprocess import load_data, clean_dimensions, encode_features, align_features

# Load the data: both CSV locations are relative paths handed to load_data,
# which returns the training and test frames as a pair.
train_path, test_path = (
    '../data/diamonds_train.csv',
    '../data/diamonds_test.csv',
)
df_train, df_test = load_data(train_path, test_path)

# Exploratory data analysis (EDA)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_train.info()
print(df_train.describe())

# Frequency of each level for the three categorical columns.
for column in ('cut', 'color', 'clarity'):
    print(df_train[column].value_counts())

# Histogram of the target column with a KDE curve overlaid (kde=True).
sns.histplot(df_train['price'], kde=True)
plt.title('Distribución del Precio')
plt.show()

# Scatter plot of carat against price.
sns.scatterplot(x='carat', y='price', data=df_train)
plt.title('Relación entre carat y price')
plt.show()

# Annotated correlation heatmap; numeric_only=True restricts the matrix to
# numeric columns, so the categorical columns are left out.
sns.heatmap(df_train.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Matriz de Correlación')
plt.show()

# Price distribution per cut level.
sns.boxplot(x='cut', y='price', data=df_train)
plt.title('Precio por Tipo de Corte')
plt.show()

# Price distribution per clarity level.
sns.boxplot(x='clarity', y='price', data=df_train)
plt.title('Precio por Claridad')
plt.show()

# Preprocessing
categorical_columns = ['cut', 'color', 'clarity']

# Only the training frame is passed through clean_dimensions.
df_train = clean_dimensions(df_train)

# Split the target column ('price') from the training features, then encode
# the features using the categorical column list above.
y = df_train['price']
X = encode_features(df_train.drop(columns='price'), categorical_columns)

# The test frame's 'id' column is dropped before encoding, so it is not used
# as a model feature.
X_test = encode_features(df_test.drop(columns='id'), categorical_columns)

# NOTE(review): align_features presumably reconciles the column sets of the
# two encoded frames -- confirm against src.preprocess.
X, X_test = align_features(X, X_test)

# Train the model and predict on the prepared test features.
model = train_model(X, y)
predictions = predict(model, X_test)

# Save the predictions, pairing each test 'id' with its predicted price.
submission = pd.DataFrame({'id': df_test['id'], 'price': predictions})

# to_csv does not create missing parent directories; make sure the output
# folder exists so the write cannot fail after the model has been trained.
output_path = Path('../outputs/final_submission.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)
