In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Read the semicolon-separated train and test CSV files from the working directory
train_df, test_df = (
    pd.read_csv(csv_name, sep=";") for csv_name in ("train.csv", "test.csv")
)

# Keep the test "id" column aside; it becomes the "id" column of the submission
test_ids = test_df.loc[:, "id"]

In [None]:
# Pull the "quality" target out of the training frame; every other column is a feature
target_column = "quality"
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

In [None]:
# One-hot encode the 'type' column
# The category list is taken from the training data and applied to both frames, so
# drop_first=True removes the SAME baseline category in train and test. Encoding each
# frame independently would drop a different category whenever the test file does not
# contain every type seen in training, silently mislabelling those rows.
type_dtype = pd.CategoricalDtype(categories=pd.Categorical(X["type"]).categories)
X = pd.get_dummies(X.astype({"type": type_dtype}), columns=["type"], drop_first=True)
# Test-only 'type' values are not in the training categories; they become missing and
# therefore encode as all-zero dummy columns (i.e. the baseline category).
test_df = pd.get_dummies(test_df.astype({"type": type_dtype}), columns=["type"], drop_first=True)

In [None]:
# Give the test frame exactly the training feature layout: columns that exist only in
# train are created and filled with 0, columns that exist only in test are discarded,
# and the column order follows X.
missing_cols = set(X.columns).difference(test_df.columns)
test_df = test_df.reindex(columns=X.columns, fill_value=0)

In [None]:
# Standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_df)

In [None]:
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
# Hyperparameters for the XGBoost regressor, collected in one place
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 7,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "objective": "reg:squarederror",
    "random_state": 42,
}

# Build the model and fit it on the training split
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Score the held-out validation split: RMSE is the square root of the mean squared error
y_val_pred = xgb_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {rmse}")

In [None]:
# Predict on test set
# test_scaled is the test feature matrix after the scaler transform; the predictions
# are used as the "quality" column of the submission file.
test_predictions = xgb_model.predict(test_scaled)

In [None]:
# Pair each saved test id with its predicted quality and write the result to disk
# without the DataFrame index
submission_columns = {"id": test_ids, "quality": test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

print("submission.csv has been created!")