In [2]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install xgboost
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error

In [4]:
! mkdir ~/.kaggle

In [5]:
!cp /content/drive/MyDrive/CollabData/kaggle_API/kaggle.json ~/.kaggle/kaggle.json

In [6]:
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
! kaggle competitions download home-data-for-ml-course

Downloading home-data-for-ml-course.zip to /content
  0% 0.00/386k [00:00<?, ?B/s]
100% 386k/386k [00:00<00:00, 62.5MB/s]


In [8]:
! unzip home-data-for-ml-course.zip

Archive:  home-data-for-ml-course.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: sample_submission.csv.gz  
  inflating: test.csv                
  inflating: test.csv.gz             
  inflating: train.csv               
  inflating: train.csv.gz            


In [9]:
# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Separate features and target variable
X = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: Handling categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns

# One-hot encoding for categorical features
# -- Change 1: Add prefix to encoded columns to prevent name collisions --
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, categories='auto')
X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train[categorical_features]),
                               columns=encoder.get_feature_names_out(categorical_features))
X_val_encoded = pd.DataFrame(encoder.transform(X_val[categorical_features]),
                             columns=encoder.get_feature_names_out(categorical_features))
X_test_encoded = pd.DataFrame(encoder.transform(test_df[categorical_features]),
                              columns=encoder.get_feature_names_out(categorical_features))

# Scaling numerical features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[numerical_features]),
                              columns=numerical_features) #-- Change 2: Add columns names for scaled features --
X_val_scaled = pd.DataFrame(scaler.transform(X_val[numerical_features]),
                             columns=numerical_features)  #-- Change 2: Add columns names for scaled features --
X_test_scaled = pd.DataFrame(scaler.transform(test_df[numerical_features]),
                              columns=numerical_features) #-- Change 2: Add columns names for scaled features --


# Concatenate encoded and scaled features
X_train_processed = pd.concat([X_train_scaled, X_train_encoded], axis=1)
X_val_processed = pd.concat([X_val_scaled, X_val_encoded], axis=1)
X_test_processed = pd.concat([X_test_scaled, X_test_encoded], axis=1)

# -- Change 3: Reset index to avoid potential issues with misaligned indices --
X_train_processed = X_train_processed.reset_index(drop=True)
X_val_processed = X_val_processed.reset_index(drop=True)
X_test_processed = X_test_processed.reset_index(drop=True)

In [16]:
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 5e-5, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'early_stopping_rounds':10
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train_processed, y_train,
              eval_set=[(X_val_processed, y_val)],
              verbose=False)  # Remove early_stopping_rounds here
    return model.best_score

In [17]:
# Create and run an Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params

[I 2025-02-12 15:44:54,733] A new study created in memory with name: no-name-3c94dd93-a9e2-464a-86f5-4281fd43297e
[I 2025-02-12 15:44:58,537] Trial 0 finished with value: 25672.669265020413 and parameters: {'lambda': 0.02598249428810433, 'alpha': 0.008742223598216022, 'colsample_bytree': 0.7419622511314616, 'subsample': 0.5896378400877342, 'learning_rate': 0.013999847477215496, 'n_estimators': 750, 'max_depth': 6, 'min_child_weight': 1, 'gamma': 0.40371164441420504}. Best is trial 0 with value: 25672.669265020413.
[I 2025-02-12 15:45:04,574] Trial 1 finished with value: 28942.852334354462 and parameters: {'lambda': 0.07411029941766116, 'alpha': 0.0007880082508848221, 'colsample_bytree': 0.8354287342531477, 'subsample': 0.5153749236227795, 'learning_rate': 0.00540827338300066, 'n_estimators': 704, 'max_depth': 10, 'min_child_weight': 10, 'gamma': 0.594316306906483}. Best is trial 0 with value: 25672.669265020413.
[I 2025-02-12 15:45:11,133] Trial 2 finished with value: 74000.16089308892

In [18]:
# Model Training and Prediction
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_train_processed, y_train)

predictions = final_model.predict(X_test_processed)

In [19]:
# Evaluate accuracy on validation set
val_predictions = final_model.predict(X_val_processed)

# Calculate RMSE without 'squared' argument and take the square root manually
rmse = mean_squared_error(y_val, val_predictions)**0.5

print(f"Validation RMSE: {rmse}")

Validation RMSE: 23627.352285010693


In [15]:
#submission_df = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': predictions})
#submission_df.to_csv('submission.csv', index=False)