In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.pipeline import Pipeline
from sklearn.impute import IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna
from objectiveOpt import studier

In [2]:
# Load datasets
# Training frame: the 'Id' column is discarded right after reading.
train = pd.read_csv('data/train.csv').drop(columns='Id')
# Test frame: 'Id' is removed from the features but kept aside in `test_ids`
# so it can be written next to the predictions later.
test = pd.read_csv('data/test.csv')
test_ids = test.pop('Id')

In [3]:
# Fill missing values for specific columns
# The same zero-fill is applied to BOTH frames. Filling only `train` would leave
# the NaNs in `test` to be handled later by the numeric imputer, so the two
# frames would encode "missing" differently for these columns.
# Non-inplace assignment keeps the cell free of hidden in-place mutation.
fill_values = {'LotFrontage': 0, 'MiscVal': 0}
train = train.fillna(fill_values)
test = test.fillna(fill_values)

In [4]:
# Feature engineering
# Each engineered column is the product of 'OverallQual' and one source column.
# Mapping: new column name -> source column. Insertion order is the order in
# which the columns are appended to the frame.
QUALITY_INTERACTIONS = {
    'year_qual': 'YearBuilt',
    'year_r_qual': 'YearRemodAdd',
    'qual_bsmt': 'TotalBsmtSF',
    'qual_fl': '1stFlrSF',
    'qual_gr': 'GrLivArea',
    'qual_gar_area': 'GarageArea',
    'qual_gar_cars': 'GarageCars',
    'qual_bath': 'FullBath',
    'qual_bed': 'BedroomAbvGr',
    'qual_kit': 'KitchenAbvGr',
    'qual_fire': 'Fireplaces',
    'qual_wd': 'WoodDeckSF',
    'qual_op': 'OpenPorchSF',
    'qual_en': 'EnclosedPorch',
    'qual_3s': '3SsnPorch',
    'qual_scr': 'ScreenPorch',
    'qual_pool': 'PoolArea',
    'qual_mo': 'MoSold',
    'qual_yr': 'YrSold',
}


def add_quality_interactions(df):
    """Return a new frame with the OverallQual interaction columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'OverallQual' and every source column listed in
        ``QUALITY_INTERACTIONS``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra column per mapping entry, equal to
        ``df[source] * df['OverallQual']``. The input frame is not modified.

    Raises
    ------
    KeyError
        If 'OverallQual' or one of the source columns is missing.
    """
    quality = df['OverallQual']
    return df.assign(**{
        new_col: df[source_col] * quality
        for new_col, source_col in QUALITY_INTERACTIONS.items()
    })


# The features must be computed on BOTH frames. If only `train` gets them, the
# later concat of train and test leaves every engineered column entirely NaN for
# the test rows, and those values end up imputed instead of derived from the
# actual test data.
train = add_quality_interactions(train)
test = add_quality_interactions(test)

In [5]:
# Extract numerical features and calculate correlation with SalePrice
numerical_features = train.select_dtypes(include=[np.number])
correlation_matrix = numerical_features.corr()

# Keep the columns whose absolute correlation with the target exceeds 0.8
# (SalePrice itself is part of the matrix at this point).
saleprice_corr = correlation_matrix["SalePrice"]
is_strong = saleprice_corr.abs() > 0.8
high_corr_features = correlation_matrix.index[is_strong].tolist()
high_corr_data = train[high_corr_features]

In [6]:
# Separate target variable and concatenate datasets for imputation
# `pop` removes 'SalePrice' from `train` and hands it back as the target series.
train_target = train.pop('SalePrice')
# Stack train on top of test (row-wise) with a fresh 0..n-1 index.
frames_to_stack = [train, test]
combinedSet = pd.concat(frames_to_stack, ignore_index=True)

In [7]:
# Split the combined frame back into train and test
# The first len(train_target) rows are the training rows, the remainder is test.
# Both halves carry the full column set of `combinedSet`; in the cells shown
# here nothing else is applied to `combinedSet` between the concat and this split.
n_train_rows = len(train_target)
train = combinedSet.iloc[:n_train_rows].copy()
test = combinedSet.iloc[n_train_rows:].copy()

In [8]:
# Select categorical and numerical columns
# Column lists are taken from the combined frame: object dtype -> categorical,
# any numeric dtype -> numerical.
categorical_cols = list(combinedSet.select_dtypes(include='object').columns)
numerical_cols = list(combinedSet.select_dtypes(include='number').columns)

In [9]:
# Define the transformers
# Numeric branch: a single iterative-imputation step with a fixed seed.
numerical_steps = [
    ('imputer', IterativeImputer(random_state=0)),
]
numerical_transformer = Pipeline(numerical_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps categories unseen during fit from
# raising at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [10]:
# Combine transformers into a preprocessor
# Each entry is (name, transformer, columns it is applied to).
column_branches = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [11]:
# Apply the preprocessor to the data
# The preprocessor is fitted on the training frame only; the test frame is
# transformed with the statistics/estimators learned from train.
# NOTE(review): the fit uses every training row, i.e. it happens before the
# train/validation split, so the validation rows also inform the imputers and
# the encoder.
train_processed = preprocessor.fit_transform(train)
test_processed = preprocessor.transform(test)

In [12]:
# Split processed training data for validation
# 80/20 split with a fixed seed so the partition is reproducible.
split_parts = train_test_split(
    train_processed,
    train_target,
    test_size=0.2,
    random_state=723894,
)
X_train, X_valid, y_train, y_valid = split_parts

In [13]:
# Run the hyperparameter search implemented in objectiveOpt.studier and unpack
# its two results; only `params` is consumed further down in this notebook.
# NOTE(review): the meaning of the two positional arguments is defined in
# objectiveOpt.py - confirm there.
# NOTE(review): the output recorded with this cell shows the call failing inside
# objectiveOpt.objective because train_test_split received a random_state
# outside [0, 4294967295]; that has to be fixed in objectiveOpt.py.
study_result = studier(1000, 12)
params, value = study_result
print(params)

[I 2024-11-14 20:08:20,502] A new study created in memory with name: We the best house
[W 2024-11-14 20:09:57,610] Trial 9 failed with parameters: {} because of the following error: InvalidParameterError("The 'random_state' parameter of train_test_split must be an int in the range [0, 4294967295], an instance of 'numpy.random.mtrand.RandomState' or None. Got 723894576082367 instead.").
Traceback (most recent call last):
  File "/home/abog/anaconda3/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/home/abog/UpperDivision/PycharmProjects/Project4House/objectiveOpt.py", line 106, in objective
    X_train, X_valid, y_train, y_valid = train_test_split(train_processed, train_target, test_size=0.2,
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/abog/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validat

InvalidParameterError: The 'random_state' parameter of train_test_split must be an int in the range [0, 4294967295], an instance of 'numpy.random.mtrand.RandomState' or None. Got 723894576082367 instead.

In [None]:
# Train the final model with the tuned hyperparameters and write the submission.
final_model = XGBRegressor(**params)
# NOTE(review): the model is fitted on the training part of the split only
# (X_train / y_train), not on all processed training rows - confirm this is intended.
final_model.fit(X_train, y_train)
predictions = final_model.predict(test_processed)

# Name the prediction column explicitly; an unnamed DataFrame column would be
# written with the header "0". The index is taken from `test_ids` so the
# column-wise concat pairs every Id with its own prediction.
final_pred = pd.DataFrame({'SalePrice': predictions}, index=test_ids.index)
submission = pd.concat([test_ids, final_pred], axis=1)
submission.to_csv('submissions/submission.csv', index=False)