In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb


In [2]:
train = pd.read_csv('D:/train.csv')
test = pd.read_csv('D:/test.csv')

In [3]:
# Combine train and test data for preprocessing
all_data = pd.concat([train.drop('SalePrice', axis=1), test])

In [4]:
# Handle missing values
all_data.fillna(np.nan, inplace=True)

In [5]:
# Impute missing values for numerical features
numerical_cols = all_data.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy='mean')
all_data[numerical_cols] = imputer.fit_transform(all_data[numerical_cols])

In [6]:
# Impute missing values for categorical features
categorical_cols = all_data.select_dtypes(exclude=np.number).columns
imputer = SimpleImputer(strategy='most_frequent')
all_data[categorical_cols] = imputer.fit_transform(all_data[categorical_cols])

In [7]:
# Encode categorical features
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    all_data[col] = label_encoders[col].fit_transform(all_data[col])

In [8]:
# Split back into train and test sets
X_train = all_data[:len(train)]
X_test = all_data[len(train):]
y_train = train['SalePrice']

In [9]:
# Train-validation split
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [10]:
# Train the model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_split, y_train_split)


In [11]:
# Evaluate on validation set
y_val_pred = rf_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Validation RMSE (Random Forest):", rmse)

Validation RMSE (Random Forest): 28694.968647990914


In [12]:
# Convert data to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [13]:
# Train the model
params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse'}
xgb_model = xgb.train(params, dtrain, num_boost_round=1000, evals=[(dval, 'validation')], early_stopping_rounds=10, verbose_eval=False)


In [14]:
# Evaluate on validation set
y_val_pred_xgb = xgb_model.predict(dval)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
print("Validation RMSE (XGBoost):", rmse_xgb)

Validation RMSE (XGBoost): 0.04144590937986076


In [15]:
# Make predictions using the trained models
test_predictions_rf = rf_model.predict(X_test)
test_predictions_xgb = xgb_model.predict(xgb.DMatrix(X_test))


In [16]:
# Combine predictions from both models (you can also experiment with other ensemble methods)
final_predictions = (test_predictions_rf + test_predictions_xgb) / 2

In [17]:
# Create submission file
submission_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': final_predictions})
submission_df.to_csv('submission.csv', index=False)