In [61]:
import numpy as np
import pandas as pd

In [62]:
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [63]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [64]:
train_IDS = data.loc[:, ['Id']]
test_IDS = test_data.loc[:, ['Id']]

y = data.loc[:, ['SalePrice']]

data = data.drop(columns=['Id', 'SalePrice'])
test_data = test_data.drop(columns=['Id'])

In [65]:
data.shape

(1460, 79)

In [66]:
cat_variables = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation', 'Heating', 'Electrical', 'Functional', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition']
ordinal_var = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence']

In [67]:
# Ordinal encoding map for variables in ordinal_var
ordinal_mapping = {
    'ExterQual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    'ExterCond': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
    'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
    'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
    'HeatingQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    'CentralAir': {'Y': 1, 'N': 0},
    'KitchenQual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0},
    'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    'PavedDrive': {'Y': 2, 'P': 1, 'N': 0},
    'PoolQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'NA': 0},
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0}
}

# Apply ordinal encoding
for col, mapping in ordinal_mapping.items():
    data[col] = data[col].map(mapping)
    test_data[col] = test_data[col].map(mapping)

# One-hot encoding for categorical variables
data = pd.get_dummies(data, columns=cat_variables, drop_first=True)
test_data = pd.get_dummies(test_data, columns=cat_variables, drop_first=True)
data, test_data = data.align(test_data, join='inner', axis=1)

In [68]:
data.shape

(1460, 395)

In [69]:
columns_to_keep = data.columns[data.isna().sum() / data.shape[0] <= 0.50]
data = data[columns_to_keep]
test_data = test_data[columns_to_keep]

In [70]:
data.shape

(1460, 393)

In [71]:
for col in data.columns:
    if data[col].isnull().any():  # any null values
        data[col] = data[col].fillna(data[col].median())

for col in test_data.columns:
    if test_data[col].isnull().any():  # any null values
        test_data[col] = test_data[col].fillna(data[col].median())

In [72]:
X = data
X_test_data = test_data

In [73]:
y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


In [74]:
X_test_data.head()

Unnamed: 0,LotFrontage,LotArea,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,80.0,11622,6,1961,1961,2,2,3.0,3.0,1.0,...,False,False,False,False,True,False,False,False,True,False
1,81.0,14267,6,1958,1958,2,2,3.0,3.0,1.0,...,False,False,False,False,True,False,False,False,True,False
2,74.0,13830,5,1997,1998,2,2,4.0,3.0,1.0,...,False,False,False,False,True,False,False,False,True,False
3,78.0,9978,6,1998,1998,2,2,3.0,3.0,1.0,...,False,False,False,False,True,False,False,False,True,False
4,43.0,5005,5,1992,1992,3,2,4.0,3.0,1.0,...,False,False,False,False,True,False,False,False,True,False


In [75]:
test_IDS.head()

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465


In [76]:
X.shape

(1460, 393)

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [78]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
Model = []
RMSE = []
R_sq = []
cv = KFold(5)

#Creating a Function to append the cross validation scores of the algorithms
def input_scores(name, model, x, y):
    Model.append(name)
    RMSE.append(np.sqrt((-1) * cross_val_score(model, x, y, cv=cv, 
                                               scoring='neg_mean_squared_error').mean()))
    R_sq.append(cross_val_score(model, x, y, cv=cv, scoring='r2').mean())

In [79]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
                              AdaBoostRegressor)

names = ['Linear Regression', 'Ridge Regression', 'Lasso Regression',
         'K Neighbors Regressor', 'Decision Tree Regressor', 
         'Random Forest Regressor', 'Gradient Boosting Regressor',
         'Adaboost Regressor']
models = [LinearRegression(), Ridge(), Lasso(),
          KNeighborsRegressor(), DecisionTreeRegressor(),
          RandomForestRegressor(), GradientBoostingRegressor(), 
          AdaBoostRegressor()]

#Running all algorithms
for name, model in zip(names, models):
    input_scores(name, model, X, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, wa

In [80]:
estimator = GradientBoostingRegressor()
clf = AdaBoostRegressor(estimator=estimator, random_state=42)

input_scores('ada + gra', clf, X, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [81]:
import xgboost
from xgboost import XGBRegressor

clf = XGBRegressor()
input_scores('xgboost', clf, X, y)

In [82]:
evaluation = pd.DataFrame({'Model': Model,
                           'RMSE': RMSE,
                           'R Squared': R_sq})
print("FOLLOWING ARE THE TRAINING SCORES: ")
evaluation

FOLLOWING ARE THE TRAINING SCORES: 


Unnamed: 0,Model,RMSE,R Squared
0,Linear Regression,239231900.0,-8120004.0
1,Ridge Regression,33444.59,0.8248329
2,Lasso Regression,37103.06,0.7843067
3,K Neighbors Regressor,47475.15,0.6448751
4,Decision Tree Regressor,42863.65,0.6991686
5,Random Forest Regressor,29677.31,0.8605938
6,Gradient Boosting Regressor,28561.58,0.8732946
7,Adaboost Regressor,36935.19,0.7732624
8,ada + gra,27752.51,0.8787343
9,xgboost,30662.23,0.8534442


In [83]:
# Ensure test_IDS is 1d
test_IDS = test_IDS.values.ravel()

In [84]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
import numpy as np

# Base estimator
estimator = GradientBoostingRegressor(random_state=42)

# AdaBoost model
clf = AdaBoostRegressor(estimator=estimator, random_state=42)

# Expanded hyperparameter distribution
param_distributions = {
    'estimator__n_estimators': [50, 100, 150, 200],  
    'estimator__learning_rate': np.linspace(0.01, 0.2, 10), 
    'estimator__max_depth': [3, 5, 7, 10, 15], 
    'n_estimators': [10, 50, 100, 200, 300], 
    'learning_rate': np.linspace(0.5, 2, 10) 
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_distributions,
    n_iter=30,  
    cv=3,  
    scoring='neg_mean_squared_error', 
    verbose=2,
    n_jobs=-1,
    random_state=42 
)

# Fit RandomizedSearchCV
random_search.fit(X, y)

# Output the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Make predictions with the best model
best_model = random_search.best_estimator_
pred = best_model.predict(X_test_data)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1577777777777778, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.1666666666666665, n_estimators=10; total time=  14.7s
[CV] END estimator__learning_rate=0.1577777777777778, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.1666666666666665, n_estimators=10; total time=  14.9s
[CV] END estimator__learning_rate=0.1577777777777778, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.1666666666666665, n_estimators=10; total time=  15.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=1.8333333333333333, n_estimators=10; total time=  18.6s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=1.8333333333333333, n_estimators=10; total time=  21.7s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=15, estimator__n_estimators=100, learning_rate=0.8333333333333333, n_estimators=10; total time=  27.1s
[CV] END estimator__learning_rate=0.01, estimator__max_depth=15, estimator__n_estimators=100, learning_rate=0.8333333333333333, n_estimators=10; total time=  27.3s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=15, estimator__n_estimators=100, learning_rate=0.8333333333333333, n_estimators=10; total time=  28.9s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=1.8333333333333333, n_estimators=10; total time=  19.7s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.0, n_estimators=10; total time=  18.5s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.0, n_estimators=10; total time=  18.0s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.8333333333333333, n_estimators=50; total time=  48.4s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.0, n_estimators=10; total time=  16.5s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.8333333333333333, n_estimators=50; total time=  50.4s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.8333333333333333, n_estimators=50; total time=  51.1s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=7, estimator__n_estimators=100, learning_rate=1.0, n_estimators=50; total time=  18.2s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=7, estimator__n_estimators=100, learning_rate=1.0, n_estimators=50; total time=  19.2s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=7, estimator__n_estimators=100, learning_rate=1.0, n_estimators=50; total time=  23.2s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.09444444444444444, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.3333333333333333, n_estimators=50; total time=  40.2s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.09444444444444444, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.3333333333333333, n_estimators=50; total time=  39.1s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.09444444444444444, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.3333333333333333, n_estimators=50; total time=  41.5s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1788888888888889, estimator__max_depth=5, estimator__n_estimators=150, learning_rate=1.1666666666666665, n_estimators=100; total time= 3.9min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1788888888888889, estimator__max_depth=5, estimator__n_estimators=150, learning_rate=1.1666666666666665, n_estimators=100; total time= 3.9min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.03111111111111111, estimator__max_depth=10, estimator__n_estimators=100, learning_rate=1.6666666666666665, n_estimators=10; total time=  25.6s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.03111111111111111, estimator__max_depth=10, estimator__n_estimators=100, learning_rate=1.6666666666666665, n_estimators=10; total time=  23.9s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.03111111111111111, estimator__max_depth=10, estimator__n_estimators=100, learning_rate=1.6666666666666665, n_estimators=10; total time=  24.9s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1788888888888889, estimator__max_depth=5, estimator__n_estimators=150, learning_rate=1.1666666666666665, n_estimators=100; total time= 4.2min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=10, estimator__n_estimators=150, learning_rate=1.5, n_estimators=300; total time=12.9min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=7, estimator__n_estimators=200, learning_rate=1.3333333333333333, n_estimators=200; total time=12.0min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=7, estimator__n_estimators=200, learning_rate=1.3333333333333333, n_estimators=200; total time=12.6min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=7, estimator__n_estimators=200, learning_rate=1.3333333333333333, n_estimators=200; total time=12.3min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=10, estimator__n_estimators=150, learning_rate=1.5, n_estimators=300; total time=14.3min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=10, estimator__n_estimators=150, learning_rate=1.5, n_estimators=300; total time=14.4min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.5, n_estimators=200; total time= 3.2min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.5, n_estimators=200; total time= 3.2min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=15, estimator__n_estimators=150, learning_rate=2.0, n_estimators=300; total time= 4.9min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=15, estimator__n_estimators=150, learning_rate=2.0, n_estimators=300; total time= 4.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=15, estimator__n_estimators=150, learning_rate=2.0, n_estimators=300; total time= 4.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=3, estimator__n_estimators=100, learning_rate=0.5, n_estimators=50; total time= 1.2min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=5, estimator__n_estimators=150, learning_rate=0.8333333333333333, n_estimators=50; total time= 2.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.5, n_estimators=200; total time= 3.0min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=5, estimator__n_estimators=150, learning_rate=0.8333333333333333, n_estimators=50; total time= 2.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.1666666666666665, n_estimators=10; total time=  14.1s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=3, estimator__n_estimators=100, learning_rate=0.5, n_estimators=50; total time= 1.2min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.1666666666666665, n_estimators=10; total time=  13.3s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=7, estimator__n_estimators=50, learning_rate=1.1666666666666665, n_estimators=10; total time=  13.2s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=5, estimator__n_estimators=150, learning_rate=0.8333333333333333, n_estimators=50; total time= 2.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=10, estimator__n_estimators=200, learning_rate=1.0, n_estimators=300; total time=14.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.11555555555555555, estimator__max_depth=3, estimator__n_estimators=100, learning_rate=0.5, n_estimators=50; total time= 1.2min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.09444444444444444, estimator__max_depth=7, estimator__n_estimators=200, learning_rate=0.6666666666666666, n_estimators=10; total time= 1.0min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.09444444444444444, estimator__max_depth=7, estimator__n_estimators=200, learning_rate=0.6666666666666666, n_estimators=10; total time= 1.0min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.09444444444444444, estimator__max_depth=7, estimator__n_estimators=200, learning_rate=0.6666666666666666, n_estimators=10; total time= 1.0min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.07333333333333333, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=1.6666666666666665, n_estimators=10; total time=  30.7s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.07333333333333333, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=1.6666666666666665, n_estimators=10; total time=  38.8s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.07333333333333333, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=1.6666666666666665, n_estimators=10; total time=  40.1s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=2.0, n_estimators=50; total time= 1.3min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=2.0, n_estimators=50; total time= 1.4min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1366666666666667, estimator__max_depth=5, estimator__n_estimators=200, learning_rate=2.0, n_estimators=50; total time= 1.2min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=10, estimator__n_estimators=200, learning_rate=1.0, n_estimators=300; total time=15.5min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1788888888888889, estimator__max_depth=10, estimator__n_estimators=100, learning_rate=1.0, n_estimators=50; total time= 3.0min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1788888888888889, estimator__max_depth=10, estimator__n_estimators=100, learning_rate=1.0, n_estimators=50; total time= 3.0min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1788888888888889, estimator__max_depth=10, estimator__n_estimators=100, learning_rate=1.0, n_estimators=50; total time= 3.0min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=10, estimator__n_estimators=200, learning_rate=1.0, n_estimators=300; total time=16.7min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.03111111111111111, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.5, n_estimators=300; total time=11.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.03111111111111111, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.5, n_estimators=300; total time=11.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.03111111111111111, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.5, n_estimators=300; total time=11.9min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1577777777777778, estimator__max_depth=15, estimator__n_estimators=50, learning_rate=2.0, n_estimators=300; total time= 1.2min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1577777777777778, estimator__max_depth=15, estimator__n_estimators=50, learning_rate=2.0, n_estimators=300; total time= 1.3min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=3, estimator__n_estimators=50, learning_rate=2.0, n_estimators=100; total time=  48.5s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.2, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.1666666666666665, n_estimators=300; total time=10.4min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.1577777777777778, estimator__max_depth=15, estimator__n_estimators=50, learning_rate=2.0, n_estimators=300; total time= 1.6min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=3, estimator__n_estimators=50, learning_rate=2.0, n_estimators=100; total time=  54.8s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=3, estimator__n_estimators=50, learning_rate=2.0, n_estimators=100; total time=  56.2s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=10, estimator__n_estimators=150, learning_rate=2.0, n_estimators=100; total time= 1.9min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=10, estimator__n_estimators=150, learning_rate=2.0, n_estimators=100; total time= 1.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.2, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.1666666666666665, n_estimators=300; total time=10.9min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.2, estimator__max_depth=5, estimator__n_estimators=100, learning_rate=1.1666666666666665, n_estimators=300; total time=10.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=10, estimator__n_estimators=150, learning_rate=2.0, n_estimators=100; total time= 2.1min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.2, estimator__max_depth=15, estimator__n_estimators=200, learning_rate=2.0, n_estimators=200; total time= 2.8min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.2, estimator__max_depth=15, estimator__n_estimators=200, learning_rate=2.0, n_estimators=200; total time= 3.1min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.6666666666666665, n_estimators=300; total time=  18.3s


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.2, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.0, n_estimators=100; total time= 4.4min


  y = column_or_1d(y, warn=True)


[CV] END estimator__learning_rate=0.01, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.6666666666666665, n_estimators=300; total time=  19.7s
[CV] END estimator__learning_rate=0.2, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.0, n_estimators=100; total time= 4.4min
[CV] END estimator__learning_rate=0.2, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.0, n_estimators=100; total time= 4.2min
[CV] END estimator__learning_rate=0.01, estimator__max_depth=3, estimator__n_estimators=200, learning_rate=1.6666666666666665, n_estimators=300; total time=  16.8s
[CV] END estimator__learning_rate=0.2, estimator__max_depth=15, estimator__n_estimators=200, learning_rate=2.0, n_estimators=200; total time= 1.6min
[CV] END estimator__learning_rate=0.052222222222222225, estimator__max_depth=7, estimator__n_estimators=200, learning_rate=0.6666666666666666, n_estimators=200; total time=16.7min
[CV] END estimator__learning_rate=0.05222222222

  y = column_or_1d(y, warn=True)


Best Parameters: {'n_estimators': 100, 'learning_rate': 1.0, 'estimator__n_estimators': 200, 'estimator__max_depth': 3, 'estimator__learning_rate': 0.2}
Best Score: -719369770.2401333


In [85]:
result = pd.DataFrame({'Id': test_IDS, 'SalePrice': pred})
result.to_csv('submission_3.csv', index=False)

In [86]:
import xgboost
from xgboost import XGBRegressor

model = XGBRegressor()

param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [1, 2, 3, 4],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1]
}

# Set up RandomizedSearchCV with 5-fold cross-validation
random_search = RandomizedSearchCV(estimator=model, 
                                   param_distributions=param_dist, 
                                   n_iter=100,  # Number of random parameter combinations to test
                                   scoring='neg_mean_squared_error', 
                                   cv=5, 
                                   verbose=1, 
                                   random_state=42, 
                                   n_jobs=-1)

# Fit RandomizedSearchCV on the training data
random_search.fit(X, y)

# Output the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Make predictions with the best model
best_model_2 = random_search.best_estimator_
pred_2 = best_model_2.predict(X_test_data)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
Best Score: -691290427.73334


In [87]:
result = pd.DataFrame({'Id': test_IDS, 'SalePrice': pred_2})
result.to_csv('submission_5.csv', index=False)