In [1]:
import pandas as pd

In [6]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [7]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
df_train.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [9]:
df_test.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [10]:
combined_data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
combined_encoded = pd.get_dummies(combined_data)

X_train_encoded = combined_encoded.iloc[:len(df_train)]
X_test_encoded = combined_encoded.iloc[len(df_train):]

# Fill NaN values with mean for both X_train_encoded and X_test_encoded
X_train_encoded = X_train_encoded.fillna(X_train_encoded.mean())
X_test_encoded = X_test_encoded.fillna(X_train_encoded.mean())  # Filling with mean of X_train_encoded

# Now continue with the rest of the code
y_train = df_train['SalePrice'].values

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Define the hyperparameter grid for GridSearchCV for each regressor
param_grid_gbr = {
    'max_depth': range(1, 6),
    'n_estimators': range(50, 101, 10),
    'max_features': ['sqrt'],
    'random_state': [42],
    'warm_start': [True]
}

param_grid_rfr = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt'],
    'random_state': [42]
}

param_grid_lr = {
    'fit_intercept': [True, False],
    'copy_X': [True, False]
}

# Initialize GridSearchCV for each regressor
grid_gbr = GridSearchCV(
    GradientBoostingRegressor(),
    param_grid=param_grid_gbr,
    n_jobs=-1,
    cv=2,
    verbose=1
)

grid_rfr = GridSearchCV(
    RandomForestRegressor(),
    param_grid=param_grid_rfr,
    n_jobs=-1,
    cv=2,
    verbose=1
)

grid_lr = GridSearchCV(
    LinearRegression(),
    param_grid=param_grid_lr,
    n_jobs=-1,
    cv=2,
    verbose=1
)

# Fit each regressor on the training data
grid_gbr.fit(X_train_scaled, y_train)
grid_rfr.fit(X_train_scaled, y_train)
grid_lr.fit(X_train_scaled, y_train)

# Find the best regressor with the highest cross-validated score
best_regressor = None
if grid_gbr.best_score_ >= grid_rfr.best_score_ and grid_gbr.best_score_ >= grid_lr.best_score_:
    best_regressor = grid_gbr
elif grid_rfr.best_score_ >= grid_gbr.best_score_ and grid_rfr.best_score_ >= grid_lr.best_score_:
    best_regressor = grid_rfr
else:
    best_regressor = grid_lr

Fitting 2 folds for each of 30 candidates, totalling 60 fits
Fitting 2 folds for each of 9 candidates, totalling 18 fits
Fitting 2 folds for each of 4 candidates, totalling 8 fits


In [12]:
best_regressor

In [13]:
y_pred = best_regressor.predict(X_test_scaled)

# Get the Id values from the original test dataset
test_ids = df_test['Id']

# Create a DataFrame with Id and SalePrice columns
final_df = pd.DataFrame({'Id': test_ids, 'SalePrice': y_pred})

In [14]:
final_df.head

<bound method NDFrame.head of         Id     SalePrice
0     1461  180921.19589
1     1462  180921.19589
2     1463  180921.19589
3     1464  180921.19589
4     1465  180921.19589
...    ...           ...
1454  2915  180921.19589
1455  2916  180921.19589
1456  2917  180921.19589
1457  2918  180921.19589
1458  2919  180921.19589

[1459 rows x 2 columns]>