In [111]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex

# Read the training and test sets from the working directory in one pass.
house, house_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Peek at the first rows of the training data.
house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Set1 palette; `colors` is reused by the plotting cells further down.
colors = plt.cm.Set1.colors
color_hex = to_hex(colors[3])

# Summary statistics, one row per numeric column: a bar overlay on the mean
# and a colour gradient on std / median / max.
summary_style = house.describe().T.style
summary_style = summary_style.bar(subset=['mean'], color=color_hex)
summary_style.background_gradient(subset=['std', '50%', 'max'])

In [None]:
color_hex1 = to_hex(colors[7])

# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the documented equivalent (cut=3 matches distplot's KDE extent).
sns.histplot(house['SalePrice'], kde=True, stat='density',
             kde_kws={'cut': 3}, color=color_hex1)

# Dashed vertical line marks the mean sale price.
plt.axvline(x=house['SalePrice'].mean(), color=color_hex1, linestyle='--', linewidth=2)
plt.title('Sales');

<div style="font-family: 'Comic Sans MS'; font-size: 18px; color:Blue;">
Missing values
</div>

In [None]:
color_hex2 = to_hex(colors[4])

# Count nulls per column, keep only columns that have at least one gap,
# and order them from most to least missing.
null_counts = house.isnull().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)

missing.plot.bar(color=color_hex2)
plt.title('Missing Data');

<div style="font-family: 'Comic Sans MS'; font-size: 18px; color:Blue;">
‣ We can see that some columns have more than 50% missing values, which is not good for training a model.<br> ‣ I will drop those columns; for columns with less than 50% missing values, we will try to fill them in.
</div>

In [None]:
# Keep the target aside before it is removed from the feature frame.
y = house['SalePrice']

# Columns removed from both frames: the row identifier plus the columns
# chosen for removal after the missing-value review.
columns_to_drop = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']
house = house.drop(columns=columns_to_drop + ['SalePrice'])
house_test = house_test.drop(columns=columns_to_drop)

In [None]:
# Partition the columns in a single pass: float64/int64 columns are treated as
# numeric, everything else as categorical. Column order is preserved.
numeric_dtypes = ['float64', 'int64']
num_cols, cat_cols = [], []
for column_name in house.columns:
    if house[column_name].dtype in numeric_dtypes:
        num_cols.append(column_name)
    else:
        cat_cols.append(column_name)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Learn the imputation values (column means, the SimpleImputer default) and the
# min/max ranges from the TRAINING data only, then apply those same fitted
# transformers to the test set. Re-fitting on the test set, as before, maps
# train and test onto different scales, so a model trained on one cannot be
# applied consistently to the other.
imputer = SimpleImputer().fit(house[num_cols])
house[num_cols] = imputer.transform(house[num_cols])
house_test[num_cols] = imputer.transform(house_test[num_cols])

scaler = MinMaxScaler().fit(house[num_cols])
house[num_cols] = scaler.transform(house[num_cols])
# Test values outside the training range fall outside [0, 1]; that is expected.
house_test[num_cols] = scaler.transform(house_test[num_cols])

In [None]:
# Categorical columns whose missing entries are replaced by the literal
# category 'none' in both the training and the test frame.
columns_None = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'GarageType', 'GarageFinish', 'GarageQual', 'FireplaceQu', 'GarageCond',
    'MasVnrType', 'Electrical', 'MSZoning', 'Utilities', 'Functional',
    'Exterior2nd', 'KitchenQual', 'Exterior1st', 'SaleType',
]
for frame in (house, house_test):
    frame[columns_None] = frame[columns_None].fillna('none')

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Build a dense-output one-hot encoder. scikit-learn 1.2 renamed `sparse` to
# `sparse_output` (and later removed the old name), so try the new keyword
# first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit on the training categoricals only; categories unseen here are encoded
# as all-zero rows at transform time (handle_unknown='ignore').
encoder.fit(house[cat_cols])

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out(cat_cols))
else:
    encoded_cols = list(encoder.get_feature_names(cat_cols))

house[encoded_cols] = encoder.transform(house[cat_cols])


In [None]:
# Reuse the encoder fitted on the training data so the test frame gets
# exactly the same one-hot columns.
test_one_hot = encoder.transform(house_test[cat_cols])
house_test[encoded_cols] = test_one_hot

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 25% of the labelled data for evaluation; the fixed seed keeps the
# split identical across runs.
feature_columns = num_cols + encoded_cols
X_train, X_test, y_train, y_test = train_test_split(
    house[feature_columns], y, test_size=0.25, random_state=42
)

In [None]:
!pip install xgboost
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Candidate regressors, all with default hyper-parameters. Estimators that use
# randomness are seeded so the comparison is reproducible from run to run.
models = {
    'SVR': SVR(),
    'XGBRegressor': XGBRegressor(random_state=42),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'SGDRegressor': SGDRegressor(random_state=42),
    'BayesianRidge': BayesianRidge(),
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
}

In [None]:
# Fit every candidate on the training split and record its hold-out RMSE.
model_results = []
model_names = []

for name, model in models.items():
    model.fit(X_train, y_train)  # fit() returns the estimator itself; no alias needed
    predicted = model.predict(X_test)
    score = np.sqrt(mean_squared_error(y_test, predicted))
    model_results.append(score)
    model_names.append(name)

# Build the summary once, after the loop, from named columns so that RMSE
# stays a float column (the transposed list-of-lists produced object dtype).
df_results = (
    pd.DataFrame({'Model': model_names, 'RMSE': model_results})
    .sort_values(by='RMSE', ascending=False)
)

# Bare last expression -> rich table rendering in the notebook.
df_results

<div style="font-family: 'Comic Sans MS'; font-size: 18px; color:Blue;">
‣ We tried multiple models; the best one, with the lowest RMSE, was "XGB Regressor".
    <br>‣ Now the question arises: how can we reduce the error further? We can reduce it with hyperparameter tuning, but how do we decide on, or guess, the best parameters?
    <br>‣ Let's do it in our next step.
</div>

In [None]:
!pip install optuna
import optuna

inputs_df = house.drop('SalePrice', axis=1)  
targets = house['SalePrice']

def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_df[num_cols + encoded_cols], targets, test_size=0.25, random_state=42)

    param = {
        'tree_method': 'gpu_hist',  # Use the GPU to speed up training (if available)
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.009, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': trial.suggest_categorical('n_estimators', [1000, 2000, 3000, 4000]),
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17, 20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    
    model = XGBRegressor(**param)
    
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
    preds_valid = model.predict(X_test)
    rmse = mean_squared_error(y_test, preds_valid, squared=False)
    return rmse

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

study.best_params



In [None]:
# Hyper-parameters hard-coded for the tuned model below (same keys as the
# search space of the Optuna study).
best_params = {
    # regularisation
    'lambda': 3.559040735218393,
    'alpha': 0.25772549522868987,
    # row / column sampling
    'colsample_bytree': 0.6,
    'subsample': 0.5,
    # boosting schedule
    'learning_rate': 0.014,
    'n_estimators': 4000,
    # tree shape
    'max_depth': 11,
    'random_state': 24,
    'min_child_weight': 3,
}

In [None]:
# Refit XGBoost with the tuned hyper-parameters and score it on the hold-out split.
XGBR = XGBRegressor(**best_params)

XGBR.fit(X_train, y_train)
predicted = XGBR.predict(X_test)
# RMSE computed as sqrt(MSE), matching the model-comparison cell; the
# `squared=False` keyword is no longer accepted by current scikit-learn.
print(f'Root Mean Square Error test = {np.sqrt(mean_squared_error(y_test, predicted))}')

In [None]:
# These are the parameters I am using for my submission, chosen after multiple tries.
XGBR = XGBRegressor(booster='gbtree',
                    colsample_bylevel=1,
                    colsample_bynode=1,
                    colsample_bytree=0.6,
                    gamma=0,
                    importance_type='gain',
                    learning_rate=0.01,
                    max_delta_step=0,
                    max_depth=4,
                    min_child_weight=1.5,
                    n_estimators=2400,
                    n_jobs=1,
                    # 'reg:linear' is the deprecated name of this same objective
                    objective='reg:squarederror',
                    reg_alpha=0.6,
                    reg_lambda=0.6,
                    scale_pos_weight=1,
                    subsample=0.8,
                    # `silent=None` and `nthread=None` were dropped: both are
                    # obsolete arguments superseded by `verbosity` / `n_jobs`.
                    verbosity=1)

XGBR.fit(X_train, y_train)
predicted = XGBR.predict(X_test)
# The original f-string was missing its closing quote (SyntaxError). RMSE is
# computed as sqrt(MSE) because `squared=False` was removed from scikit-learn.
print(f'Root Mean Square Error test = {np.sqrt(mean_squared_error(y_test, predicted))}')


In [None]:
# Score the competition test set on the same feature columns, in the same
# order, that the model was trained on.
test_features = house_test[num_cols + encoded_cols]
test_preds = XGBR.predict(test_features)

In [None]:
# Load the submission template, fill its SalePrice column with the model's
# predictions and write the file without the index column.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
submission_df = pd.read_csv("sample_submission.csv").assign(SalePrice=test_preds)
submission_df.to_csv("my_submission.csv", index=False)
