In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
test = pd.read_csv("/kaggle/input/house-prices/test.csv")
train = pd.read_csv("/kaggle/input/house-prices/train.csv")

In [None]:
# Stack the train and test frames so that every preprocessing step is applied
# to both at once. DataFrame.append was removed in pandas 2.0; pd.concat is
# the supported replacement, and ignore_index=True renumbers the rows 0..n-1
# exactly like the former append(...).reset_index(drop=True).
df = pd.concat([train, test], ignore_index=True)

In [None]:
from helpers import *

In [None]:
df.head()

## EDA

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
cat_cols

In [None]:
check_df(df)

In [None]:
for col in cat_cols:
    cat_summary(df,col)

In [None]:
for col in num_cols:
    num_summary(df,col)

In [None]:
for col in cat_cols:
    target_summary_with_cat(df, "SalePrice", col)

In [None]:
# Call the helper once per numeric column, with SalePrice as the target.
for col in num_cols:
    target_summary_with_num(df, "SalePrice", col)

In [None]:
high_correlated_cols(df, plot=False, corr_th=0.80)

In [None]:
df.Fence.value_counts()

In [None]:
def find_correlation(dataframe, numeric_cols, corr_limit=0.60, target="SalePrice"):
    """Split columns by how strongly they correlate with a target column.

    For every name in ``numeric_cols`` other than ``target`` the pairwise
    correlation with ``target`` is computed with ``DataFrame.corr``, printed,
    and recorded as the string ``"<column>: <correlation>"``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``target`` and every column in ``numeric_cols``.
    numeric_cols : iterable of str
        Column names to compare against ``target``.
    corr_limit : float, default 0.60
        A column is reported as highly correlated when the absolute value of
        its correlation is strictly greater than this limit.
    target : str, default "SalePrice"
        Name of the column every other column is correlated with.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(low_correlations, high_correlations)`` -- note that the low list
        comes first. A NaN correlation (e.g. for a constant column) fails the
        ``>`` comparison and therefore lands in the low list.
    """
    high_correlations = []
    low_correlations = []
    for col in numeric_cols:
        # The target is not compared with itself.
        if col == target:
            continue
        correlation = dataframe[[col, target]].corr().loc[col, target]
        print(col, correlation)
        entry = col + ": " + str(correlation)
        if abs(correlation) > corr_limit:
            high_correlations.append(entry)
        else:
            low_correlations.append(entry)
    return low_correlations, high_correlations



In [None]:
low_corrs, high_corrs = find_correlation(df, num_cols)

### PreProcessing

### Feature Engineering

In [None]:
df.OverallCond.value_counts()

In [None]:
df.columns.sort_values()

In [None]:
# Binary "has feature" flags: 1 when the underlying measurement is positive,
# 0 otherwise. NaN compares False against 0, so missing values map to 0.
df['new_haspool'] = np.where(df['PoolArea'] > 0, 1, 0)
df['new_has2ndfloor'] = np.where(df['2ndFlrSF'] > 0, 1, 0)
df['new_hasbsmt'] = np.where(df['TotalBsmtSF'] > 0, 1, 0)
df['new_hasfireplace'] = np.where(df['Fireplaces'] > 0, 1, 0)

# `df['Fence'] != None` is evaluated element-wise and is True for every row
# (NaN included), which made this flag a constant 1. notna() marks only the
# rows that actually carry a Fence value.
df['new_hasfence'] = np.where(df['Fence'].notna(), 1, 0)
df['new_haswooddeck'] = np.where(df['WoodDeckSF'] > 0, 1, 0)
df['new_hasOpenPorch'] = np.where(df['OpenPorchSF'] > 0, 1, 0)

# Flags for one specific category value of a categorical column.
df['new_hasgarage'] = np.where(df['GarageArea'] > 0, 1, 0)
df['new_salecond'] = np.where(df['SaleCondition'] == "Normal", 1, 0)
df['new_func'] = np.where(df['Functional'] == "Typ", 1, 0)
df['new_garagefin'] = np.where(df['GarageFinish'] == "Fin", 1, 0)

In [None]:
# Numeric features derived from the raw columns.

# Half baths (above ground and basement) count as 0.5 each.
df['new_totalbath'] = (
    df['FullBath']
    + df['HalfBath'] * 0.5
    + df['BsmtFullBath']
    + df['BsmtHalfBath'] * 0.5
)
# Age relative to the fixed reference year 2021 (2021 - YearBuilt).
df['new_houseage'] = df['YearBuilt'].rsub(2021)
df['new_overallquality'] = df['OverallQual'] + df['OverallCond']
# skipna=False keeps NaN propagation identical to chained `+`.
df['new_totalSF'] = df[['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF', 'GarageArea']].sum(axis=1, skipna=False)
# 1 when the lot is larger than the first-floor footprint.
df['new_hasgarden'] = np.where((df['LotArea'] - df['1stFlrSF']).gt(0), 1, 0)
df['new_totalporchSF'] = df[['OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', '3SsnPorch']].sum(axis=1, skipna=False)

In [None]:
# A GarageYrBlt of 2207 is replaced with 2005 (presumably a data-entry typo
# for 2007 or similar -- the replacement value is kept as originally chosen).
df.loc[df["GarageYrBlt"] == 2207, "GarageYrBlt"] = 2005
df["new_area"] = df["GrLivArea"] + df["GarageArea"]

# 0 when the house was never remodelled (YearBuilt == YearRemodAdd), 1 otherwise.
# The previous two-step .loc assignment overwrote matching rows with 0 and then
# compared that 0 against YearRemodAdd again, so every row ended up as 1.
df["new_home"] = np.where(df["YearBuilt"] == df["YearRemodAdd"], 0, 1)

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
check_df(df)

### Rare Encoding

In [None]:
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
df = rare_encoder(df, 0.01)

In [None]:
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
# A categorical column is "useless" when it has a single distinct value, or
# when it is binary and one of its two classes covers at most 1% of all rows.
useless_cols = [
    col
    for col in cat_cols
    if df[col].nunique() == 1
    or (
        df[col].nunique() == 2
        and df[col].value_counts().div(len(df)).le(0.01).any(axis=None)
    )
]

In [None]:
useless_cols

In [None]:
cat_cols = [col for col in cat_cols if col not in useless_cols]

In [None]:
# Remove all flagged columns from df in place with a single call.
df.drop(columns=useless_cols, inplace=True)

In [None]:
df.shape

In [None]:
rare_analyser(df, "SalePrice", cat_cols)

### One-Hot Encoding

In [None]:
cat_cols = cat_cols + cat_but_car

In [None]:
df = one_hot_encoder(df, cat_cols, drop_first=True)

In [None]:
# NOTE(review): cat_cols still holds the pre-encoding column names at this
# point. If one_hot_encoder replaces those columns with dummy columns (as
# pandas.get_dummies does), this call would look up columns that no longer
# exist -- confirm against the helpers module.
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
# Same filter as before, now over every column of the encoded frame: a column
# is flagged when it has a single distinct value, or when it is binary and one
# of its two classes covers at most 1% of all rows.
useless_cols = [
    col
    for col in df.columns
    if df[col].nunique() == 1
    or (
        df[col].nunique() == 2
        and df[col].value_counts().div(len(df)).le(0.01).any(axis=None)
    )
]

In [None]:
useless_cols

In [None]:
len(useless_cols)

In [None]:
# Remove all flagged columns from df in place with a single call.
df.drop(columns=useless_cols, inplace=True)

In [None]:
df.shape

### Missing Values

In [None]:
missing_values_table(df, na_name=True)

In [None]:
# Numeric columns that still contain at least one missing value. Any column
# whose name contains "SalePrice" is excluded: the target is missing for the
# test rows by construction.
na_cols = [
    col
    for col in num_cols
    if df[col].isna().any() and "SalePrice" not in col
]
na_cols

In [None]:
# Columns with missing values are dropped (not imputed), in place, in one call.
df.drop(columns=na_cols, inplace=True)

### Modeling

In [None]:
# Rows with a SalePrice form the training frame; rows without one form the
# test frame, which does not keep the (all-NaN) target column.
train_df = df.loc[df['SalePrice'].notna()]
test_df = df.loc[df['SalePrice'].isna()].drop(columns="SalePrice")

In [None]:
# Features: everything except the row identifier and the target.
X = train_df.drop(columns=["Id", "SalePrice"])
# Target: log1p of the sale price (predictions are mapped back with expm1).
y = np.log1p(train_df['SalePrice'])

In [None]:
# Baseline comparison: every regressor with default hyperparameters, scored by
# 5-fold cross-validated RMSE on the log1p target.
# Estimators that use randomness are seeded with 46 (the same seed the tuned
# models further down use) so that the printed scores are reproducible.
models = [('LR', LinearRegression()),
          ("Ridge", Ridge()),
          ("Lasso", Lasso()),
          ("ElasticNet", ElasticNet()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor(random_state=46)),
          ('RF', RandomForestRegressor(random_state=46)),
          ('SVR', SVR()),
          ('GBM', GradientBoostingRegressor(random_state=46)),
          ("XGBoost", XGBRegressor(objective='reg:squarederror', random_state=46)),
          ("LightGBM", LGBMRegressor(random_state=46)),
          ("CatBoost", CatBoostRegressor(verbose=False, random_state=46))]

for name, regressor in models:
    # cross_val_score returns negated MSE per fold; negate, take the root per
    # fold, then average the per-fold RMSEs.
    rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_squared_error")))
    print(f"RMSE: {round(rmse, 4)} ({name}) ")

### Hyperparameter Optimization

In [None]:
# CatBoost with default settings: 5-fold CV RMSE, kept in `rmse`.
cat_model = CatBoostRegressor()
rmse = np.sqrt(
    -cross_val_score(cat_model, X, y, cv=5, scoring="neg_mean_squared_error")
).mean()

# Only the learning rate is searched.
cat_params = {"learning_rate": [0.01, 0.002, 0.1]}

# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method rather than by RMSE.
cat_gs_best = GridSearchCV(
    estimator=cat_model,
    param_grid=cat_params,
    cv=3,
    n_jobs=-1,
    verbose=True,
)
cat_gs_best.fit(X, y)

In [None]:
final_modelc = cat_model.set_params(**cat_gs_best.best_params_).fit(X, y)

In [None]:
rmse = np.mean(np.sqrt(-cross_val_score(final_modelc, X, y, cv=5, scoring="neg_mean_squared_error")))
rmse

In [None]:
rmse

### Lgbm

In [None]:
# LightGBM with default settings: 5-fold CV RMSE, kept in `rmse`.
lgbm_model = LGBMRegressor()
rmse = np.sqrt(
    -cross_val_score(lgbm_model, X, y, cv=5, scoring="neg_mean_squared_error")
).mean()

# 3 x 4 x 3 x 3 = 108 parameter combinations, each fitted on 3 folds.
lgbm_params = {
    "learning_rate": [0.01, 0.002, 0.1],
    "n_estimators": [100, 500, 1500, 2500],
    "colsample_bytree": [0.5, 0.7, 1],
    "max_bin": [255, 500, 600],
}

# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method rather than by RMSE.
lgbm_gs_best = GridSearchCV(
    estimator=lgbm_model,
    param_grid=lgbm_params,
    cv=3,
    n_jobs=-1,
    verbose=True,
)
lgbm_gs_best.fit(X, y)


In [None]:
final_model = lgbm_model.set_params(**lgbm_gs_best.best_params_).fit(X, y)

In [None]:
rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=5, scoring="neg_mean_squared_error")))
rmse

### Feature Selection

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose ``columns`` name the features, in the same order as
        ``model.feature_importances_``.
    num : int or None, default None
        Number of most important features to plot; ``None`` plots all of them.
        (The previous default, ``len(X)``, was bound once at definition time
        to the row count of a global frame.)
    save : bool, default False
        When True, the figure is also written to ``importances.png``.

    Notes
    -----
    ``plt`` and ``sns`` are not imported in this notebook's visible import
    cells; presumably they arrive through ``from helpers import *`` -- verify.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(20, 20))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()
    # Save before show(): once show() returns the figure is typically released,
    # so saving afterwards writes an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()

In [None]:
plot_importance(final_model, X, 70)

In [None]:
#plot_importance(final_modelc, X, 70)

In [None]:
X.shape

#### Catboost

In [None]:
feature_imp = pd.DataFrame({'Value': final_modelc.feature_importances_, 'Feature': X.columns}).sort_values(by="Value", ascending=False)
feature_imp.tail(37)

In [None]:
# Despite the name, the cut-off is an importance value below 1, not exactly 0.
zero_imp_cols = feature_imp.loc[feature_imp["Value"] < 1, "Feature"].values

# Keep the remaining features, in X's original column order.
selected_cols = X.columns[~X.columns.isin(zero_imp_cols)].tolist()
len(selected_cols)

In [None]:
# Re-tune CatBoost on the features kept by the importance filter. The search
# previously ran on the full X and final_modelc was never refit, so the
# feature selection had no effect on the CatBoost model; this now mirrors the
# LightGBM section below.
cat_model = CatBoostRegressor(random_state=46)
rmse = np.mean(np.sqrt(-cross_val_score(cat_model,
                                        X[selected_cols], y, cv=5, scoring="neg_mean_squared_error")))

cat_params = {"learning_rate": [0.01, 0.002, 0.1]}

cat_gs_best = GridSearchCV(cat_model,
                            cat_params,
                            cv=3,
                            n_jobs=-1,
                            verbose=True).fit(X[selected_cols], y)

# Refit on the selected features with the best parameters found.
final_modelc = cat_model.set_params(**cat_gs_best.best_params_).fit(X[selected_cols], y)

In [None]:
# Score CatBoost on the selected features only (cross_val_score refits a clone
# of the estimator per fold), consistent with the LightGBM evaluation below.
# The previous call used the full X, so it did not measure the selection.
rmse = np.mean(np.sqrt(-cross_val_score(final_modelc, X[selected_cols], y, cv=5, scoring="neg_mean_squared_error")))
rmse

#### lgbm

In [None]:
feature_imp = pd.DataFrame({'Value': final_model.feature_importances_, 'Feature': X.columns}).sort_values(by="Value", ascending=False)
feature_imp.tail(37)

In [None]:
# Despite the name, the cut-off is an importance value below 1, not exactly 0.
zero_imp_cols = feature_imp.loc[feature_imp["Value"] < 1, "Feature"].values

# Keep the remaining features, in X's original column order.
selected_cols = X.columns[~X.columns.isin(zero_imp_cols)].tolist()
len(selected_cols)

In [None]:
# Final LightGBM tuning, restricted to the selected features.
lgbm_model = LGBMRegressor(random_state=46)

lgbm_params = {
    "learning_rate": [0.01, 0.005],
    "n_estimators": [15000, 20000],
    "colsample_bytree": [0.5, 0.3],
}

lgbm_gs_best = GridSearchCV(
    estimator=lgbm_model,
    param_grid=lgbm_params,
    cv=3,
    n_jobs=-1,
    verbose=True,
)
lgbm_gs_best.fit(X[selected_cols], y)

# set_params and fit both return the estimator itself, so final_model is the
# same object as lgbm_model, refit with the best parameters found.
lgbm_model.set_params(**lgbm_gs_best.best_params_)
final_model = lgbm_model.fit(X[selected_cols], y)

In [None]:
rmse = np.mean(np.sqrt(-cross_val_score(final_model, X[selected_cols], y, cv=5, scoring="neg_mean_squared_error")))
rmse

In [None]:
# Predict the test rows with the final LightGBM model; expm1 undoes the log1p
# that was applied to the training target.
y_pred_sub = np.expm1(final_model.predict(test_df[selected_cols]))

# The Id series supplies the frame's index; predictions are aligned by position.
submission_df = pd.DataFrame({
    'Id': test_df["Id"].astype("Int32"),
    'SalePrice': y_pred_sub,
})

submission_df.to_csv('submission.csv', index=False)
print(rmse)