# Prerequisites

In [None]:
!pip install opendatasets pandas-profiling optuna catboost --quiet --upgrade

In [None]:
import os
import optuna
import logging
import catboost
import pandas as pd
import opendatasets as od

from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
from ydata_profiling import ProfileReport
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import warnings
# silence every Python warning for the rest of the session
warnings.filterwarnings(action='ignore')

# only let optuna messages of level WARNING or higher through
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Kaggle competition page the archive is downloaded (and extracted) from
data_url = 'https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data'
od.download(data_url)

# work from inside the extracted folder so the CSV files can be opened by bare file name
os.chdir('house-prices-advanced-regression-techniques')

# forming datasets
raw_train_dataset, raw_test_dataset = pd.read_csv('train.csv'), pd.read_csv('test.csv')

Downloading house-prices-advanced-regression-techniques.zip to ./house-prices-advanced-regression-techniques


100%|██████████| 199k/199k [00:00<00:00, 42.7MB/s]


Extracting archive ./house-prices-advanced-regression-techniques/house-prices-advanced-regression-techniques.zip to ./house-prices-advanced-regression-techniques





# Preprocessing

## Data Visualization

Select only the columns in which at least 50% of the values are non-null.

In [None]:
# share of missing values per column, in percent, rounded to two decimals
missing_percentage = (raw_train_dataset.isna().sum() * 100 / len(raw_train_dataset)).round(2)

# keep the columns whose missing share is at most 50%
usable_columns = raw_train_dataset.columns[(missing_percentage <= 50).to_numpy()]

Using `usable_columns` list for creating the new dataset.

In [None]:
# .copy() gives each frame its own data: columns of these frames are reassigned further down
# the notebook, and writing into a selection of the raw frames is pandas' chained-assignment pitfall
train_dataset = raw_train_dataset[usable_columns].copy()

# the test set has no target column, so SalePrice is dropped by name instead of assuming it is
# the last entry of usable_columns
test_dataset = raw_test_dataset[usable_columns.drop('SalePrice', errors='ignore')].copy()

Generate a profile report to understand the data and choose the most suitable imputation technique.

In [None]:
#data_profile_report= ProfileReport(train_dataset)
#data_profile_report.to_file('data_profile_report.html')

Conclusion:
- best numerical imputation technique: median
- best categorical imputation technique: most_frequent

## Column Segregation

In [None]:
# every non-object column counts as numerical; the list is built separately for each frame
train_numerical_columns = train_dataset.select_dtypes(exclude='object').columns
test_numerical_columns = test_dataset.select_dtypes(exclude='object').columns

# object columns are the categorical ones; the list derived from the train frame is shared
# by all three names
categorical_columns = train_dataset.select_dtypes(include='object').columns
train_categorical_columns = categorical_columns
test_categorical_columns = categorical_columns

**Note:** In `BsmtQual`, `BsmtCond`, `BsmtExposure`, `BsmtFinType1`, `BsmtFinType2`, `GarageFinish`, `GarageQual` and `GarageCond`, an NA value means that the feature is absent from the house. Hence, the NA values are filled with the label `absent`. Note that the cell below applies this fill to every categorical column, not only to the ones listed here.

In [None]:
# replace missing categorical values with the explicit label 'absent' in both frames
# NOTE(review): this fills *all* categorical columns, which leaves nothing for a later
# categorical imputation to fill - confirm this is intended
for frame, frame_categorical_columns in (
    (train_dataset, train_categorical_columns),
    (test_dataset, test_categorical_columns),
):
  frame[frame_categorical_columns] = frame[frame_categorical_columns].fillna('absent')

## Numerical and Categorical Imputation

In [None]:
# this function identifies the columns with missing values and imputes them using the provided imputation technique
def impute_columns(dataset, columns, imputation_technique):
  """Fill the missing values of `columns` in `dataset`, modifying `dataset` in place.

  Args:
    dataset: DataFrame whose columns are imputed.
    columns: iterable of column labels to inspect; columns without missing
      values are left untouched.
    imputation_technique: value passed as `strategy` to `SimpleImputer`
      (the notebook uses 'median' and 'most_frequent').

  Returns:
    None. The imputer is fitted on each column of `dataset` itself, one column at a time.
  """
  columns_with_missing_data = [column for column in columns if dataset[column].isna().any()]
  imputer = SimpleImputer(strategy=imputation_technique)

  for column in columns_with_missing_data:
    # fit_transform returns a 2-D (n_rows, 1) array; flatten it so a plain 1-D column is assigned
    dataset[column] = imputer.fit_transform(dataset[[column]]).ravel()

In [None]:
# imputation of train_dataset first, then test_dataset:
# median for the numerical columns, most frequent value for the categorical ones
for frame, numeric_cols, categoric_cols in (
    (train_dataset, train_numerical_columns, train_categorical_columns),
    (test_dataset, test_numerical_columns, test_categorical_columns),
):
  impute_columns(frame, numeric_cols, 'median')
  impute_columns(frame, categoric_cols, 'most_frequent')

### Verifying Imputation

In [None]:
def check_null(dataset):
  """Return the per-column count of missing values, restricted to columns that have any."""
  null_counts = dataset.isna().sum()
  has_nulls = null_counts != 0
  return null_counts[has_nulls]

In [None]:
# both series are expected to be empty once every missing value has been filled
(check_null(train_dataset), check_null(test_dataset))

(Series([], dtype: int64), Series([], dtype: int64))

## Label Encoding

In [None]:
# label-encodes one column of train_dataset and test_dataset: each unique value is replaced by
# its rank when the values are sorted in ascending order of mean SalePrice
def label_encode(column, display= False):
  """Replace the values of `column` by integer labels in both global frames, in place.

  The ordering is derived from `train_dataset` only (mean SalePrice per value,
  rounded to two decimals, ascending) and the same mapping is then applied to
  `test_dataset`. Values that do not appear in the mapping become NaN, as
  `Series.map` does for keys missing from a dict.

  Args:
    column: label of the column to encode.
    display: when equal to True, the mapping is printed each time it is applied
      (once for the train frame and once for the test frame).
  """
  mean_price_per_label = train_dataset.groupby(column)['SalePrice'].mean().round(2)
  encoding_order = mean_price_per_label.sort_values().index.to_list()

  for dataset in (train_dataset, test_dataset):
    label_dictionary = dict(zip(encoding_order, range(len(encoding_order))))
    if display == True:
      print(label_dictionary)

    dataset[column] = dataset[column].map(label_dictionary)

In [None]:
# encoding all the categorical columns, one after another, without printing the mappings
for categorical_column in categorical_columns:
  label_encode(categorical_column)

In [None]:
# encoding two numerical columns as well, printing the mapping that is applied to each
label_encode('OverallQual', display=True)
print()
label_encode('OverallCond', display=True)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9}
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9}


# Model Training

In [None]:
# dividing the train_dataset into features (every column but the last) and target (the last column)
# NOTE(review): relies on SalePrice being the last column of train_dataset - confirm
features = train_dataset.iloc[:, :-1].copy()
target = train_dataset.iloc[:, -1].copy()

In [None]:
# objective function evaluated by optuna for every trial of the hyperparameter search
def objective(trial):
    """Return the mean 5-fold cross-validated MSE of a CatBoostRegressor for one trial.

    Args:
        trial: optuna trial used to sample the hyperparameters.

    Returns:
        The mean squared error averaged over the folds, computed on the global
        `features` / `target`; lower is better.
    """
    suggest_int, suggest_float = trial.suggest_int, trial.suggest_float

    # the values are sampled in the order written here
    parameter = dict(
        iterations=suggest_int('iterations', 10, 1000),
        learning_rate=suggest_float('learning_rate', 0.001, 1.0),
        depth=suggest_int('depth', 1, 16),
        l2_leaf_reg=suggest_float('l2_leaf_reg', 0.0, 10.0),
        border_count=suggest_int('border_count', 1, 255),
        min_data_in_leaf=suggest_int('min_data_in_leaf', 1, 20),
        bagging_temperature=suggest_float('bagging_temperature', 0.0, 1.0),
        colsample_bylevel=suggest_float('colsample_bylevel', 0.1, 1.0),
        leaf_estimation_iterations=suggest_int('leaf_estimation_iterations', 1, 10),
    )

    # fixed seed for every candidate; verbosity is left at the library default
    candidate_model = CatBoostRegressor(random_seed=42, **parameter)

    # the scorer yields negated MSE values, so the sign is flipped to get a quantity to minimise
    fold_scores = cross_val_score(
        candidate_model, features, target, cv=5, scoring='neg_mean_squared_error'
    )
    return -fold_scores.mean()

# create an optuna study and optimize the hyperparameters
# (TPE is optuna's default sampler; seeding it makes the sequence of sampled trials repeatable,
# as far as the objective itself is deterministic)
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

# rebuild the regressor from the best trial with the same seed the objective used, so the
# final model matches the configuration that was actually cross-validated
best_parameters = study.best_params
catboost_model = CatBoostRegressor(**best_parameters, random_seed=42)

In [None]:
# train the tuned model on the whole training set
catboost_model.fit(features, y=target)

# predict the target for every row of the test set
y_test_pred = catboost_model.predict(test_dataset)

In [None]:
# the sample submission file serves as the template for the submission table
submission = pd.read_csv('sample_submission.csv')

In [None]:
# put the model predictions into the SalePrice column, then index the table by Id
submission['SalePrice'] = y_test_pred
submission.set_index('Id', inplace=True)

In [None]:
# write the table to disk; the Id index is written as the first column
submission.to_csv('First-Submission.csv', index=True)