# Requirements

In [1]:
# !pip install pandas numpy matplotlib seaborn xgboost lightgbm scikit-learn ydata-profiling mlxtend

# Importing Dependencies

In [2]:
# Standard Libraries
import os
from math import *
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

# Performance metrics
from model_tuning_utils import *

# Learning Algorithms
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Display Options
pd.set_option("display.max_columns", 500)

# Ignore warnings
import warnings
warnings.filterwarnings(action = 'ignore')

# Plot settings
plt.style.use("ggplot")

# EDA

In [3]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


In [4]:
train.shape

(1460, 81)

Here are some libraries for automated visualization reports, ranked based on their features and ease of use:

1. **Sweetviz**: Provides detailed and visually appealing reports with comparison features and data summaries.
   - [Sweetviz GitHub](https://github.com/fbdesignpro/sweetviz)

2. **yData Profiling**: Offers comprehensive EDA with rich statistics and visualizations. It's well-established and widely used.
   - [Pandas Profiling GitHub](https://github.com/pandas-profiling/pandas-profiling)

3. **DataProfiler**: Provides automatic data profiling with visualization capabilities. Useful for quick insights.
   - [DataProfiler GitHub](https://capitalone.github.io/DataProfiler/docs/0.12.0/html/index.html)

4. **Autoviz**: Generates a variety of visualizations automatically, including some advanced plotting features.
   - [Autoviz GitHub](https://github.com/autoviz/)

5. **D-Tale**: Offers interactive, web-based exploration of data with various visualizations.
   - [D-Tale GitHub](https://github.com/man-group/dtale)

6. **Exploratory**: Provides a suite of tools for data visualization and exploration, including automated reporting.
   - [Exploratory Website](https://exploratory.io/)

In [5]:
from ydata_profiling import ProfileReport

profile = ProfileReport(train, title="Profiling Report")

# profile.to_file("ydata_profiling_report.html")

### Key Takeaways

#### High Correlation Alerts
- **1stFlrSF**: Highly correlated with `SalePrice` and 1 other field.
- **2ndFlrSF**: Highly correlated with `BedroomAbvGr` and 2 other fields.
- **3SsnPorch**: Highly correlated with `Alley` and 1 other field.
- **Alley**: Highly correlated with 3SsnPorch and 19 other fields.
- **BedroomAbvGr**: Highly correlated with `2ndFlrSF` and 2 other fields.
- **BldgType**: Highly correlated with `Alley` and 2 other fields.
- **BsmtCond**: Highly correlated with `PoolQC`.
- **GarageArea**: Highly correlated with `GarageCars` and 5 other fields.
- **GarageCars**: Highly correlated with `GarageArea`.

#### Imbalance Alerts
- **Utilities**: Highly imbalanced (99.2%)
- **RoofMatl**: Highly imbalanced (94.4%)
- **Heating**: Highly imbalanced (92.7%)
- **Street**: Highly imbalanced (96.2%)
- **Condition2**: Highly imbalanced (96.4%)
- **LandSlope**: Highly imbalanced (78.8%)
- **BsmtCond**: Highly imbalanced (75.8%)
- **SaleType**: Highly imbalanced (75.3%)
- **BsmtFinType2**: Highly imbalanced (70.1%)
- **Condition1**: Highly imbalanced (71.7%)
- **Functional**: Highly imbalanced (81.9%)
- **KitchenAbvGr**: Highly imbalanced (85.7%)
- **GarageQual**: Highly imbalanced (85.2%)
- **GarageCond**: Highly imbalanced (87.6%)
- **PavedDrive**: Highly imbalanced (69.9%)
- **MiscFeature**: Highly imbalanced (70.7%)
- **ExterCond**: Highly imbalanced (72.8%)
- **CentralAir**: Highly imbalanced (65.3%)
- **RoofStyle**: Highly imbalanced (65.1%)
- **BldgType**: Highly imbalanced (59.4%)
- **MSZoning**: Highly imbalanced (56.9%)

#### Missing Values
- **Alley**: 1369 (93.8%) missing
- **PoolQC**: 1453 (99.5%) missing
- **MiscFeature**: 1406 (96.3%) missing
- **FireplaceQu**: 690 (47.3%) missing
- **MasVnrType**: 872 (59.7%) missing
- **LotFrontage**: 259 (17.7%) missing
- **BsmtQual**: 37 (2.5%) missing
- **BsmtCond**: 37 (2.5%) missing
- **BsmtExposure**: 38 (2.6%) missing
- **BsmtFinType1**: 37 (2.5%) missing
- **BsmtFinType2**: 38 (2.6%) missing
- **GarageType**: 81 (5.5%) missing
- **GarageYrBlt**: 81 (5.5%) missing
- **GarageFinish**: 81 (5.5%) missing
- **GarageQual**: 81 (5.5%) missing
- **GarageCond**: 81 (5.5%) missing

#### Zeros
- **PoolArea**: 1453 (99.5%) zeros
- **3SsnPorch**: 1436 (98.4%) zeros
- **LowQualFinSF**: 1434 (98.2%) zeros
- **MiscVal**: 1408 (96.4%) zeros
- **ScreenPorch**: 1344 (92.1%) zeros
- **BsmtFinSF2**: 1293 (88.6%) zeros
- **EnclosedPorch**: 1252 (85.8%) zeros
- **2ndFlrSF**: 829 (56.8%) zeros
- **WoodDeckSF**: 761 (52.1%) zeros
- **OpenPorchSF**: 656 (44.9%) zeros
- **BsmtFinSF1**: 467 (32.0%) zeros
- **BsmtUnfSF**: 118 (8.1%) zeros
- **GarageArea**: 81 (5.5%) zeros
- **TotalBsmtSF**: 37 (2.5%) zeros

#### Skewed Data
- **MiscVal**: Highly skewed (γ1 = 24.48).

#### `Note`

| Log RMSE Score | Category    |
|----------------|-------------|
| < 0.2          | Excellent   |
| 0.2 - 0.4      | Very Good   |
| 0.4 - 0.6      | Good        |
| 0.6 - 0.8      | Bad         |
| > 0.8          | Very Bad    |

## Data Imputation

In [6]:
# Select categorical columns
category_cols = list(train.select_dtypes(include=['object']).columns)

# Select numerical columns
numerical_cols = list(train.select_dtypes(exclude=['object']).columns)

In [7]:
missing = train.isna().sum() / len(train)

# List of column names with more than 20% missing values + the 'Id' column
columns_to_drop = [col for col, val in zip(missing.index, missing.values) if val > 0.2] + ['Id']

columns_to_drop

['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'Id']

In [8]:
train.duplicated().sum()

0

### Scaling and Encoding

**Scaling Numerical Columns**

StandardScaler: This scaler standardizes features by moving the mean to 0 and scaling to unit variance. This is particularly useful when data is normally distributed (Gaussian distribution). Many machine learning algorithms, such as those that assume normality in the data (e.g., linear regression, logistic regression), perform better with standardized data.

MinMaxScaler: This scaler transforms features by scaling each feature to a given range, usually between 0 and 1. This is useful when data is not normally distributed or has varying scales. MinMaxScaler is commonly used when the distribution is not Gaussian or while using algorithms that do not assume any distribution (e.g., neural networks).

In [9]:
from scipy.stats import skew, kurtosis

skew(train['MasVnrArea'].dropna())

2.6663261001607435

In [10]:
from scipy.stats import skew, kurtosis

# Function to determine scaler based on skewness
def determine_scaler(data):
    skewness = data.apply(lambda x: skew(x.dropna()))
    standardize_cols = skewness[abs(skewness) < 1].index.tolist()
    normalize_cols = skewness[abs(skewness) >= 1].index.tolist()
    return standardize_cols, normalize_cols

# Determine which columns to standardize and which to normalize
standardize_cols, normalize_cols = determine_scaler(train[numerical_cols])

print(f"Columns having Gaussian Distribution: {standardize_cols}")
print(f"Columns having other Distribution: {standardize_cols}")

Columns having Gaussian Distribution: ['Id', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtUnfSF', '2ndFlrSF', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'MoSold', 'YrSold']
Columns having other Distribution: ['Id', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtUnfSF', '2ndFlrSF', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'MoSold', 'YrSold']


**Labelling Categorical Columns**

- Cardinality:
  - OneHotEncoding is generally preferred for categorical variables with a low number of unique values (low cardinality).
  - LabelEncoding or ordinal encoding is preferred for categorical variables with a high number of unique values (high cardinality).
- Ordinality:
  - If the categorical variable has an inherent order (e.g., 'Low', 'Medium', 'High'), LabelEncoding is more appropriate.
  - If there is no inherent order, OneHotEncoding is usually the better choice.
- Frequency Distribution:
  - If the categorical variable has a few dominant categories and many rare categories, it might be beneficial to consider encoding the dominant categories with OneHotEncoding and the rest with LabelEncoding or grouping them into an "Other" category.

---

In [11]:
# # Function to determine encoder based on cardinality and ordinality
# def determine_encoder(data, category_cols, threshold=10):
#     onehot_cols = []
#     label_cols = []
#     for col in category_cols:
#         unique_values = data[col].nunique()
#         if unique_values <= threshold:
#             onehot_cols.append(col)
#         else:
#             label_cols.append(col)
#     return onehot_cols, label_cols

# # Determine which columns to use OneHotEncoder and which to use LabelEncoder
# onehot_cols, label_cols = determine_encoder(train, category_cols, threshold=10)

# print("OneHotEncode columns:", onehot_cols)
# print("LabelEncode columns:", label_cols)

In [12]:
# import textdistance

# arr = train['Functional'].dropna().unique()
# score = [0]

# for i in range(1, len(arr)):
#     score.append(textdistance.Jaccard().similarity(arr[i-1], arr[i]))

# sum(score)

In [13]:
# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import LabelEncoder

# def identify_encoding_columns(df):
#     # Initialize lists
#     ohe_cols = []
#     le_cols = []
#     bag_of_words = []
#     columns = df.select_dtypes(include=['object']).columns

#     # Create bag of words for each categorical column
#     for col in columns:
#         bag_of_words.append((' ').join(df[col].dropna().unique()).lower())

#     # Convert bag of words to numeric representation
#     vectorizer = CountVectorizer()
#     X = vectorizer.fit_transform(bag_of_words)
    
#     # Apply KMeans clustering
#     num_clusters = 2
#     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
#     clusters = kmeans.fit_predict(X)
    
#     # Map clusters to column names
#     cluster_map = {i: [] for i in range(num_clusters)}
#     for idx, col in enumerate(columns):
#         cluster_map[clusters[idx]].append(col)
    
#     # Determine which cluster is ordinal
#     # For simplicity, assume the first cluster is ordinal (could be refined)
#     le_cols = cluster_map[0]
#     ohe_cols = cluster_map[1]

#     return le_cols, ohe_cols

# identify_encoding_columns(train)

In [14]:
# df=train
# bag_of_words = ''
# columns = df.select_dtypes(include=['object']).columns

# for col in columns:
#     bag_of_words += (' ').join(df[col].dropna().unique()).lower()
# bag_of_words

---

### `Prototype 1`

- Drop columns having more than 20% missing values.
- Impute others with mean or mode whichever is appropriate.
- Label Encoding all categorical columns.
- No scaling performed.

In [15]:
proto_1 = train.copy(deep=True)

# Drop specified columns
proto_1.drop(columns=columns_to_drop, inplace=True)

# Fill missing values in numeric columns with mean
for j in proto_1.select_dtypes(exclude='object').columns:
    mean = proto_1[j].mean()
    proto_1[j] = proto_1[j].fillna(mean)

# Fill missing values in categorical columns with mode
for j in proto_1.select_dtypes(include='object').columns:
    mode = proto_1[j].mode()[0]
    proto_1[j] = proto_1[j].fillna(mode)
    
    le=LabelEncoder()
    le.fit(proto_1[j])
    proto_1[j] = le.transform(proto_1[j])

# Separate features and target variable
X = proto_1.iloc[:, :-1]
y = proto_1.iloc[:, -1]

### Untuned Models

In [16]:
perf_rf, t_rf = train_and_evaluate_model(RandomForestRegressor(), X, y)

perf_xgb, t_xgb = train_and_evaluate_model(XGBRegressor(), X, y)

perf_gb, t_gb = train_and_evaluate_model(GradientBoostingRegressor(), X, y)

perf_lgbm, t_lgbm = train_and_evaluate_model(LGBMRegressor(force_col_wise=True), X, y)

********************** Training RandomForestRegressor **********************
Model training completed in 0.976118 seconds.
Model performance (log-RMSE) : 0.15198512875588394.


*************************** Training XGBRegressor ***************************
Model training completed in 0.093098 seconds.
Model performance (log-RMSE) : 0.1495470573483127.


******************** Training GradientBoostingRegressor ********************
Model training completed in 0.353987 seconds.
Model performance (log-RMSE) : 0.14343865838703.


************************** Training LGBMRegressor **************************
[LightGBM] [Info] Total Bins 3099
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 68
[LightGBM] [Info] Start training from score 181441.541952
Model training completed in 0.079687 seconds.
Model performance (log-RMSE) : 0.14891125408771433.




### Tuned Models

In [17]:
perf_tuned_rf, t_tuned_rf = train_and_evaluate_model(RandomForestRegressor(), X, y, tuning=True)

perf_tuned_xgb, t_tuned_xgb = train_and_evaluate_model(XGBRegressor(), X, y, tuning=True)

perf_tuned_gb, t_tuned_gb = train_and_evaluate_model(GradientBoostingRegressor(), X, y, tuning=True)

perf_tuned_lgbm, t_tuned_lgbm = train_and_evaluate_model(LGBMRegressor(), X, y, tuning=True)

********************** Training RandomForestRegressor **********************
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 3.551792 seconds.
Model training completed in 0.184412 seconds.
Total time taken 3.736204 seconds.
Model performance (log-RMSE) : 0.16163997629864077.


*************************** Training XGBRegressor ***************************
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 4.906428 seconds.
Model training completed in 0.05955 seconds.
Total time taken 4.965978 seconds.
Model performance (log-RMSE) : 0.13919980082363254.


******************** Training GradientBoostingRegressor ********************
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 10.580477 seconds.
Model training completed in 0.944696 seconds.
Total time taken 11.525173 seconds.
Model performance (log-RMSE) : 0.14818990894393427.


**************

### Stacked Model - Untuned

In [18]:
score, time = train_stacked_ensemble(X, y)

*************************** Training Stack Model ***************************
Stack-model training completed in 2.654375 seconds.
Stack-model performance (log-RMSE) : 0.1392737321609844.




### Stacked Model - Tuned

In [19]:
score, time = train_stacked_ensemble(X, y, base_tuning=True, meta_tuning=True)

*************************** Training Stack Model ***************************
Tuning Base Models
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Hyperparameter tuning completed in 0.101253 seconds.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 0.137439 seconds.
Fitting 10 folds for each of 6 candidates, totalling 60 fits
Hyperparameter tuning completed in 0.216549 seconds.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 2.434153 seconds.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 3.751869 seconds.
Tuning Meta Model
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 11.768035 seconds.
Stack-model training completed in 19.649391 seconds.
Stack-model performance (log-RMSE) : 0.14114543803406038.




In [20]:
score, time = train_stacked_ensemble(X, y, meta_tuning=True)

*************************** Training Stack Model ***************************
Tuning Meta Model
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 14.230575 seconds.
Stack-model training completed in 18.447227 seconds.
Stack-model performance (log-RMSE) : 0.14496442955941108.




`Inference` 

Best performance noticed: Tuned XGBoost and Untuned Stacked Model.

---

### `Prototype 2`

- Drop columns having more than 20% missing values.
- Impute others with mean or mode whichever is appropriate.
- Label Encoding all categorical columns.
- Standard Scaler on all features.

In [21]:
# Scale the data
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

### Untuned Model

In [22]:
perf_rf, t_rf = train_and_evaluate_model(RandomForestRegressor(), X_scaled, y)

perf_xgb, t_xgb = train_and_evaluate_model(XGBRegressor(), X_scaled, y)

perf_gb, t_gb = train_and_evaluate_model(GradientBoostingRegressor(), X_scaled, y)

perf_lgbm, t_lgbm = train_and_evaluate_model(LGBMRegressor(force_col_wise=True), X_scaled, y)

********************** Training RandomForestRegressor **********************
Model training completed in 0.995569 seconds.
Model performance (log-RMSE) : 0.15316510037311146.


*************************** Training XGBRegressor ***************************
Model training completed in 0.06495 seconds.
Model performance (log-RMSE) : 0.1495470573483127.


******************** Training GradientBoostingRegressor ********************
Model training completed in 0.357508 seconds.
Model performance (log-RMSE) : 0.14417744389670972.


************************** Training LGBMRegressor **************************
[LightGBM] [Info] Total Bins 3146
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 68
[LightGBM] [Info] Start training from score 181441.541952
Model training completed in 0.044678 seconds.
Model performance (log-RMSE) : 0.14579440681268316.




### Tuned Model

In [23]:
perf_tuned_rf, t_tuned_rf = train_and_evaluate_model(RandomForestRegressor(), X_scaled, y, tuning=True)

perf_tuned_xgb, t_tuned_xgb = train_and_evaluate_model(XGBRegressor(), X_scaled, y, tuning=True)

perf_tuned_gb, t_tuned_gb = train_and_evaluate_model(GradientBoostingRegressor(), X_scaled, y, tuning=True)

perf_tuned_lgbm, t_tuned_lgbm = train_and_evaluate_model(LGBMRegressor(), X_scaled, y, tuning=True)

********************** Training RandomForestRegressor **********************
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 1.474179 seconds.
Model training completed in 0.061129 seconds.
Total time taken 1.535308 seconds.
Model performance (log-RMSE) : 0.15393313787787333.


*************************** Training XGBRegressor ***************************
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 7.220603 seconds.
Model training completed in 0.067433 seconds.
Total time taken 7.288036 seconds.
Model performance (log-RMSE) : 0.14056881971638083.


******************** Training GradientBoostingRegressor ********************
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 8.64318 seconds.
Model training completed in 0.32157 seconds.
Total time taken 8.964749999999999 seconds.
Model performance (log-RMSE) : 0.1378399852513932.


*********

### Stacked Model - Untuned

In [24]:
score, time = train_stacked_ensemble(X_scaled, y)

*************************** Training Stack Model ***************************
Stack-model training completed in 2.617637 seconds.
Stack-model performance (log-RMSE) : 0.13779353175978248.




### Stacked Model - Tuned

In [25]:
score, time = train_stacked_ensemble(X_scaled, y, base_tuning=True, meta_tuning=True)

*************************** Training Stack Model ***************************
Tuning Base Models
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Hyperparameter tuning completed in 0.101555 seconds.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 0.065757 seconds.
Fitting 10 folds for each of 6 candidates, totalling 60 fits
Hyperparameter tuning completed in 0.217119 seconds.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 2.411224 seconds.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 4.493271 seconds.
Tuning Meta Model
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 10.315699 seconds.
Stack-model training completed in 19.46025 seconds.
Stack-model performance (log-RMSE) : 0.13794863955426695.




In [26]:
score, time = train_stacked_ensemble(X_scaled, y, meta_tuning=True)

*************************** Training Stack Model ***************************
Tuning Meta Model
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Hyperparameter tuning completed in 11.997852 seconds.
Stack-model training completed in 14.796512 seconds.
Stack-model performance (log-RMSE) : 0.1407450924451457.




`Inference` 

Best performance noticed: Tuned GradientBoostingRegressor and both versions of the Stacked Model (fully-tuned & untuned).

---