In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor


# Set Matplotlib defaults.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever name this
# installation actually provides instead of hard-coding one of them.
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings (this also hides deprecation notices from pandas/sklearn).
warnings.filterwarnings('ignore')


 # 1. Exploratory Data Analysis

## 1.1 Property Characteristics

**Location:**

* **MSZoning:** Zoning code (residential, commercial, industrial)
* **Neighborhood:** Area within Ames where the house is located
* **Street:** Type of road the property faces (highway, cul-de-sac, etc.)
* **Alley:** Does the property have an alley for back access?

**Lot:**

* **LotFrontage:** Length in feet of the street side of the property
* **LotArea:** Total size of the property in square feet
* **LotShape:** How the property is shaped (rectangular, irregular, etc.)
* **LandContour:** How flat or hilly the property is
* **LotConfig:** How the property is laid out (corner lot, inside lot, etc.)
* **LandSlope:** The slope of the land the house sits on

**Size and Living Space:**

* **OverallQual/OverallCond:** General rating of the house's quality and condition (high = good, low = bad)
* **YearBuilt/YearRemodAdd:** Year the house was built and any major renovations done
* **GrLivArea:** Total finished living area above ground in square feet
* **TotalBsmtSF:** Total size of the basement area in square feet (finished and unfinished)
* **1stFlrSF/2ndFlrSF:** Square footage of the first and second floors (if applicable)
* **FullBath/HalfBath:** Number of full and half bathrooms above ground
* **Bedroom/Kitchen:** Number of bedrooms and kitchens
* **TotRmsAbvGrd:** Total number of rooms above ground (excluding bathrooms)
* **Functional:** Overall rating of the house's layout and functionality

**Building Materials and Finishes:**

* **ExterQual/ExterCond:** Quality and condition of the exterior siding and materials
* **Foundation:** Type of foundation the house sits on (concrete slab, basement, etc.)
* **BsmtQual/BsmtCond:** Quality and condition of the basement (unfinished, finished, etc.)
* **RoofStyle/RoofMatl:** Style and material of the roof (shingles, metal, etc.)
* **Exterior1st/Exterior2nd:** Main and secondary exterior siding materials (brick, vinyl, etc.)
* **MasVnrType/MasVnrArea:** Type and amount of masonry veneer (decorative stonework) on the exterior
* **Heating/HeatingQC:** Type of heating system (furnace, boiler, etc.) and its condition
* **CentralAir:** Does the house have central air conditioning?
* **Electrical:** Type of electrical wiring system in the house
* **KitchenQual:** Quality of the kitchen cabinets and finishes
* **GarageQual/GarageCond:** Quality and condition of the garage (if present)
* **Fence:** Does the property have a fence, and if so, what quality is it?

## Amenities:

* **Fireplace/FireplaceQu:** Does the house have a fireplace, and if so, what quality is it?
* **PoolArea/PoolQC:** Does the property have a pool, and if so, what quality is it?
* **WoodDeckSF/OpenPorchSF/EnclosedPorch/ScreenPorch/3SsnPorch:** Square footage of various types of porches (wooden deck, open porch, etc.)

## Sale Related Information:

* **MoSold/YrSold:** Month and year the house was sold
* **SaleType:** Type of sale (traditional, auction, short sale, etc.)
* **SaleCondition:** Condition of the sale (typical, abnormal, distressed, etc.)


## 1.2 Data Preprocessing ##

Before we can do any feature engineering, we need to *preprocess* the data to get it in a form suitable for analysis. The data we used in the course was a bit simpler than the competition data. For the *Ames* competition dataset, we'll need to:
- **Load** the data from CSV files
- **Clean** the data to fix any errors or inconsistencies
- **Encode** the statistical data type (numeric, categorical)
- **Impute** any missing values

We'll wrap all these steps up in a function, which will make it easy for you to get a fresh dataframe whenever you need one. After reading the CSV files, we'll apply three preprocessing steps, `clean`, `encode`, and `impute`, and then create the data splits: one (`df_train`) for training the model, and one (`df_test`) for making the predictions that you'll submit to the competition for scoring on the leaderboard.

## 1.2.1 Load Data

In [None]:
def load_data():
    """Read the competition CSVs, preprocess them jointly and return both splits.

    The train and test files are indexed by ``Id``, stacked into one frame and
    passed through ``clean`` -> ``encode`` -> ``impute`` so that both splits
    receive exactly the same treatment. No rows are removed here.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        The preprocessed rows selected back out by the ``Id`` labels of the
        original train and test files. Both frames share the same columns
        (presumably the test split's ``SalePrice`` is all-NaN, since it comes
        from the concatenation only -- confirm against test.csv).
    """
    data_dir = Path("../input/house-prices-advanced-regression-techniques/")
    raw_train = pd.read_csv(data_dir / "train.csv", index_col="Id")
    raw_test = pd.read_csv(data_dir / "test.csv", index_col="Id")

    # Stack the splits so every preprocessing step sees all rows at once.
    combined = impute(encode(clean(pd.concat([raw_train, raw_test]))))

    # Recover each split by the Id labels it had on disk.
    return combined.loc[raw_train.index, :], combined.loc[raw_test.index, :]


## 1.2.2 Clean Data

#### Rectifying Categorical Features:

- **Exterior2nd Correction:** 
  - Replaced "Brk Cmn" with "BrkComm" to standardize the category naming.

#### Handling Corrupt Garage Year Built Data:
- **GarageYrBlt Correction:** 
  - Replaced corrupt values with the respective year of house construction to maintain data integrity.

#### Improved Clarity in Column Names:
- **Column Name Updates:** 
  - Renamed columns beginning with numbers to improve readability and usability:
    - "1stFlrSF" ⟶ "FirstFlrSF"
    - "2ndFlrSF" ⟶ "SecondFlrSF"
    - "3SsnPorch" ⟶ "Threeseasonporch"

These cleaning steps enhance data consistency, integrity, and ease of interpretation.


In [None]:
# Read the raw training file on its own for ad-hoc exploration; this frame is
# independent of the frames produced by load_data().
data_dir = Path("../input/house-prices-advanced-regression-techniques/")
df = pd.read_csv(data_dir / "train.csv", index_col="Id")

# Show the distinct Exterior2nd labels so inconsistent spellings can be spotted.
df.Exterior2nd.unique()

In [None]:
def clean(df):
    """Fix known data-quality problems; modifies ``df`` in place and returns it.

    * ``Exterior2nd``: the label "Brk Cmn" is rewritten to "BrkComm".
    * ``GarageYrBlt``: any value that is not ``<= 2010`` (which includes
      missing values, because the comparison is False for NaN) is replaced by
      the house's ``YearBuilt``.
    * Columns whose names start with a digit are renamed.
    """
    column_renames = {
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    }

    df["Exterior2nd"] = df["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})

    # Keep plausible garage years, fall back to the construction year otherwise.
    plausible_year = df["GarageYrBlt"] <= 2010
    df["GarageYrBlt"] = df["GarageYrBlt"].where(plausible_year, df["YearBuilt"])

    # Names beginning with numbers cannot be used with attribute access.
    df.rename(columns=column_renames, inplace=True)
    return df


## 1.2.3 Categorical Feature Encoding

To properly handle categorical features in the dataset, the following encoding process has been implemented:

#### Nominal (Unordered) Categorical Features:
- The specified nominal features have been converted to categorical data types.
- A "None" category has been added for missing values.

#### Ordinal (Ordered) Categorical Features:
- Ordinal features with predefined levels have been encoded using ordered categorical data types.
- A "None" level has been added for missing values.

This encoding ensures proper treatment of categorical features, facilitating downstream analysis and modeling tasks.


In [None]:
# The nominal (unordered) categorical features.
# NOTE: "CentralAir" also appears in `ordered_levels` below; encode() applies
# the ordinal cast last, so the ordered dtype is the one that sticks.
features_nom = [
    "MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig",
    "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature",
    "SaleType", "SaleCondition",
]


# The ordinal (ordered) categorical features

# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
# OverallQual / OverallCond are rated 1..10. The previous list(range(10))
# stopped at 9, so a rating of 10 was not a valid level and was turned into a
# missing value by the categorical cast in encode().
ten_levels = list(range(1, 11))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Add a None level for missing values; it is placed first, i.e. it ranks
# below every real level of the ordered dtype.
ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}


def encode(df):
    """Cast the categorical features to pandas categorical dtypes.

    ``df`` is modified in place and returned.

    * Every column in ``features_nom`` becomes an unordered category that is
      guaranteed to contain a "None" category.
    * Every column in ``ordered_levels`` becomes an ordered category with
      exactly the listed levels; values outside that list become missing.
    """
    # Nominal categories
    for name in features_nom:
        column = df[name].astype("category")
        # Make sure a "None" category exists for missing values
        if "None" not in column.cat.categories:
            column = column.cat.add_categories("None")
        df[name] = column

    # Ordinal categories
    for name, levels in ordered_levels.items():
        ordinal_dtype = CategoricalDtype(levels, ordered=True)
        df[name] = df[name].astype(ordinal_dtype)

    return df


## 1.2.4 Handle Missing Values ###

Handling missing values now will make the feature engineering go more smoothly. We'll impute the column **median** for missing numeric values and the most frequent value (the **mode**) for missing categorical values. You might like to experiment with other imputation strategies, for example imputing `0` for numeric and `"None"` for categorical features. In particular, you could try creating "missing value" indicators: `1` whenever a value was imputed and `0` otherwise.

In [None]:
def impute(df):
    """Fill missing values: median for numeric columns, mode for categoricals.

    ``df`` is updated column by column and returned. ``SalePrice`` (the
    target) is never imputed; it may be absent from ``df`` altogether.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int64/float64 columns are filled with their median and
        whose object/category columns are filled with their most frequent
        value.

    Returns
    -------
    pandas.DataFrame
        The same frame with the gaps filled. A column consisting only of
        missing values is left untouched because it has no median or mode.
    """
    # Find numerical and categorical columns (excluding SalePrice if present)
    numerical_columns = (
        df.select_dtypes(include=['int64', 'float64'])
        .drop(columns=['SalePrice'], errors='ignore')
        .columns.tolist()
    )
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: that chained in-place form operates
    # on a temporary object and does not update `df` under copy-on-write.
    for col in numerical_columns:
        df[col] = df[col].fillna(df[col].median())

    for col in categorical_columns:
        modes = df[col].mode()  # Series of the most frequent value(s)
        if modes.empty:
            # Nothing observed in this column, so there is nothing to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df


## 1.2.5 Load Data

In [None]:
# Build the preprocessed train/test splits that the following cells work on.
df_train, df_test = load_data()

In [None]:
def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Category columns are replaced by their integer codes
        on an internal copy; the caller's frame is left untouched.
    y : array-like
        Target values; the model is scored against ``log(y)``.
    model : estimator, optional
        Any scikit-learn compatible regressor. A fresh ``XGBRegressor()`` is
        created when omitted (rather than sharing one instance created at
        function-definition time).

    Returns
    -------
    float
        Square root of the mean cross-validated MSE on the log target.
    """
    if model is None:
        model = XGBRegressor()

    # Label encoding for categoricals
    #
    # Label encoding is good for XGBoost and RandomForest, but one-hot
    # would be better for models like Lasso or Ridge. The `cat.codes`
    # attribute holds the category levels.
    X = X.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes

    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    neg_mse = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error",
    )
    # The scorer returns negated MSE values, so flip the sign before the root.
    return np.sqrt(-1 * neg_mse.mean())


In [None]:
# Score the preprocessed data before any feature engineering so later
# changes can be compared against this baseline.
X = df_train.copy()
y = X.pop("SalePrice")  # pop() removes the target from X and returns it

baseline_score = score_dataset(X, y)
print(f"Baseline score: {baseline_score:.5f} RMSLE")

## 1.2.6 Conclusion

 `I noticed the impact of the preprocessing strategy outlined—specifically, encoding categorical data before handling missing values and explicitly adding a 'None' category for missing values can significantly improve model performance.` 

#### Informative Missingness: 
Treating "None" as a distinct category can highlight the significance of missing data, like a property lacking a garage, offering unique insights that may enhance prediction accuracy.

#### Data Integrity Preservation: 
Encoding missing values as "None" utilizes all data without resorting to deletion or arbitrary imputation, potentially enriching the dataset's representation and improving model outcomes.

# 2. Feature Utility Scores

Feature utility scores, such as `Mutual Information (MI) scores`, quantify the relationship between a feature and the target variable. MI scores guide feature selection by
identifying features with predictive power. They are computed using `make_mi_scores` and visualized with `plot_mi_scores`.

## 2.1 Extracting Important Features using MI Scores

In [None]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer


def make_mi_scores(X, y):
    """Compute mutual-information scores between every feature and the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; it is copied, so the caller's frame is not modified.
        Object/category columns are integer-coded with ``factorize``.
    y : array-like
        Regression target.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by feature name and sorted from
        most to least informative.
    """
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()

    # Only float columns can hold NaN, so only they need imputing. Imputing
    # the integer columns as well (as was done before) silently converted
    # them to float, which made the integer-dtype test below False for every
    # column, so no feature was ever treated as discrete.
    float_cols = X.select_dtypes(include=["float64"]).columns
    if len(float_cols) > 0:
        imputer = SimpleImputer(strategy="median")
        X[float_cols] = imputer.fit_transform(X[float_cols])

    # Integer-typed columns (including the factorized categoricals) are
    # flagged as discrete features for the MI estimator.
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)

    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


# Separate the target from a fresh copy of the training split.
X = df_train.copy()
y = X.pop("SalePrice")
# Lift pandas' display limits so the full score table is printed.
# Note: these options are global and stay in effect for later cells.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

mi_scores = make_mi_scores(X, y)
print(mi_scores)
    

In [None]:
import matplotlib.pyplot as plt

# Bar chart of every feature's MI score (already sorted in descending order).
fig, ax = plt.subplots(figsize=(20, 9))
mi_scores.plot.bar(ax=ax)
ax.set(title='Mutual Information Scores', ylabel='MI Scores', xlabel='Features')
plt.show()

### Analyzing MI Scores
##### Inspect Top Features:

Look at the features with the highest `MI scores`.
These features are the most informative and are likely to be the most beneficial for predictive modeling.

##### Set a Threshold:

You can set a `threshold MI score` based on domain knowledge or experimentation.

Features with MI scores above this threshold are considered good predictors, while those below may be less useful.

In [None]:
# Ten most informative features.
print(mi_scores.head(10))

# Exploratory cut-off for calling a feature a "good" predictor.
threshold = 0.05

# Scores of the features that clear the cut-off.
good_scores = mi_scores[mi_scores > threshold]



In [None]:
# Print the features whose MI score exceeds 0.05.
print(mi_scores[mi_scores > 0.05])

## 2.2 Drop Uninformative Features: 
   - A function `drop_uninformative(df, mi_scores)` is defined to drop features from the DataFrame (`df`) whose MI score is `0.0`, i.e. features that carry no measurable information about the target.
   - The function builds a boolean mask from the MI scores (`mi_scores > 0.0`) and uses it to select columns with `df.loc`; the mask is matched to the DataFrame's columns by name.
   - Note that this cut-off is looser than the exploratory `0.05` threshold used above: every feature with an MI score greater than `0.0` is retained.

In [None]:
def drop_uninformative(df, mi_scores, threshold=0.0):
    """Keep only the columns of ``df`` whose MI score exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix to filter.
    mi_scores : pandas.Series
        MI scores indexed by feature name (in any order).
    threshold : float, default 0.0
        Columns with a score less than or equal to this value are dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the retained columns, in their original order.
        Columns that have no entry in ``mi_scores`` are dropped as well.
    """
    # Align the scores to the frame's own column order; a column without a
    # score becomes NaN, and NaN > threshold is False.
    aligned_scores = mi_scores.reindex(df.columns)
    keep = (aligned_scores > threshold).to_numpy()
    return df.loc[:, keep]


In [None]:
# Re-score the training data after removing the zero-information features.
X = df_train.copy()
y = X.pop("SalePrice")
X = drop_uninformative(X, mi_scores)

score_dataset(X, y)

## 2.3 Label Encoding

 `Label encoding is a process used in machine learning to convert categorical data into numerical format by assigning a unique integer to each category.`
 This transformation allows machine learning algorithms to effectively process and analyze categorical variables, especially when working with models that require numerical input.

### 2.3.1 How It Is Different From One-Hot Encoding:
`In label encoding, ordinal relationships signify the inherent order among categories, such as "low," "medium," and "high," represented by integer values (e.g., 0, 1, and 2), while one-hot encoding avoids such assumptions by creating binary columns for each category, maintaining independence.`

In [None]:
def label_encode(df):
    """Return a copy of ``df`` with each category column replaced by its codes.

    The codes are the integer positions of the categories (``-1`` for a
    missing value); non-categorical columns are copied unchanged and the
    input frame is not modified.
    """
    encoded = df.copy()
    categorical_names = list(encoded.select_dtypes(["category"]))
    for column in categorical_names:
        encoded[column] = encoded[column].cat.codes
    return encoded


# 3. Create Features

`Creating features involves the process of transforming or deriving new variables from existing data, aiming to enhance the predictive power and interpretability of machine learning models.`

In [None]:
# List the columns of the exploratory frame `df` (the raw training CSV read
# earlier, not the preprocessed df_train).
print(df.columns)

In [None]:

def mathematical_transforms(df):
    """Build ratio features from the area and room-count columns.

    Returns a new frame, indexed like ``df``, with:

    * ``LivLotRatio``  - ``GrLivArea`` divided by ``LotArea``
    * ``Spaciousness`` - first plus second floor area per above-ground room

    A "TotalOutsideSF" feature (sum of the deck/porch areas) was tried as
    well but did not help performance, so it is not produced.
    """
    floor_area = df["FirstFlrSF"] + df["SecondFlrSF"]
    return pd.DataFrame({
        "LivLotRatio": df["GrLivArea"] / df["LotArea"],
        "Spaciousness": floor_area / df["TotRmsAbvGrd"],
    })


def interactions(df):
    """Interaction between building type and living area.

    One column per ``BldgType`` value (prefixed "Bldg_"), holding the row's
    ``GrLivArea`` where the house has that building type and 0 elsewhere.
    """
    building_dummies = pd.get_dummies(df["BldgType"], prefix="Bldg")
    # Row-wise scaling of each indicator column by the living area.
    return building_dummies.mul(df["GrLivArea"], axis=0)


def counts(df):
    """Count how many kinds of porch/deck each house has.

    Returns a frame indexed like ``df`` with one column, ``PorchTypes``: the
    number of porch/deck area columns whose value is greater than zero.

    Porch columns that are absent from ``df`` (for example because an earlier
    feature-selection step removed them) are skipped instead of raising a
    ``KeyError``; they simply contribute nothing to the count.
    """
    porch_columns = [
        "WoodDeckSF",
        "OpenPorchSF",
        "EnclosedPorch",
        "Threeseasonporch",
        "ScreenPorch",
    ]
    available = [name for name in porch_columns if name in df.columns]

    X = pd.DataFrame(index=df.index)
    X["PorchTypes"] = df[available].gt(0.0).sum(axis=1)
    return X


def break_down(df):
    """Extract the leading token of ``MSSubClass`` as a new ``MSClass`` feature.

    The label is split once on "_" and the part before it is kept.
    NOTE(review): the ``.str`` accessor requires string labels; confirm that
    ``MSSubClass`` holds strings (not numeric codes) before enabling this.
    """
    label_parts = df["MSSubClass"].str.split("_", n=1, expand=True)
    return pd.DataFrame({"MSClass": label_parts[0]})


def group_transforms(df):
    """Attach each house's neighbourhood-level median living area.

    Returns a frame indexed like ``df`` with one column, ``MedNhbdArea``: the
    median ``GrLivArea`` over all rows sharing the row's ``Neighborhood``.
    """
    living_area_by_nhbd = df["GrLivArea"].groupby(df["Neighborhood"])
    return pd.DataFrame({"MedNhbdArea": living_area_by_nhbd.transform("median")})



def interaction_3(df):
    """Square-root transform of the lot, garage and pool areas.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain ``LotArea``, ``GarageArea`` and ``PoolArea``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``, with one column per available area column
        (same name as the source column) holding the square root of the
        area. Missing areas are replaced by the column median first.
        Source columns absent from ``df`` (e.g. removed by feature
        selection) are skipped rather than raising a ``KeyError``.
    """
    X = pd.DataFrame(index=df.index)
    for column in ("LotArea", "GarageArea", "PoolArea"):
        if column not in df.columns:
            continue
        area = df[column]
        # Fill on a local Series: the previous chained
        # `df[col].fillna(..., inplace=True)` mutated the caller's frame and
        # has no effect at all under copy-on-write.
        X[column] = np.sqrt(area.fillna(area.median()))
    return X


# 4. Skewness of variables

In [None]:
from scipy.stats import skew

def skewed(df, skewness_threshold=0.5, apply_transformation=True):
    """Return a copy of ``df`` with strongly skewed numeric columns log1p-ed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    skewness_threshold : float, default 0.5
        A numeric column is transformed when the absolute value of its
        skewness (``scipy.stats.skew`` on the non-missing values) exceeds
        this threshold.
    apply_transformation : bool, default True
        When False the frame is returned unchanged (as a copy).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every selected column has been replaced by
        ``log1p`` of itself.

    Notes
    -----
    Skewness is evaluated on numeric columns only; categorical/object columns
    are skipped up front, because computing skewness on them fails. Columns
    containing values ``<= -1`` are also left alone, since ``log1p`` is not
    finite there.
    """
    result = df.copy()
    if not apply_transformation:
        return result

    numeric = result.select_dtypes(include=np.number)
    for feature in numeric.columns:
        observed = numeric[feature].dropna()
        if observed.empty:
            continue
        if not abs(skew(observed)) > skewness_threshold:
            continue
        if (observed <= -1).any():
            continue
        result[feature] = np.log1p(result[feature])
    return result

In [None]:
from scipy.stats import skew

# Compare the skewness of every numeric column of the exploratory frame `df`
# before and after a log(1 + x) transform.
numeric_raw = df.select_dtypes(include=['int64', 'float64'])

# scipy's skew() returns NaN for a column that contains NaN, so missing values
# are dropped per column before measuring.
sk_before = numeric_raw.apply(lambda col: skew(col.dropna()))

sk_int = numeric_raw.apply(np.log1p)

sk_after = sk_int.apply(lambda col: skew(col.dropna()))

comparison = pd.DataFrame({'Before': sk_before, 'After': sk_after})

print(comparison)



Skewness improved in the features where the skewness value after transformation moved closer to zero compared to before, indicating a distribution that is more symmetric. Based on the provided data, we can identify these improvements by looking for a decrease in the absolute value of skewness for negatively skewed distributions or an increase towards zero for positively skewed distributions, without crossing over to the opposite sign (which would indicate an over-correction). Specifically:

- **MSSubClass**: Skewness decreased slightly, indicating a minor improvement towards a more symmetric distribution.
- **LotArea, OverallQual, OverallCond, YearBuilt, YearRemodAdd**: These features all show an increase in negative skewness, which might seem counterintuitive at first glance. However, without comparing the magnitude of change relative to the scale and context of each feature's distribution, determining improvement solely based on direction (positive or negative) may not be accurate for these cases.
- **BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF**: All these features related to basement square footage show an increase in negative skewness, suggesting a further deviation from symmetry, hence not an improvement based on the general guideline.
- **LowQualFinSF, 3SsnPorch, ScreenPorch, PoolArea, MiscVal**: The skewness values for these features remained very high, indicating that despite any minor changes, they remain heavily skewed.
- **BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageCars, GarageArea**: These features' skewness changes vary, with some showing slight adjustments towards symmetry but still indicate the need for careful analysis to determine if the changes reflect genuine improvements towards normality or simply minor fluctuations.
- **WoodDeckSF, OpenPorchSF, EnclosedPorch**: These features showed slight changes in skewness, indicating minor adjustments in their distributions.
- **MoSold, YrSold**: Changes in these features are minimal, suggesting only slight adjustments in their distributions' symmetry.

Given the mixed nature of skewness changes, it's crucial to closely examine each feature's distribution and the scale of skewness changes. For this assessment, "improvement" was taken to mean any change that brings the skewness value closer to zero, reflecting a more symmetric distribution. However, for many features listed, the increase in negative skewness actually indicates a move away from symmetry, which would not typically be considered an improvement in skewness. Correct interpretation depends on the goals of the analysis and the specific transformation techniques applied.

# 5. Principal Component Analysis(PCA):

## 5.1 Principal Component Analysis:


PCA (Principal Component Analysis) is a technique used to reduce the dimensionality of high-dimensional datasets while preserving most of the original information. It transforms variables into a new set of uncorrelated variables called principal components, which capture the most important patterns in the data. PCA is essential for dimensionality reduction, noise reduction, feature extraction, visualization, and addressing multicollinearity issues in datasets.

`Principal Component Analysis` computes the eigenvectors and eigenvalues of the dataset's covariance matrix, selects the top eigenvectors based on their eigenvalues, and projects the data onto these components to reduce dimensionality.

In [None]:
def apply_pca(X, standardize=True):
    """Fit a PCA on ``X`` and return the model, the components and the loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix.
    standardize : bool, default True
        When True each column is centred and divided by its standard
        deviation before the PCA is fitted.

    Returns
    -------
    pca : sklearn.decomposition.PCA
        The fitted estimator.
    X_pca : pandas.DataFrame
        Principal components named "PC1", "PC2", ..., carrying the same row
        index as ``X`` so they can be joined back onto the source frame
        without shifting rows.
    loadings : pandas.DataFrame
        Rows are the original features, columns the principal components.
    """
    # Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    # Convert to dataframe, keeping X's index. Without it the frame would get
    # a fresh 0..n-1 index and `X.join(X_pca)` would pair up the wrong rows.
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)
    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings


def plot_variance(pca, width=8, dpi=100):
    """Plot explained and cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    pca : fitted PCA estimator
        Must expose ``n_components_`` and ``explained_variance_ratio_``.
    width : float, default 8
        Figure width in inches.
    dpi : int, default 100
        Figure resolution.

    Returns
    -------
    numpy.ndarray of matplotlib Axes
        The two axes: per-component variance (bar) and cumulative (line).
    """
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance (prepend 0 so the curve starts at the origin)
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure using the caller's size settings (previously the
    # arguments were ignored in favour of hard-coded 8 / 100).
    fig.set(figwidth=width, dpi=dpi)
    return axs


In [None]:
def pca_inspired(df):
    """Composite size/age features suggested by the PCA analysis.

    Returns a new frame indexed like ``df`` with:

    * ``feature1`` - ``GrLivArea`` plus ``TotalBsmtSF``
    * ``feature2`` - ``YearRemodAdd`` times ``TotalBsmtSF``
    """
    basement_area = df["TotalBsmtSF"]
    return pd.DataFrame({
        # Composite feature 1: total living plus basement area
        "feature1": df["GrLivArea"] + basement_area,
        # Composite feature 2: remodel year scaled by basement area
        "feature2": df["YearRemodAdd"] * basement_area,
    })

def pca_components(df, features):
    """Return the principal components of the ``features`` columns of ``df``.

    The selected columns are passed to ``apply_pca`` with its default
    settings; only the component frame (the second item it returns) is kept.
    """
    selected = df.loc[:, features]
    components = apply_pca(selected)[1]
    return components

# Columns intended as input for pca_components().
pca_features = [
    "GarageArea",
    "YearRemodAdd",
    "TotalBsmtSF",
    "GrLivArea"
]
    

## 5.2 Correlation Matrix

In [None]:
def corrplot(df, method="pearson", annot=True, **kwargs):
    """Draw a clustered heat map of the correlations between numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; only its numeric columns enter the correlation matrix.
    method : str, default "pearson"
        Correlation method handed to ``DataFrame.corr``.
    annot : bool or None, default True
        Forwarded to ``seaborn.clustermap`` to toggle cell annotations.
    **kwargs
        Any further keyword arguments for ``seaborn.clustermap``.
    """
    correlations = df.corr(method, numeric_only=True)
    heatmap_options = dict(
        vmin=-1.0,
        vmax=1.0,
        cmap="icefire",
        method="complete",  # linkage used for the clustering, not the correlation
        annot=annot,
    )
    sns.clustermap(correlations, **heatmap_options, **kwargs)


# Correlation cluster map of the preprocessed training data; annot=None is
# passed so the cells are drawn without value labels.
corrplot(df_train, annot=None)

In [None]:

class CrossFoldEncoder:
    """Apply a supervised encoder out-of-fold.

    Each row of the training data is encoded by an encoder that was fitted on
    the *other* folds, so a row's own target value never leaks into its
    encoding. One fitted encoder per fold is kept for transforming new data.
    """

    def __init__(self, encoder, **kwargs):
        """Store the encoder class, its keyword arguments and a 5-fold splitter."""
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # forwarded to the encoder constructor
        self.cv_ = KFold(n_splits=5)

    def fit_transform(self, X, y, cols):
        """Fit one encoder per fold and return the out-of-fold encodings.

        For every split, an encoder is fitted on the first index set yielded
        by the splitter and used to transform the rows of the second one.
        The pieces are concatenated in split order, and the returned columns
        are ``cols`` with an "_encoded" suffix.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        encoded_parts = []
        for fit_rows, apply_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            held_out = fold_encoder.transform(X.iloc[apply_rows, :])
            encoded_parts.append(held_out[cols])
            self.fitted_encoders_.append(fold_encoder)
        X_encoded = pd.concat(encoded_parts)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    def transform(self, X):
        """Encode new data as the average of the per-fold encoders' outputs."""
        from functools import reduce

        def add_frames(left, right):
            # Element-wise sum where a value missing on one side counts as 0.
            return left.add(right, fill_value=0)

        per_fold = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]
        X_encoded = reduce(add_frames, per_fold) / len(per_fold)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded


## 5.3 Handling Outliers - PCA Application

### 5.3.1 Interquartile Range (IQR) method

The code uses the **Interquartile Range (IQR) method** to handle outliers in numerical columns of a DataFrame. It first calculates the skewness of each column and then identifies outliers based on their deviation from the IQR. Outliers are capped at a certain threshold defined by 1.5 times the IQR above the third quartile or below the first quartile. Finally, the skewness of each column is recalculated to verify the effectiveness of outlier handling.

In [None]:
def indicate_outliers(df, skew_threshold=1.0, iqr_factor=1.5, verbose=True):
    """Cap outliers of strongly skewed numeric columns using the IQR rule.

    Despite its name, the function does not add indicator columns: it clips
    values to ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    skew_threshold : float, default 1.0
        Only columns whose absolute skewness exceeds this value are capped.
    iqr_factor : float, default 1.5
        Multiple of the inter-quartile range that defines the caps.
    verbose : bool, default True
        Print the skewness of every numeric column and what was done to it.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns capped (as floats).

    Notes
    -----
    Columns whose inter-quartile range is zero are skipped. For such columns
    (typically mostly-zero features) both caps equal the quartile value, so
    capping would overwrite every other value and turn the feature into a
    constant.
    """
    capped = df.copy()
    for column_name in capped.select_dtypes(include=np.number).columns:
        # Calculate skewness
        skewness = capped[column_name].skew()
        if verbose:
            print("Skewness of column '{}': {:.2f}".format(column_name, skewness))

        # Only touch columns that are skewed beyond the threshold
        if not abs(skewness) > skew_threshold:
            continue

        # Calculate interquartile range (IQR)
        q1 = capped[column_name].quantile(0.25)
        q3 = capped[column_name].quantile(0.75)
        iqr = q3 - q1
        if not iqr > 0:
            if verbose:
                print("Column '{}' skipped: zero interquartile range.".format(column_name))
            continue

        # Cap outliers at the lower/upper fences; NaN values stay NaN
        lower_bound = q1 - iqr_factor * iqr
        upper_bound = q3 + iqr_factor * iqr
        capped[column_name] = capped[column_name].astype(float).clip(
            lower=lower_bound, upper=upper_bound
        )

        if verbose:
            print("Outliers handled for column '{}'.".format(column_name))

    return capped



# 6. Holdout Validation

1. Split the dataset into two subsets: a training set and a validation set.
2. Allocate approximately 80% of the data to the training set and 20% to the validation set.
3. Train the machine learning model using the training set.
4. Evaluate the model's performance using the validation set.
5. Assess the model's performance with a regression metric, such as RMSE, MAE, or RMSLE (the metric used by this competition).
6. Adjust the model's hyperparameters based on the validation performance.
7. Repeat the training and evaluation process with the updated hyperparameters.
8. Continue iterating until satisfactory performance is achieved on the validation set.
9. Ensure that the model generalizes well to unseen data by validating its performance on the validation set.

In [None]:
def create_features(df, df_test=None):
    """Run the full feature-engineering pipeline on the training (and test) data.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed training split; must contain ``SalePrice``.
    df_test : pandas.DataFrame, optional
        Preprocessed test split. It must also carry a ``SalePrice`` column,
        because that column is popped from the copy below.

    Returns
    -------
    X : pandas.DataFrame
        Engineered, label-encoded training features (when ``df_test`` is None).
    (X, X_test) : tuple of pandas.DataFrame
        Engineered training and test features (when ``df_test`` is given).
    """
    X = df.copy()
    y = X.pop("SalePrice")
    # MI scores are computed on the training rows only.
    mi_scores = make_mi_scores(X, y)

    # Combine splits if test data is given
    #
    # If we're creating features for test set predictions, we should
    # use all the data we have available. After creating our features,
    # we'll recreate the splits.
    if df_test is not None:
        X_test = df_test.copy()
        X_test.pop("SalePrice")
        X = pd.concat([X, X_test])

    # Lesson 2 - Mutual Information
    X = drop_uninformative(X, mi_scores)

    # Lesson 3 - Transformations
    X = X.join(mathematical_transforms(X))

    # interaction_3 returns columns named like existing ones (LotArea,
    # GarageArea, PoolArea), so the join needs suffixes to tell the original
    # ("_existing") and transformed ("_new") versions apart.
    X = X.join(interaction_3(X), lsuffix='_existing', rsuffix='_new')


    X = X.join(interactions(X))
    X = X.join(counts(X))
    # X = X.join(break_down(X))
    X = X.join(group_transforms(X))

    # Lesson 4 - Clustering
    # X = X.join(cluster_labels(X, cluster_features, n_clusters=20))
    # X = X.join(cluster_distance(X, cluster_features, n_clusters=20))

    # Lesson 5 - PCA
    X = X.join(pca_inspired(X))
    #X = X.join(pca_components(X, pca_features))
    # Outlier capping runs on whatever rows X holds at this point, i.e. on
    # train and test together when df_test was given.
    X = indicate_outliers(X)

    # Replace category columns by their integer codes.
    X = label_encode(X)

    # Reform splits
    if df_test is not None:
        X_test = X.loc[df_test.index, :]
        X.drop(df_test.index, inplace=True)

    # Lesson 6 - Target Encoder
    # Fitted out-of-fold on the training rows only; the test rows receive the
    # average of the per-fold encoders.
    encoder = CrossFoldEncoder(MEstimateEncoder, m=1)
    X = X.join(encoder.fit_transform(X, y, cols=["MSSubClass"]))
    if df_test is not None:
        X_test = X_test.join(encoder.transform(X_test))

    if df_test is not None:
        return X, X_test
    else:
        return X

# NOTE(review): this filter is applied to the exploratory frame `df` only.
# load_data() below reads the CSV files again, so df_train still contains the
# rows with GrLivArea >= 4000 -- confirm whether the outlier removal was meant
# to be applied to df_train instead.
df = df[df.GrLivArea < 4000]  #removing outliers per the information file guidance

# Reload the preprocessed splits and score the engineered training features.
df_train, df_test = load_data()
X_train = create_features(df_train)
y_train = df_train.loc[:, "SalePrice"]

score_dataset(X_train, y_train)

In [None]:
# Summarise the engineered training matrix built in the previous cell.
# (`X` is a leftover from the feature-selection section, not the engineered
# data.) info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
X_train.info()

# 7. Model Comparison

## 7.1 Gradient Boosted Decision Trees (GBDTs)


### XGBoost: eXtreme Gradient Boosting

### LightGBM: Light Gradient Boosting Machine

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import lightgbm as lgb

# Build the engineered training matrix from the preprocessed training split
X_train = create_features(df_train)
y_train = df_train['SalePrice']

# Define numerical and categorical features
numerical_features = df_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df_train.select_dtypes(include=['object']).columns
# Define XGBoost parameters
xgb_params = {
    'max_depth': 3,
    'learning_rate': 0.001,
    'n_estimators': 6000,
    'min_child_weight': 1,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'reg_alpha': 0.5,
    'reg_lambda': 1.0,
    'num_parallel_tree': 1
}

# Initialize XGBoost model
xgb = XGBRegressor(**xgb_params)

# Score dataset using XGBoost. Only the last expression of a cell is
# displayed, so the score is kept in a variable and printed explicitly;
# otherwise the XGBoost result would be discarded silently.
xgb_score = score_dataset(X_train, y_train, xgb)
print(f"XGBoost score: {xgb_score:.5f} RMSLE")

# Define LightGBM parameters
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train the LightGBM model.
# NOTE: this fit uses the raw (not log-transformed) target and does not
# influence the score below, because cross-validation refits the estimator
# on log(SalePrice) for every fold.
model = lgb.LGBMRegressor(**lgb_params)
model.fit(X_train, y_train)

# Score dataset using cross-validation
lgb_score = score_dataset(X_train, y_train, model)
print(f"LightGBM score: {lgb_score:.5f} RMSLE")


In [None]:
X_train, X_test = create_features(df_train, df_test)
y_train = df_train.loc[:, "SalePrice"]

# The final model is LightGBM (configured with lgb_params).
final_model = lgb.LGBMRegressor(**lgb_params)

# The model minimizes squared error, but the competition loss is RMSLE.
# So, we log-transform the target for training and exp-transform the
# predictions. The target used here is `y_train`, which belongs to the rows
# of X_train; the previously used `y` was a leftover global from an earlier
# cell and only matched by coincidence.
final_model.fit(X_train, np.log(y_train))
predictions = np.exp(final_model.predict(X_test))

output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

## 7.2 Conclusion

**In this project, I opted for LightGBM (LGBM) and found it delivered better results compared to XGBoost, particularly for my dataset of roughly 3000 records. Despite LGBM's known efficiency with categorical features, I still applied preprocessing steps, specifically label encoding, to ensure consistency across models. LGBM's inherent ability to handle categorical data efficiently and its faster training times contributed significantly to the improved performance. This approach allowed for quicker iteration and more extensive hyperparameter tuning, giving LGBM an edge.
With XGBoost, I achieved a score of `0.13374` on the public leaderboard. However, with LightGBM, the score improved to `0.12824`, indicating its superior predictive capability in this particular context.
Although XGBoost is a powerful tool, requiring additional preprocessing and more detailed tuning efforts might have limited its performance in my specific case, leading to LGBM's superior outcome.**
