Scratch implementation of blending

In [4]:
from sys import implementation

import pandas as pd
# loading the dataset
data = pd.read_csv('train.csv')
# Display the first few rows and basic information
data_info = data.info()
data_head = data.head()

data_info, data_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

(None,
    Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
 0   1          60       RL         65.0     8450   Pave   NaN      Reg   
 1   2          20       RL         80.0     9600   Pave   NaN      Reg   
 2   3          60       RL         68.0    11250   Pave   NaN      IR1   
 3   4          70       RL         60.0     9550   Pave   NaN      IR1   
 4   5          60       RL         84.0    14260   Pave   NaN      IR1   
 
   LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
 0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
 1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
 2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
 3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
 4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   
 
   YrSold  SaleType  SaleCondition  SalePrice  

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# Loading the dataset
data = pd.read_csv('train.csv')
# Separate features and target variable
X = data.drop(columns=["SalePrice", "Id"])
y = data["SalePrice"]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Preprocessing for numerical data
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Preprocessing for categorical data
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols)
    ])

# Apply preprocessing to the training and validation data
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

X_train_processed.shape, X_val_processed.shape


((1168, 285), (292, 285))

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor

# Train individual base models
base_models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42, n_estimators=100)
}

# Fit each base model and make predictions
base_predictions_train = []
base_predictions_val = []
base_mse = {}

for name, model in base_models.items():
    model.fit(X_train_processed, y_train)
    y_pred_train = model.predict(X_train_processed)
    y_pred_val = model.predict(X_val_processed)
    base_predictions_train.append(y_pred_train)
    base_predictions_val.append(y_pred_val)
    base_mse[name] = mean_squared_error(y_val, y_pred_val)

# Combine base predictions for meta-model training
meta_X_train = np.column_stack(base_predictions_train)
meta_X_val = np.column_stack(base_predictions_val)

# Train a blending model (meta-model)
meta_model = LinearRegression()
meta_model.fit(meta_X_train, y_train)
meta_y_pred_val = meta_model.predict(meta_X_val)

# Calculate MSE for the blended model
blended_mse = mean_squared_error(y_val, meta_y_pred_val)

base_mse, blended_mse


({'LinearRegression': np.float64(868803081.0534712),
  'DecisionTree': np.float64(1555102828.2465754),
  'RandomForest': np.float64(829350970.5694491)},
 np.float64(1555102828.2465744))

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Ensure data is numeric and handle categorical variables
# Assuming X_train and X_val are pandas DataFrames
X_train = X_train.apply(pd.to_numeric, errors='coerce')  # Convert to numeric, setting invalid entries to NaN
X_val = X_val.apply(pd.to_numeric, errors='coerce')

# Handle missing values by filling with 0 (can also use imputation)
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)

# Convert target variable (y_train, y_val) to numeric if needed
y_train = pd.to_numeric(y_train, errors='coerce').fillna(0)
y_val = pd.to_numeric(y_val, errors='coerce').fillna(0)

# Train three models
model1 = LinearRegression()
model2 = DecisionTreeRegressor(random_state=42)
model3 = RandomForestRegressor(n_estimators=50, random_state=42)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

# Predictions on validation set
pred1 = model1.predict(X_val)
pred2 = model2.predict(X_val)
pred3 = model3.predict(X_val)

# Blending (average predictions)
blended_pred = (pred1 + pred2 + pred3) / 3

# Calculate MSE for each model and the blended model
mse_model1 = mean_squared_error(y_val, pred1)
mse_model2 = mean_squared_error(y_val, pred2)
mse_model3 = mean_squared_error(y_val, pred3)
mse_blended = mean_squared_error(y_val, blended_pred)

print("MSE Model for Linear Regression:", mse_model1)
print("MSE Model for SVR:", mse_model2)
print("MSE Model for Decision Tree Regressor:", mse_model3)
print("MSE for Blended Model", mse_blended)

# comparing to see if blending outperforms individual models
if mse_blended < min(mse_model1, mse_model2, mse_model3):
    print("Blending improved the accuracy compared to the individual models")
else:
    print("Blending failed to improve the accuracy")

MSE Model for Linear Regression: 1300424286.328339
MSE Model for SVR: 1579668220.3630137
MSE Model for Decision Tree Regressor: 874893796.5472958
MSE for Blended Model 946755614.3439462
Blending failed to improve the accuracy


Scratch implementation of bagging

In [20]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# loading the dataset
data = pd.read_csv('train.csv')

# Ensure no missing values in the selected columns
data_cleaned = data[['GrLivArea', 'YearBuilt', 'SalePrice']].dropna()

# Select features and target
X = data_cleaned[['GrLivArea', 'YearBuilt']]
y = data_cleaned['SalePrice']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# bagging implementation with corrected signature
def bagging(X_train, y_train, X_val, base_model, n_estimators=15):
    predictions = np.zeros((X_val.shape[0], n_estimators))
    for i in range(n_estimators):
        # Random sampling with replacement
        idx = np.random.choice(len(X_train), size=len(X_train), replace=True)
        X_sample, y_sample = X_train.iloc[idx], y_train.iloc[idx]

        # Training the base model
        model = base_model()
        model.fit(X_sample, y_sample)

        # Storing the predictions
        predictions[:, i] = model.predict(X_val)
    # Averaging predictions across all models
    final_prediction = predictions.mean(axis=1)
    return final_prediction

# base model for bagging
base_model = DecisionTreeRegressor

# applying the bagging model
n_estimators = 15
bagged_predictions = bagging(X_train, y_train, X_val, base_model, n_estimators)
# evaluating the bagging model
mse_bagging = mean_squared_error(y_val, bagged_predictions)
# comparing with a single decision tree model
single_model = base_model()
single_model.fit(X_train, y_train)
single_prediction = single_model.predict(X_val)
mse_single = mean_squared_error(y_val, single_prediction)

# outputting the results
print(f'MSE for single Decision Tree: {mse_single}')
print(f'MSE for Bagging with {n_estimators} Decision Trees: {mse_bagging}')

if mse_bagging < mse_single:
    print('Bagging improved the accuracy.')
    print('Bagging did not improve the accuracy.')

MSE for single Decision Tree: 2289084434.8813734
MSE for Bagging with 15 Decision Trees: 1679400820.5945714
Bagging improved the accuracy.
Bagging did not improve the accuracy.


Scratch implementation of stacking

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Load dataset
data = pd.read_csv("train.csv")

# Select explanatory variables and dependent variable
X = data[["GrLivArea", "YearBuilt"]]
y = data["SalePrice"]

# Split data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Stacking implementation
def stacking(X_train, y_train, X_val, base_models, meta_model, n_folds=5):
    # Step 1: Generate blended data for training (Stage 0)
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    base_train_predictions = np.zeros((X_train.shape[0], len(base_models)))  # For Stage 1 training
    base_test_predictions = np.zeros((X_val.shape[0], len(base_models)))    # For final prediction
    for i, model in enumerate(base_models):
        fold_predictions = np.zeros((X_val.shape[0], n_folds))
        for j, (train_idx, val_idx) in enumerate(kf.split(X_train)):
            # Split the training data into folds
            X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
            X_val_fold, y_val_fold = X_train.iloc[val_idx], y_train.iloc[val_idx]

            # Train the base model on (K-1) folds
            model.fit(X_train_fold, y_train_fold)

            # Predict on the validation fold
            base_train_predictions[val_idx, i] = model.predict(X_val_fold)

            # Predict on the test set for this fold
            fold_predictions[:, j] = model.predict(X_val)

        # Average predictions across all folds for the test set
        base_test_predictions[:, i] = fold_predictions.mean(axis=1)

    # Step 2: Train the meta-model (Stage 1)
    meta_model.fit(base_train_predictions, y_train)

    # Step 3: Predict on the test set using the meta-model
    final_predictions = meta_model.predict(base_test_predictions)

    return final_predictions
# Define base models
base_models = [
    LinearRegression(),
    DecisionTreeRegressor(max_depth=5, random_state=42),
    SVR(kernel="linear")
]

# Define meta-model
meta_model = LinearRegression()

# Apply stacking
stacked_predictions = stacking(X_train, y_train, X_val, base_models, meta_model, n_folds=5)

# Evaluate the stacking model
mse_stacking = mean_squared_error(y_val, stacked_predictions)

# Compare with a single model (e.g., Linear Regression)
single_model = LinearRegression()
single_model.fit(X_train, y_train)
single_predictions = single_model.predict(X_val)
mse_single = mean_squared_error(y_val, single_predictions)

# Print results
print(f"Mean Squared Error for Single Model (Linear Regression): {mse_single}")
print(f"Mean Squared Error for Stacking Model: {mse_stacking}")

if mse_stacking < mse_single:
    print("Stacking improved the accuracy!")
else:
    print("Stacking did not improve the accuracy.")

Mean Squared Error for Single Model (Linear Regression): 2495554898.6683216
Mean Squared Error for Stacking Model: 1938950737.699865
Stacking improved the accuracy!
