In [662]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression,  Ridge
from statsmodels.stats.diagnostic import het_breuschpagan
from sklearn.preprocessing import RobustScaler
from scipy.interpolate import UnivariateSpline
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from statsmodels.api import OLS
import statsmodels.api as sm
from scipy import stats
import seaborn as sns
import pandas as pd
import numpy as np

In [663]:
def load_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    return train_data, test_data

In [664]:
def visualize_data(data, features):
    for feature in features:
        plt.figure(figsize=(10, 4))

        # Boxplot for outlier detection
        plt.subplot(1, 2, 1)
        sns.boxplot(data[feature])
        plt.title(f'Boxplot of {feature}')

        # Histogram for distribution
        plt.subplot(1, 2, 2)
        sns.histplot(data[feature], kde=True)
        plt.title(f'Distribution of {feature}')

        plt.show()

In [665]:
def plot_residuals(residuals, y_pred):
    plt.scatter(y_pred, residuals)
    plt.title('Residuals vs. Predicted Values')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    plt.show()

In [666]:
def normality(residuals):
    stats.probplot(residuals, dist="norm", plot=plt)
    plt.title('Normal Q-Q plot')
    plt.show()

In [667]:
def preprocess_data(data, features, imputer):
    data[features] = imputer.transform(data[features])
    return data

In [668]:
def train_model_statsmodels(X, y):
    X = sm.add_constant(X)  # Adding a constant to the model
    model = sm.OLS(y, X).fit(cov_type='HC0')
    return model

In [669]:
def train_model_sklearn(X, y):
    model = LinearRegression()
    model.fit(X, y)
    return model

In [670]:
def train_model_ridge(X, y):
    model = Ridge()
    model.fit(X, y)
    return model

In [671]:
def train_model_lasso(X, y):
    model = Lasso()
    model.fit(X, y)
    return model

In [672]:
def train_model_enet(X, y):
    model = ElasticNet()
    model.fit(X, y)
    return model

In [673]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1) 

In [674]:
def main():
    # Paths to the datasets
    train_path = 'train.csv'
    test_path = 'test.csv'

    # Load the data
    train_data, test_data = load_data(train_path, test_path)
    
    # Visualize data
    selected_features = ['LotArea','YrSold', 'OverallQual', 'GrLivArea', 'TotalBsmtSF','GarageCars', 'GarageArea', 'MSSubClass', 'YearBuilt']
    #visualize_data(train_data, selected_features)

    # Handling missing values
    imputer = SimpleImputer(strategy='median')
    train_data[selected_features] = imputer.fit_transform(train_data[selected_features])

    # Outlier removal
    train_data = train_data.drop(train_data[(train_data['GrLivArea']>4000) & (train_data['SalePrice']<300000)].index)

    # Apply log transformation to the target variable 'SalePrice'
    train_data['SalePrice'] = np.log1p(train_data['SalePrice'])
    y = train_data['SalePrice']

    # Splitting the train data into X (features) and y (target)
    X = train_data[selected_features]

    # Polynomial features
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly.fit_transform(X)
    
    # Splitting the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.2, random_state=0)
    
    # Training the models
    model_sl = train_model_sklearn(X_train, y_train)
    model_r = train_model_ridge(X_train, y_train)
    model_l = train_model_lasso(X_train, y_train)
    model_en = train_model_enet(X_train, y_train)
    model_sm = train_model_statsmodels(X_train, y_train)
    models = AveragingModels(models = (model_sl, model_r, model_l, model_en, model_sm))

    # Evaluating the model
    X_val = sm.add_constant(X_val)  # Adding a constant to the validation data
    y_pred_log = models.predict(X_val)  # Predicted log-transformed prices
    y_pred = np.expm1(y_pred_log)  # Inverse transformation
    r_squared = r2_score(np.expm1(y_val), y_pred)
    residuals = np.expm1(y_val) - y_pred
    plot_residuals(residuals, y_pred)
    normality(residuals)
    print("R-squared value:", r_squared)

    # Preprocessing the test data
    test_data = preprocess_data(test_data, selected_features, imputer)

    # Polynomial features
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_test = poly.fit_transform(test_data[selected_features])
    
    # Predicting the housing prices for the test data
    X_test = sm.add_constant(X_test)  # Adding a constant to the test data
    predicted_log_prices = models.predict(X_test)  # Predicted log-transformed prices for test data
    predicted_prices = np.expm1(predicted_log_prices)  # Inverse transformation for test data predictions
    ''
    # Saving the predictions
    predicted_prices_df = pd.DataFrame({
        'Id': test_data['Id'],
        'SalePrice': predicted_prices
    })
    predicted_prices_df.to_csv('predicted_housing_prices_statsmodels.csv', index=False)
    ''

In [675]:
if __name__ == "__main__":
    main()

UnboundLocalError: cannot access local variable 'model_sl' where it is not associated with a value