In [593]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [594]:
def load_data(train_path, test_path):
    """Read the training and test CSV files into DataFrames.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training CSV.
    test_path : str or path-like
        Location of the test CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training frame followed by the test frame.
    """
    # The generator is consumed left to right, so the training file is still
    # read (and can still fail) before the test file is touched.
    train_frame, test_frame = (
        pd.read_csv(csv_path) for csv_path in (train_path, test_path)
    )
    return train_frame, test_frame

In [595]:
def visualize_data(data, features):
    """Plot each feature as a boxplot next to a histogram with a KDE overlay.

    One 10x4 inch figure is produced and shown per entry in ``features``:
    the left panel is a boxplot (for spotting outliers), the right panel is
    the distribution of the same column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to plot.
    features : iterable of str
        Column names to visualise; nothing is drawn when it is empty.
    """
    for feature in features:
        column = data[feature]
        _, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(10, 4))

        # Left panel: boxplot for outlier detection
        sns.boxplot(column, ax=box_ax)
        box_ax.set_title(f'Boxplot of {feature}')

        # Right panel: histogram for the distribution
        sns.histplot(column, kde=True, ax=hist_ax)
        hist_ax.set_title(f'Distribution of {feature}')

        plt.show()

In [596]:
def remove_outliers(data, features, threshold=3):
    """Drop rows whose z-score magnitude reaches ``threshold`` in any feature.

    Z-scores are computed per column with the population standard deviation
    (ddof=0) while ignoring missing values.  A z-score that cannot be computed
    (a missing cell, or a zero-variance column where the division is 0/0)
    never marks a row as an outlier.  Without that rule a single NaN would
    propagate through the whole column and every row would be discarded,
    which matters because this runs before imputation.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    features : list of str
        Numeric columns to screen for outliers.
    threshold : float, default 3
        Rows are kept only if ``abs(z) < threshold`` for every computable
        z-score among ``features``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows, with the original index
        labels, so callers can assign into it without a
        ``SettingWithCopyWarning``.
    """
    if len(data) == 0:
        return data.copy()

    values = data[features].to_numpy(dtype=float, na_value=np.nan)
    column_means = np.nanmean(values, axis=0)
    column_stds = np.nanstd(values, axis=0)

    # Zero-variance columns give 0/0 here; the resulting NaN is handled below.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = (values - column_means) / column_stds

    # Comparisons against NaN are False, so uncomputable z-scores keep the row.
    is_outlier = (np.abs(z_scores) >= threshold).any(axis=1)
    return data.loc[~is_outlier].copy()

In [597]:
def preprocess_data(data, features, imputer):
    """Return a copy of ``data`` with ``features`` filled in by ``imputer``.

    The input frame is left untouched so that re-running the calling cell, or
    inspecting the raw data afterwards, still sees the original values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute.
    features : list of str
        Columns handed to the imputer and overwritten with its output.
    imputer : object
        An already-fitted transformer exposing ``transform(frame)`` that
        returns one value per input cell (e.g. a fitted ``SimpleImputer``).

    Returns
    -------
    pandas.DataFrame
        A new frame; columns outside ``features`` are carried over unchanged.
    """
    imputed = data.copy()
    imputed[features] = imputer.transform(imputed[features])
    return imputed

In [598]:
def train_model_statsmodels(X, y):
    """Fit an ordinary-least-squares regression of ``y`` on ``X``.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix; it is passed through ``sm.add_constant`` before fitting.
    y : array-like or pandas.Series
        Target values.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(...).fit()``.
    """
    design_matrix = sm.add_constant(X)
    return sm.OLS(y, design_matrix).fit()

In [599]:
def train_model_sklearn(X, y):
    """Fit a default ``LinearRegression`` on ``X`` and ``y``.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Target values.

    Returns
    -------
    The fitted ``LinearRegression`` estimator.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return LinearRegression().fit(X, y)

In [600]:
def _exog_for_model(model, feature_frame):
    """Build the exog frame that the fitted statsmodels ``model`` expects.

    ``sm.add_constant`` defaults to ``has_constant='skip'``, which silently
    leaves the intercept column out whenever some feature happens to be
    constant within the frame being scored, so ``predict`` then fails on a
    shape mismatch.  The constant is therefore always added, and the columns
    are selected by the names the model was trained with.
    """
    exog = sm.add_constant(feature_frame, has_constant='add')
    return exog[model.model.exog_names]


def main(train_path='train.csv', test_path='test.csv', output_path=None):
    """Train a log-price linear regression and predict the test-set prices.

    Steps: load the CSVs, drop z-score outliers from the training data,
    mean-impute the selected features, fit OLS (statsmodels) on an 80/20
    train/validation split, report 16-fold cross-validation scores from the
    equivalent scikit-learn model, report validation R-squared on the original
    price scale, then predict prices for the test set.

    Parameters
    ----------
    train_path : str, default 'train.csv'
        Training CSV; must contain the selected features and 'SalePrice'.
    test_path : str, default 'test.csv'
        Test CSV; must contain the selected features.
    output_path : str or None, default None
        When given, the predictions are written there as CSV with the columns
        'Id' and 'SalePrice' (the test CSV must then contain 'Id').

    Returns
    -------
    pandas.Series
        Predicted sale prices, indexed like the test data.
    """
    # Load the data
    train_data, test_data = load_data(train_path, test_path)

    selected_features = ['LotArea', 'OverallQual', 'GrLivArea', 'TotalBsmtSF','GarageCars', 'GarageArea', 'MSSubClass', 'YearBuilt', 'YearRemodAdd']
    # Uncomment to inspect the feature distributions:
    # visualize_data(train_data, selected_features)

    # Remove outliers from train data
    train_data = remove_outliers(train_data, selected_features)

    # Handling missing values.  X is built as a fresh frame rather than being
    # assigned back into the filtered ``train_data`` slice, which would raise
    # a SettingWithCopyWarning.
    # NOTE(review): the imputer is fit before the train/validation split, so
    # validation rows influence the imputed means.
    imputer = SimpleImputer(strategy='mean')
    X = pd.DataFrame(
        imputer.fit_transform(train_data[selected_features]),
        columns=selected_features,
        index=train_data.index,
    )

    # Apply log transformation to the target variable 'SalePrice'
    y = np.log1p(train_data['SalePrice'])

    # Splitting the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

    # Training the model with statsmodels
    model = train_model_statsmodels(X_train, y_train)

    # Train a similar model using sklearn
    model_sklearn = train_model_sklearn(X_train, y_train)

    # Perform cross-validation
    scores = cross_val_score(model_sklearn, X, y, cv=16)  # 'cv' is the number of folds

    # Print the results
    print("Cross-Validation Scores:", scores)
    print("Average Score:", np.mean(scores))

    # Evaluating the model on the original price scale
    y_pred_log = model.predict(_exog_for_model(model, X_val))  # Predicted log-transformed prices
    y_pred = np.expm1(y_pred_log)  # Inverse transformation
    r_squared = r2_score(np.expm1(y_val), y_pred)
    print("R-squared value:", r_squared)

    # Preprocessing the test data with the imputer fitted on the training data
    test_data = preprocess_data(test_data, selected_features, imputer)

    # Predicting the housing prices for the test data
    X_test = _exog_for_model(model, test_data[selected_features])
    predicted_log_prices = model.predict(X_test)  # Predicted log-transformed prices for test data
    predicted_prices = np.expm1(predicted_log_prices)  # Inverse transformation for test data predictions

    # Saving the predictions (only when an output path was requested)
    if output_path is not None:
        predicted_prices_df = pd.DataFrame({
            'Id': test_data['Id'],
            'SalePrice': predicted_prices
        })
        predicted_prices_df.to_csv(output_path, index=False)

    return predicted_prices

In [601]:
# Run the whole pipeline when this cell/script executes as the top-level
# program (i.e. when __name__ is "__main__").
if __name__ == "__main__":
    main()

Cross-Validation Scores: [0.86090301 0.83666934 0.87552109 0.91423833 0.84042657 0.85808543
 0.88026725 0.81791043 0.84328562 0.85388187 0.85550564 0.85885555
 0.84746304 0.84169701 0.83331362 0.84597192]
Average Score: 0.8539997317488008
R-squared value: 0.8736738683777534
