<left>
<img src="idc_logo.png" width=100 align="left">
</left>

<H4>Advanced Stats. for Data Science (Spring 2022)</H4>

<H1>Examples from Lecture 10 </H1>

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

import scipy
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

np.random.seed(2021)
plt.rcParams["figure.figsize"] = (10, 7)
plt.rcParams.update({'font.size': 18})
%matplotlib inline

# Model Selection

### Forward Selection:

In [6]:
import statsmodels.formula.api as smf


def forward_select(data, response, sig_level=.05):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by R-squared
    """
    
    remaining = [c for c in data.columns if not c == response]

    selected = []
    current_score, best_new_score = 0.0, 0.0
    best_Ftest_pval = 0
    prev_R2 = 1
    n = len(data)
    SS_prev = data[response].var() * n
    Ftest_pval = 0

    itr = 0
    while remaining and Ftest_pval < sig_level:
        itr += 1
        R2_with_candidates = []
        SS_with_candidates = {}
        for candidate in remaining:
            varst = ' + '.join(selected + [candidate])
            formula = f"{response} ~ {varst} + 1"
            fitted = smf.ols(formula, data).fit()
            R2 = fitted.rsquared
            SS_candidate = fitted.ssr
            R2_with_candidates.append((R2, candidate))
            SS_with_candidates[candidate] = SS_candidate

        R2_with_candidates.sort()
        best_new_R2, best_candidate = R2_with_candidates.pop()
        SS_new = SS_with_candidates[best_candidate]
        f = (SS_prev - SS_new) / (SS_new / (n - len(selected) + 1))
        Ftest_pval = scipy.stats.f.sf(f, dfn = 1, dfd = (n - len(selected) + 1))
        SS_prev = SS_new
        print(f"=========  Step {itr} ==========")
        print(f"Best Candidate = {best_candidate}")
        print(f"Best R2 = {best_new_R2}")
        print(f"Ftest = {Ftest_pval}")
        print("=============================")
        
        if Ftest_pval < sig_level:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
        else:
            print(f"We do not include {best_candidate}")

    formula = f"{response} ~ {' + '.join(selected)} + 1"
    model = smf.ols(formula, data).fit()
    return model

#### House Prices Dataset

In [11]:
def load_house_prices_data(path = "housing_prices.csv"):
    """
    Args:
    -----
    path:  path to csv file
    
    Load and clean house prices data:
        filters for numeric predictors only
        filters for small lots only
        renames varaibles so that all variable names begins with [a-z]
        applies a variance stabilizing transformation to SalePrice
        removes outliers
        
    """
    
    
    def detect_outliers(df, q=0.01):
        lower_outliers = df < df.quantile(q)
        upper_outliers = df > df.quantile(1-q)
        return lower_outliers | upper_outliers
    
    data_raw = pd.read_csv(path)
    data1 = data_raw[data_raw.LotArea < 15000]  # focus on small lots
    data1 = data1.select_dtypes('number').dropna()
    data1 = data1.rename( # stats model formula cannot have
        # covaraite names starting with non letter
        columns = {'1stFlrSF': 'FirstFlrSF',
                   '2ndFlrSF': 'SecondFlrSF'}) 
                                                              
    variables =[
        'SalePrice',
        'LotArea', 
        'YearBuilt',
         'YrSold', 'MoSold', 
         'Fireplaces', 
        'GarageCars', 'ScreenPorch', 
         'HalfBath', 'FullBath',
         'GrLivArea', 
         'BedroomAbvGr',
        'FirstFlrSF', 
        'SecondFlrSF',
        'TotRmsAbvGrd',
        'LowQualFinSF', 'TotalBsmtSF',
        'LotFrontage', 'WoodDeckSF',
         'OverallQual',
         'OverallCond'
    ]

    data1 = data1.filter(variables).dropna()
    data1['SalePrice'] = np.log(1 + data1['SalePrice'])
    
    mask = detect_outliers(data1, .01).any(1)
    print(f"Masked a fraction of {mask.mean()} of the data due to outliers")
    return data1[~mask]


In [12]:
data = load_house_prices_data()
model = forward_select(data, 'SalePrice')

print(model.model.formula)
print(model.rsquared_adj)


Masked a fraction of 0.14971209213051823 of the data due to outliers
Best Candidate = OverallQual
Best R2 = 0.6689517038250234
Ftest = 2.3393646766581073e-215
Best Candidate = GrLivArea
Best R2 = 0.7418020624311632
Ftest = 8.675999040272048e-50
Best Candidate = YearBuilt
Best R2 = 0.797950202384128
Ftest = 4.306857677256795e-49
Best Candidate = TotalBsmtSF
Best R2 = 0.8325353374261344
Ftest = 5.901427457190616e-38
Best Candidate = OverallCond
Best R2 = 0.8572755538516426
Ftest = 1.549728307056524e-32
Best Candidate = LotArea
Best R2 = 0.8686279342850337
Ftest = 1.2569961366420847e-17
Best Candidate = BedroomAbvGr
Best R2 = 0.872762532327837
Ftest = 1.1186433179551268e-07
Best Candidate = Fireplaces
Best R2 = 0.8749769336795533
Ftest = 8.510687641326578e-05
Best Candidate = GarageCars
Best R2 = 0.8771183724824037
Ftest = 9.785592823090597e-05
Best Candidate = WoodDeckSF
Best R2 = 0.8782798465364912
Ftest = 0.0038919322204984687
Best Candidate = HalfBath
Best R2 = 0.8793763955123856
Ftes