In [117]:
import pandas as pd
import numpy as np

In [118]:
df = pd.read_csv('data/Hitters.csv')

In [119]:
df = df.replace('?', np.nan) # assume ? is na value
full_df = df # df without dropped values, bad stuff should only in salary
df = df.dropna()

In [120]:
qual = ['League', 'Division', 'NewLeague']

for q in qual:
    vals = df[q].unique()
    df.loc[:, [q]] = df[q].map({key: index for index, key in enumerate(vals)})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, [q]] = df[q].map({key: index for index, key in enumerate(vals)})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, [q]] = df[q].map({key: index for index, key in enumerate(vals)})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, [q]] = df[q].map({key: index for index, key i

In [188]:
import itertools
import sklearn
import statsmodels.api as sm
import time
from multiprocessing import Pool

In [189]:
y = df['Salary']
x = df.drop(['Salary'], axis=1).astype('float64')
# set the variables you want to use

In [200]:
# subset selection, don't run this it'll take forever
def calc_subset(feature_set, x, y):
    data = x[list(feature_set)]
    data = sm.add_constant(data)
    model = sm.OLS(y, data)
    result = model.fit()
    residuals = y - result.predict(data)
    error = (residuals * residuals).sum()
    return {'model': result, 'features': list(feature_set), 'rss': error}

def split(a, n): # if i were using processes, i could split using this
    k, m = divmod(len(a), n)
    return list(a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

def enumerate_subsets(k, x, y, threads=1): # enumerate subsets of size k (Gosper's hack can work, but we use itertools)
    print(f'calculating for subsets of size {k}')
    
    start = time.time()
    combos = list(itertools.combinations(x.columns, k))
    
    # multiprocessing fails in jupyter
    # pool = Pool(processes=3)
    # results = pool.map(calc_subset, combos)
    # pool.close()
    # pool.join()
    
    results = []
    for c in combos:
        results.append(calc_subset(c, x, y))
    
    end = time.time()
    elapsed = end - start
    print(f'done with size {k}, took {elapsed} seconds')
    results_df = pd.DataFrame(results)
    best = results_df.loc[results_df['rss'].argmin()]
    return dict(best)

In [191]:
features = len(x.columns)
features = min(features, 3) # otherwise we take too long
subset_selection = {i: enumerate_subsets(i, x, y) for i in range(1, features + 1)}
# i'm skipping the plotting too, because this takes forever

calculating for subsets of size 1
done with size 1, took 0.038878440856933594 seconds
calculating for subsets of size 2
done with size 2, took 0.3381383419036865 seconds
calculating for subsets of size 3
done with size 3, took 1.9585189819335938 seconds


In [195]:
def forwards_stepwise(predictors, x, y):
    remaining = [p for p in x.columns if p not in predictors]
    results = []
    for p in remaining:
        results.append(calc_subset(predictors + [p], x, y))
    models = pd.DataFrame(results)
    best_model = models.loc[models['rss'].argmin()]
    return dict(best_model)

def backwards_stepwise(predictors, x, y):
    results = []
    for combo in itertools.combinations(predictors, len(predictors) - 1):
        results.append(calc_subset(combo, x, y))
    models = pd.DataFrame(results)
    best_model = models.loc[models['rss'].argmin()]
    return dict(best_model)

In [196]:
def forward(x, y):
    models_fwd = pd.DataFrame(columns=['rss', 'model'])

    predictors = []

    for i in range(1, len(x.columns) + 1):
        models_fwd.loc[i] = forwards_stepwise(predictors, x, y)
        predictors = models_fwd.loc[i]['model'].model.exog_names.copy()
        predictors.remove('const')
    
    return models_fwd

def backward(x, y):
    models_bwd = pd.DataFrame(columns=['rss', 'model'])

    predictors = x.columns
    while(len(predictors) > 1):  
        models_bwd.loc[len(predictors) - 1] = backwards_stepwise(predictors, x, y)
        predictors = models_bwd.loc[len(predictors) - 1]['model'].model.exog_names.copy()
        predictors.remove('const')
    
    return models_bwd

In [197]:
models_fwd = forward(x, y)
models_bwd = backward(x, y)

print(models_fwd.loc[7]['model'].summary())
print('---------------')
print(models_bwd.loc[7]['model'].summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.513
Model:                            OLS   Adj. R-squared:                  0.500
Method:                 Least Squares   F-statistic:                     38.41
Date:                Mon, 25 Jul 2022   Prob (F-statistic):           1.50e-36
Time:                        20:00:16   Log-Likelihood:                -1885.4
No. Observations:                 263   AIC:                             3787.
Df Residuals:                     255   BIC:                             3815.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -17.3351     64.535     -0.269      0.7

In [220]:
# masks
train_mask = np.full(len(x), True) 
train_mask[len(x) // 2:len(x)] = False
np.random.shuffle(train_mask)
test_mask = np.logical_not(train_mask)

In [228]:
# datasets
x_train = x[train_mask] 
y_train = y[train_mask]
x_test = x[test_mask]
y_test = y[test_mask]

In [244]:
# using train and test data
fwd_train = forward(x_train, y_train)
fwd_test = []
for i in range(1, len(fwd_train) + 1):
    model = fwd_train.loc[i]['model']
    features = model.model.exog_names.copy()
    features.remove('const')
    eval_data = sm.add_constant(x_test[features])
    residuals = y_test - fwd_train.loc[i]['model'].predict(eval_data)
    error = (residuals * residuals).sum()
    fwd_test.append(error)
fwd_train.insert(2, 'test_error', fwd_test, True)
# we can also perform cross validation in the forward function to make our stepwise better, but i'm not doing that
# not hard, bue tedious
fwd_train

Unnamed: 0,rss,model,test_error
1,19031920.0,<statsmodels.regression.linear_model.Regressio...,17985280.0
2,17060330.0,<statsmodels.regression.linear_model.Regressio...,15348000.0
3,16215850.0,<statsmodels.regression.linear_model.Regressio...,15298010.0
4,15518920.0,<statsmodels.regression.linear_model.Regressio...,16561810.0
5,14971270.0,<statsmodels.regression.linear_model.Regressio...,15851120.0
6,14637240.0,<statsmodels.regression.linear_model.Regressio...,16857860.0
7,14056330.0,<statsmodels.regression.linear_model.Regressio...,15413110.0
8,13644270.0,<statsmodels.regression.linear_model.Regressio...,17346480.0
9,13150730.0,<statsmodels.regression.linear_model.Regressio...,17715160.0
10,12681890.0,<statsmodels.regression.linear_model.Regressio...,16552270.0


In [245]:
best_model = fwd_train.loc[fwd_train['test_error'].argmin()]
print(best_model['model'].summary()) # 2 var model the best?

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.327
Method:                 Least Squares   F-statistic:                     32.64
Date:                Mon, 25 Jul 2022   Prob (F-statistic):           3.50e-12
Time:                        20:23:14   Log-Likelihood:                -957.28
No. Observations:                 131   AIC:                             1921.
Df Residuals:                     128   BIC:                             1929.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         75.7880     75.416      1.005      0.3