In [117]:
import pandas as pd
import numpy as np

In [118]:
df = pd.read_csv('data/Hitters.csv')

In [119]:
df = df.replace('?', np.nan) # assume ? is na value
full_df = df # df without dropped values, bad stuff should only in salary
df = df.dropna()

In [120]:
qual = ['League', 'Division', 'NewLeague']

for q in qual:
    vals = df[q].unique()
    df.loc[:, [q]] = df[q].map({key: index for index, key in enumerate(vals)})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, [q]] = df[q].map({key: index for index, key in enumerate(vals)})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, [q]] = df[q].map({key: index for index, key in enumerate(vals)})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, [q]] = df[q].map({key: index for index, key i

In [188]:
import itertools
import sklearn
import statsmodels.api as sm
import time
from multiprocessing import Pool

In [189]:
y = df['Salary']
x = df.drop(['Salary'], axis=1).astype('float64')
# set the variables you want to use

In [190]:
# subset selection, don't run this it'll take forever
def calc_subset(feature_set, x, y):
    data = x[list(feature_set)]
    data = sm.add_constant(data)
    model = sm.OLS(y, data)
    result = model.fit()
    residuals = y - result.predict(data)
    error = (residuals * residuals).sum()
    return {'model': result, 'features': list(feature_set), 'rss': error}

def split(a, n): # if i were using processes, i could split using this
    k, m = divmod(len(a), n)
    return list(a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

def enumerate_subsets(k, x, y, threads=1): # enumerate subsets of size k (Gosper's hack can work, but we use itertools)
    print(f'calculating for subsets of size {k}')
    
    start = time.time()
    combos = list(itertools.combinations(x.columns, k))
    
    # multiprocessing fails in jupyter
    # pool = Pool(processes=3)
    # results = pool.map(calc_subset, combos)
    # pool.close()
    # pool.join()
    
    results = []
    for c in combos:
        results.append(calc_subset(c, x, y))
    
    end = time.time()
    elapsed = end - start
    print(f'done with size {k}, took {elapsed} seconds')
    results_df = pd.DataFrame(results)
    best = results_df.loc[results_df['rss'].argmin()]
    return dict(best)

In [191]:
features = len(x.columns)
features = min(features, 3) # otherwise we take too long
subset_selection = {i: enumerate_subsets(i, x, y) for i in range(1, features + 1)}
# i'm skipping the plotting too, because this takes forever

calculating for subsets of size 1
done with size 1, took 0.038878440856933594 seconds
calculating for subsets of size 2
done with size 2, took 0.3381383419036865 seconds
calculating for subsets of size 3
done with size 3, took 1.9585189819335938 seconds


In [166]:
def forwards_stepwise(predictors, x, y):
    remaining = [p for p in x.columns if p not in predictors]
    results = []
    for p in remaining:
        results.append(calc_subset(predictors + [p], x, y))
    models = pd.DataFrame(results)
    best_model = models.loc[models['rss'].argmin()]
    return dict(best_model)

def backwards_stepwise(predictors, x, y):
    results = []
    for combo in itertools.combinations(predictors, len(predictors) - 1):
        results.append(calc_subset(combo, x, y))
    models = pd.DataFrame(results)
    best_model = models.loc[models['rss'].argmin()]
    return dict(best_model)

In [192]:
def forward(x, y):
    models_fwd = pd.DataFrame(columns=['rss', 'model'])

    predictors = []

    for i in range(1, len(x.columns) + 1):
        models_fwd.loc[i] = forwards_stepwise(predictors, x, y)
        predictors = models_fwd.loc[i]['model'].model.exog_names.copy()
        predictors.remove('const')
    
    return models_fwd

def backward(x, y):
    models_bwd = pd.DataFrame(columns=['rss', 'model'])

    predictors = x.columns
    while(len(predictors) > 1):  
        models_bwd.loc[len(predictors) - 1] = backwards_stepwise(predictors, x, y)
        predictors = models_bwd.loc[len(predictors) - 1]['model'].model.exog_names.copy()
        predictors.remove('const')
    
    return models_bwd

In [187]:
print(models_fwd.loc[7]['model'].summary())
print('---------------')
print(models_bwd.loc[7]['model'].summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.513
Model:                            OLS   Adj. R-squared:                  0.500
Method:                 Least Squares   F-statistic:                     38.41
Date:                Mon, 25 Jul 2022   Prob (F-statistic):           1.50e-36
Time:                        19:55:35   Log-Likelihood:                -1885.4
No. Observations:                 263   AIC:                             3787.
Df Residuals:                     255   BIC:                             3815.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -17.3351     64.535     -0.269      0.7