In [34]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook

from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

In [51]:
# Read the tab-separated prostate table; its first column becomes the row index.
data = pd.read_csv('../data/prostate.data', sep='\t', index_col=0)

# Rows flagged 'T' in the `train` column form the training set and rows flagged
# 'F' form the test set. The flag column itself is dropped from both frames.
train_data = data.loc[data['train'] == 'T'].drop(columns=['train'])
test_data = data.loc[data['train'] == 'F'].drop(columns=['train'])

In [53]:
# Dimensions of the full table: train and test rows together, with the
# `train` flag column still included.
data.shape

(97, 10)

In [10]:
# Split each frame into a predictor matrix and a response vector.
# The response is taken to be the last column and every other column is a
# predictor, so the split adapts to the width of the frame. The previous
# hard-coded positions (0:56 and 57) skipped column 56 and are out of range
# for the 10-column table loaded by this notebook.
# NOTE(review): assumes the response really is the last column once `train`
# has been dropped -- confirm against the data file.
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

## Subset selection (backward-stepwise)

In [11]:


# Backward-stepwise selection: start from the full OLS fit and, at every step,
# drop the predictor whose t-statistic is closest to zero. Each row of
# `removed_feature_r2` describes one fit:
#   column 0 - design-matrix column dropped just before this fit
#              (NaN for the initial full model; column j is X_train[:, j - 1])
#   column 1 - adjusted R^2 on the training set
#   column 2 - mean squared error on the test set
n_features = X_train.shape[1] + 1  # predictors plus the intercept column

removed_feature_r2 = np.zeros((n_features, 3))

# Not used in this cell; retained in case other cells reference the name.
y_train_ss = y_train

# add_constant prepends the intercept, so it sits in column 0.
# has_constant='add' forces the column to be added even if a predictor happens
# to be constant, which keeps the train and test matrices aligned.
X_train_ols_ss = sm.add_constant(X_train, has_constant='add')
X_test_ols_ss = sm.add_constant(X_test, has_constant='add')

# Original design-matrix index of every column still in the model.
remaining_columns = np.arange(n_features)

feature_to_remove = np.nan  # np.NaN alias no longer exists in NumPy 2.0
drop_position = None        # position of that column in the current matrices

for ind in range(n_features):

    if drop_position is not None:
        X_train_ols_ss = np.delete(X_train_ols_ss, drop_position, axis=1)
        X_test_ols_ss = np.delete(X_test_ols_ss, drop_position, axis=1)
        remaining_columns = np.delete(remaining_columns, drop_position)

    model = sm.OLS(y_train, X_train_ols_ss)
    results = model.fit()
    y_pred = results.predict(X_test_ols_ss)

    removed_feature_r2[ind, 0] = feature_to_remove
    removed_feature_r2[ind, 1] = results.rsquared_adj
    removed_feature_r2[ind, 2] = mean_squared_error(y_test, y_pred)

    # Choose the next column to drop: the predictor with the smallest |t|.
    # A plain argmin over the signed t-values would pick the most negative
    # (possibly highly significant) coefficient instead. The intercept in
    # column 0 is never a candidate: without it statsmodels reports the
    # uncentered R^2, which is not comparable across the fits.
    predictor_abs_t = np.abs(np.asarray(results.tvalues)[1:])
    if predictor_abs_t.size:
        drop_position = 1 + int(np.argmin(predictor_abs_t))
        feature_to_remove = remaining_columns[drop_position]


In [12]:
# Adjusted R^2 of every fit stored in `removed_feature_r2` (column 1).
# Row i is the model obtained after i columns have been removed.
fig, ax = plt.subplots()
ax.plot(removed_feature_r2[:, 1], 'ro')
ax.set_xlabel('Number of columns removed')
ax.set_ylabel('Adjusted $R^2$')
ax.set_title('Backward-stepwise selection');  # trailing ';' hides the repr

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fc78998ff28>]

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2  # unused below; kept in case other cells rely on the name
from sklearn.feature_selection import f_regression

# Univariate filter: for k = 1..p keep the k predictors with the highest
# F-score against y_train, fit OLS with an intercept on them, and store
# (k, adjusted R^2) in row k - 1 of `removed_feature_r2`.
n_features = X_train.shape[1] + 1

removed_feature_r2 = np.zeros((n_features - 1, 2))

for nf in range(1, n_features):

    # The response is fitted by OLS, so the regression score f_regression is
    # used; chi2 is a classification score and rejects negative feature values.
    # Selection runs on the raw predictors so the intercept is never a
    # candidate for removal; it is prepended again after the selection.
    X_new = SelectKBest(f_regression, k=nf).fit_transform(X_train, y_train)
    X_train_ols_ss = sm.add_constant(X_new, has_constant='add')

    model = sm.OLS(y_train, X_train_ols_ss)
    results = model.fit()

    # Column 0 records k. It used to receive `feature_to_remove`, a value left
    # over from the backward-stepwise cell that has no meaning in this loop.
    removed_feature_r2[nf - 1, 0] = nf
    removed_feature_r2[nf - 1, 1] = results.rsquared_adj

In [14]:
# Adjusted R^2 of every fit stored in `removed_feature_r2` (column 1).
# Row i holds the SelectKBest fit with k = i + 1, so the x-axis starts at 1.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(removed_feature_r2) + 1), removed_feature_r2[:, 1], 'ro')
ax.set_xlabel('Number of columns kept by SelectKBest (k)')
ax.set_ylabel('Adjusted $R^2$')
ax.set_title('Univariate feature selection');  # trailing ';' hides the repr

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fc789acdc50>]

In [15]:
# Text summary of `results`, i.e. whichever OLS fit was assigned last.
# NOTE(review): the saved output below reports 3065 observations, whereas the
# table loaded at the top of the notebook has 97 rows -- the output looks
# stale; restart the kernel and re-run all cells to confirm.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.724
Model:                            OLS   Adj. R-squared:                  0.718
Method:                 Least Squares   F-statistic:                     140.7
Date:                Fri, 22 Mar 2019   Prob (F-statistic):               0.00
Time:                        22:36:25   Log-Likelihood:                -964.28
No. Observations:                3065   AIC:                             2041.
Df Residuals:                    3009   BIC:                             2378.
Df Model:                          56                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.0143      0.019     -0.733      0.4