# 先建立逐步迴歸向前與向後的函數 

In [13]:
import pandas as pd
import statsmodels.api as sm


def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    """Select features by forward stepwise OLS regression.

    Starts with no features. In every round, one OLS model (with
    ``sm.add_constant`` applied to the design matrix) is fitted per remaining
    candidate column, using the current selection plus that candidate. The
    candidate with the smallest p-value is added when that p-value is below
    ``threshold_in``; otherwise the search stops.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        threshold_in - include a feature if its p-value < threshold_in
        verbose - whether to print each inclusion
    Returns: list of selected features, in the order they were added
    """
    included = []
    while True:
        # Walk the candidates in X's own column order rather than through a
        # set difference: set iteration order is arbitrary, which made the
        # scan order (and tie-breaking in idxmin) vary between runs.
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            break  # every column has already been selected
        new_pval = pd.Series(index=excluded, dtype='float64')
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            # p-value of the candidate's coefficient in the enlarged model
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # A NaN minimum (no defined p-value) compares False and ends the search.
        if not best_pval < threshold_in:
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

    return included

def backward_regression(X, y,
                           threshold_out,
                           verbose=False):
    """Select features by backward elimination on OLS p-values.

    Starts with every column of ``X``. In every round an OLS model (with
    ``sm.add_constant`` applied to the design matrix) is fitted on the
    current selection, and the feature with the largest p-value is dropped
    when that p-value exceeds ``threshold_out``; otherwise the search stops.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print each exclusion
    Returns: list of the features that were kept
    """
    included = list(X.columns)
    # Stop as soon as nothing is left: with no features there is no p-value
    # to test and no reason to fit another model.
    while included:
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # Pick the feature p-values by label instead of slicing off position 0,
        # so a feature is never mistaken for the intercept when add_constant
        # does not prepend a new column.
        pvalues = model.pvalues.loc[included]
        worst_pval = pvalues.max()
        # A NaN maximum (no defined p-value) compares False and ends the search.
        if not worst_pval > threshold_out:
            break
        worst_feature = pvalues.idxmax()
        included.remove(worst_feature)
        if verbose:
            print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
    return included

In [2]:
# Read the dependent variable from ret.csv and preview the first rows.
y = pd.read_csv("ret.csv")
y.head()

Unnamed: 0,ret
0,0.095215
1,0.045228
2,0.073642
3,0.0203
4,0.031626


In [3]:
# Read the candidate regressors from data.csv and preview the first rows.
X = pd.read_csv("data.csv")
X.head()

Unnamed: 0,beta_hml,beta_mk,beta_mom,beta_smb,firmcode,industry,logv,sigma,skewness,region
0,0.002111,0.877661,0.087371,-0.005359,600000,17,10.125396,0.416688,-0.106485,17
1,-0.011914,0.607542,0.105205,0.010021,600001,9,9.987034,0.536066,0.992389,30
2,0.004652,1.155926,0.025733,-0.0091,600002,14,8.62141,0.459964,0.492296,6
3,-0.003235,0.742394,0.106752,0.016514,600003,14,8.469752,0.521236,1.552359,26
4,0.000927,0.550775,0.042473,0.002473,600004,9,9.037125,0.458615,-1.585671,4


# 利用向前法選出變數

In [4]:
# Forward selection: a feature enters only when its p-value is below 0.01.
result = forward_regression(X, y, threshold_in=0.01)
print('resulting features:', result, sep='\n')

resulting features:
['sigma', 'skewness', 'logv']


# 利用選出的變數跑迴歸

In [5]:
import statsmodels.api as sm
# Dependent variable: ret; regressors: sigma, skewness, logv.
pairf = X[['sigma', 'skewness', 'logv']]
design = sm.add_constant(pairf)
model = sm.OLS(y, design).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    ret   R-squared:                       0.960
Model:                            OLS   Adj. R-squared:                  0.960
Method:                 Least Squares   F-statistic:                     6804.
Date:                Mon, 03 May 2021   Prob (F-statistic):               0.00
Time:                        01:02:17   Log-Likelihood:                 1227.4
No. Observations:                 850   AIC:                            -2447.
Df Residuals:                     846   BIC:                            -2428.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1460      0.020     -7.356      0.0

# 利用向後法選出變數

In [6]:
# Backward elimination: a feature is dropped while its p-value exceeds 0.01.
result = backward_regression(X, y, threshold_out=0.01)
print('resulting features:', result, sep='\n')

resulting features:
['beta_mk', 'beta_mom', 'logv', 'sigma', 'skewness']


# 利用選出的變數跑迴歸

In [7]:
import statsmodels.api as sm
# Dependent variable: ret; regressors: sigma, skewness, logv, beta_mk, beta_mom.
pairf = X[['sigma', 'skewness', 'logv', 'beta_mk', 'beta_mom']]
design = sm.add_constant(pairf)
model = sm.OLS(y, design).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    ret   R-squared:                       0.961
Model:                            OLS   Adj. R-squared:                  0.961
Method:                 Least Squares   F-statistic:                     4211.
Date:                Mon, 03 May 2021   Prob (F-statistic):               0.00
Time:                        01:02:21   Log-Likelihood:                 1241.1
No. Observations:                 850   AIC:                            -2470.
Df Residuals:                     844   BIC:                            -2442.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1229      0.020     -6.130      0.0

# 逐步挑選(Stepwise)-向前與向後混和

In [8]:
import pandas as pd
import statsmodels.api as sm


def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.05,
                       threshold_out=0.05,
                       verbose=True):
    """ Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X);
            None (the default) starts from an empty selection
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features
    Raises: KeyError if initial_list names a column that is not in X
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # Honour the caller's starting set (it used to be overwritten with []),
    # copying it so the caller's list is never mutated.
    included = [] if initial_list is None else list(initial_list)
    unknown = [feature for feature in included if feature not in X.columns]
    if unknown:
        raise KeyError('initial_list contains columns not in X: {}'.format(unknown))
    while True:
        changed = False
        # forward step
        # Walk the candidates in X's own column order rather than through a
        # set difference, so the scan order and tie-breaking are reproducible.
        excluded = [column for column in X.columns if column not in included]
        if excluded:
            new_pval = pd.Series(index=excluded, dtype='float64')
            for new_column in excluded:
                model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
                new_pval[new_column] = model.pvalues[new_column]
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                if verbose:
                    print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        # Skipped while the selection is empty: there is no feature to drop.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # Pick the feature p-values by label instead of slicing off
            # position 0, so a feature is never mistaken for the intercept.
            pvalues = model.pvalues.loc[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [9]:
# Read the dependent variable from ret.csv again and preview the first rows.
y = pd.read_csv("ret.csv")
y.head()

Unnamed: 0,ret
0,0.095215
1,0.045228
2,0.073642
3,0.0203
4,0.031626


In [10]:
# Read the candidate regressors from data.csv again and preview the first rows.
X = pd.read_csv("data.csv")
X.head()

Unnamed: 0,beta_hml,beta_mk,beta_mom,beta_smb,firmcode,industry,logv,sigma,skewness,region
0,0.002111,0.877661,0.087371,-0.005359,600000,17,10.125396,0.416688,-0.106485,17
1,-0.011914,0.607542,0.105205,0.010021,600001,9,9.987034,0.536066,0.992389,30
2,0.004652,1.155926,0.025733,-0.0091,600002,14,8.62141,0.459964,0.492296,6
3,-0.003235,0.742394,0.106752,0.016514,600003,14,8.469752,0.521236,1.552359,26
4,0.000927,0.550775,0.042473,0.002473,600004,9,9.037125,0.458615,-1.585671,4


In [11]:
# Combined forward/backward selection with the function's default thresholds.
result = stepwise_selection(X, y)
print('resulting features:', result, sep='\n')

Add  sigma                          with p-value 0.0
Add  skewness                       with p-value 4.45653e-20
Add  logv                           with p-value 1.68853e-13
Add  beta_mk                        with p-value 0.0211539
Add  beta_mom                       with p-value 3.0479e-06
Add  industry                       with p-value 0.0356136
Add  beta_smb                       with p-value 0.0363007
Add  beta_hml                       with p-value 0.0127963
resulting features:
['sigma', 'skewness', 'logv', 'beta_mk', 'beta_mom', 'industry', 'beta_smb', 'beta_hml']


# 利用選出的變數跑迴歸

In [12]:
import statsmodels.api as sm
# Dependent variable: ret; regressors: sigma, skewness, logv, beta_mk,
# beta_mom, industry, beta_smb, beta_hml.
pairf = X[['sigma', 'skewness', 'logv', 'beta_mk', 'beta_mom',
           'industry', 'beta_smb', 'beta_hml']]
design = sm.add_constant(pairf)
model = sm.OLS(y, design).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    ret   R-squared:                       0.962
Model:                            OLS   Adj. R-squared:                  0.962
Method:                 Least Squares   F-statistic:                     2672.
Date:                Mon, 03 May 2021   Prob (F-statistic):               0.00
Time:                        01:02:28   Log-Likelihood:                 1248.6
No. Observations:                 850   AIC:                            -2479.
Df Residuals:                     841   BIC:                            -2437.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0910      0.022     -4.176      0.0