In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Loading the CSV dataset

In [2]:
df= pd.read_csv('framingham.csv')
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
df=df.dropna()

# Implementation of stepwise regression

In [5]:
# Perform forward selection
def forward_selection(X, y, threshold_in=0.05):
    initial_features = X.columns.tolist()
    best_features = []
    
    while len(initial_features) > 0:
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features)
        
        for feature in remaining_features:
            model = sm.OLS(y, sm.add_constant(X[best_features + [feature]])).fit()
            new_pval[feature] = model.pvalues[feature]
        
        min_p_value = new_pval.min()
        
        if min_p_value < threshold_in:
            best_features.append(new_pval.idxmin())
        else:
            break
    
    return best_features

In [6]:
selected_features = forward_selection(X, y)
print("Selected features using forward selection: ", selected_features)

  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)


Selected features using forward selection:  ['const', 'age', 'sysBP', 'male', 'glucose', 'cigsPerDay', 'prevalentStroke']


# Build the final model with the selected features


In [7]:
final_model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()
print(final_model.summary())

                            OLS Regression Results                            
Dep. Variable:             TenYearCHD   R-squared:                       0.099
Model:                            OLS   Adj. R-squared:                  0.097
Method:                 Least Squares   F-statistic:                     66.76
Date:                Wed, 31 May 2023   Prob (F-statistic):           4.79e-79
Time:                        09:07:21   Log-Likelihood:                -1255.6
No. Observations:                3658   AIC:                             2525.
Df Residuals:                    3651   BIC:                             2569.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -0.6990      0.044    -

# Insights from stepwise regression

* R-squared (0.099) and Adjusted R-squared (0.097): Around 10% of the variability in TenYearCHD can be explained by the model. It suggests other significant factors may not be included in the model.

* F-statistic (66.76) and Prob (F-statistic) (4.79e-79): The model is statistically significant with the F-statistic much larger than 1 and the p-value less than 0.05. It suggests at least one predictor variable impacts TenYearCHD.

* Regression Coefficients: Each coefficient represents the change in TenYearCHD for one unit increase in that variable, holding other variables constant. All predictors are statistically significant (p<0.05).

* Omnibus and Jarque-Bera Tests: These tests indicate that the residuals may not be normally distributed. This could impact the reliability of the regression assumptions.

* Condition Number (2.2e+03): A large condition number indicates potential issues with multicollinearity, suggesting correlations among predictor variables.