In [9]:
import statsmodels.api as sm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
import sklearn
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [11]:
x = df.drop(['target'], axis = 1)
y = df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=11)

In [14]:
class ModelWrapper:
    def __init__(self, model):
        self.model = model

    def fit(self, x_train, y_train):
        self.model.fit(x_train, y_train)

    def predict(self, x_test):
        return self.model.predict(x_test)

    def evaluate(self, x_test, y_test):
        y_pred = self.predict(x_test)
        return accuracy_score(y_test, y_pred)

# SVM
svm_model = ModelWrapper(svm.SVC())
svm_model.fit(x_train, y_train)
print("SVM Accuracy:", svm_model.evaluate(x_test, y_test))

# 决策树
dt_model = ModelWrapper(tree.DecisionTreeClassifier())
dt_model.fit(x_train, y_train)
print("Decision Tree Accuracy:", dt_model.evaluate(x_test, y_test))

# K-最近邻
kn_model = ModelWrapper(KNeighborsClassifier(n_neighbors=3))
kn_model.fit(x_train, y_train)
print("K-Nearest Neighbors Accuracy:", kn_model.evaluate(x_test, y_test))

# 随机森林
rf_model = ModelWrapper(RandomForestClassifier(max_depth=3, random_state=1))
rf_model.fit(x_train, y_train)
print("Random Forest Accuracy:", rf_model.evaluate(x_test, y_test))


SVM Accuracy: 0.5483870967741935
Decision Tree Accuracy: 0.8387096774193549
K-Nearest Neighbors Accuracy: 0.6129032258064516
Random Forest Accuracy: 0.8064516129032258


In [15]:
x_train = sm.add_constant(x_train)

# 构建并拟合Logistic回归模型
model = sm.Logit(y_train, x_train)
result = model.fit()

# 输出结果摘要，包含Wald检验和似然比检验等信息
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.360244
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  272
Model:                          Logit   Df Residuals:                      258
Method:                           MLE   Df Model:                           13
Date:                Sun, 07 Jan 2024   Pseudo R-squ.:                  0.4773
Time:                        17:15:45   Log-Likelihood:                -97.986
converged:                       True   LL-Null:                       -187.48
Covariance Type:            nonrobust   LLR p-value:                 2.741e-31
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.6980      2.670      1.385      0.166      -1.535       8.931
age           -0.0080      0.

In [16]:
# 计算并打印每个变量的VIF
x = sm.add_constant(x)
vif = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
print(vif)

[207.2566461788931, 1.4434740782901314, 1.161866466826186, 1.2844555271417006, 1.1705913302539475, 1.1501744194351387, 1.0873790055528954, 1.06099762942633, 1.6137263028380084, 1.402000866050906, 1.7058565399785268, 1.6425945624883387, 1.2025702657052253, 1.1472786685888456]


In [17]:
class StepwiseRegression:
    def __init__(self, threshold_in=0.01, threshold_out=0.05, verbose=True):
        self.threshold_in = threshold_in
        self.threshold_out = threshold_out
        self.verbose = verbose
        self.included_features = []

    def fit(self, X, y, initial_list=[]):
        self.included_features = list(initial_list)
        while True:
            changed = False
            excluded = list(set(X.columns) - set(self.included_features))
            new_pval = pd.Series(index=excluded, dtype='float64')
            for new_column in excluded:
                try:
                    model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[self.included_features + [new_column]]))).fit()
                    new_pval[new_column] = model.pvalues[new_column]
                except Exception as e:
                    if self.verbose:
                        print(f'Error fitting model with {new_column}: {e}')
                    continue

            best_pval = new_pval.min()
            if best_pval < self.threshold_in:
                best_feature = new_pval.idxmin()
                self.included_features.append(best_feature)
                changed = True
                if self.verbose:
                    print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

            try:
                model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[self.included_features]))).fit()
                pvalues = model.pvalues.iloc[1:]
                worst_pval = pvalues.max()
                if worst_pval > self.threshold_out:
                    changed = True
                    worst_feature = pvalues.idxmax()
                    self.included_features.remove(worst_feature)
                    if self.verbose:
                        print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
            except Exception as e:
                if self.verbose:
                    print(f'Error fitting model for backward elimination: {e}')
                break

            if not changed:
                break

    def get_features(self):
        return self.included_features

# 使用示例
model = StepwiseRegression()
model.fit(x_train, y_train)
print('resulting features:')
print(model.get_features())


Optimization terminated successfully.
         Current function value: 0.636938
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.660511
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.582711
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.579007
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614815
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.592909
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.622275
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.689249
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.679557
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.689239
  

In [18]:
columns_to_remove = ['fbs','age','chol','restecg','trestbps','slope','exang']
df_reduced1 = df.drop(columns=columns_to_remove)
df_reduced1.head()

Unnamed: 0,sex,cp,thalach,oldpeak,ca,thal,target
0,1,3,150,2.3,0,1,1
1,1,2,187,3.5,0,2,1
2,0,1,172,1.4,0,2,1
3,1,1,178,0.8,0,2,1
4,0,0,163,0.6,0,2,1


In [14]:
x = df_reduced1.drop(['target'], axis = 1)
y = df_reduced1['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=11)

In [15]:
# 构建并拟合Logistic回归模型
model = sm.Logit(y_train, x_train)
result = model.fit()

# 输出结果摘要，包含Wald检验和似然比检验等信息
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.387010
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  272
Model:                          Logit   Df Residuals:                      266
Method:                           MLE   Df Model:                            5
Date:                Sun, 07 Jan 2024   Pseudo R-squ.:                  0.4385
Time:                        16:37:55   Log-Likelihood:                -105.27
converged:                       True   LL-Null:                       -187.48
Covariance Type:            nonrobust   LLR p-value:                 1.132e-33
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
sex           -1.2485      0.418     -2.987      0.003      -2.068      -0.429
cp             0.9379      0.

In [16]:
y_pred_prob = result.predict(x_test)

y_pred = [1 if x > 0.5 else 0 for x in y_pred_prob]

# 评估模型性能
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[11  3]
 [ 2 15]]
              precision    recall  f1-score   support

           0       0.85      0.79      0.81        14
           1       0.83      0.88      0.86        17

    accuracy                           0.84        31
   macro avg       0.84      0.83      0.84        31
weighted avg       0.84      0.84      0.84        31

