In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv('heart.csv')

In [2]:
def remove_outliers_zscore(df, threshold=3):
    # Identify numeric columns
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    
    # Calculate Z scores only for numeric columns
    z_scores_numeric = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    
    # Combine Z scores with non-numeric columns
    df_with_zscores = pd.concat([z_scores_numeric, df[df.columns.difference(numeric_columns)]], axis=1)
    
    # Remove outliers based on Z scores
    is_outlier = (np.abs(z_scores_numeric) < threshold).all(axis=1)
    filtered_df = df[is_outlier]
    
    return filtered_df

In [3]:
new_data = remove_outliers_zscore(df,threshold=3)

In [4]:
new_num_data=pd.get_dummies(new_data,drop_first=True)

In [5]:
new_num_data['ChestPainType_NAP'] = new_num_data['ChestPainType_NAP'].astype(int)
new_num_data['ChestPainType_TA'] = new_num_data['ChestPainType_TA'].astype(int)
new_num_data['RestingECG_Normal'] = new_num_data['RestingECG_Normal'].astype(int)
new_num_data['RestingECG_ST'] = new_num_data['RestingECG_ST'].astype(int)
new_num_data['ExerciseAngina_Y'] = new_num_data['ExerciseAngina_Y'].astype(int)
new_num_data['ST_Slope_Flat'] = new_num_data['ST_Slope_Flat'].astype(int)
new_num_data['ST_Slope_Up'] = new_num_data['ST_Slope_Up'].astype(int)
new_num_data['Sex_M'] = new_num_data['Sex_M'].astype(int)
new_num_data['ChestPainType_ATA'] = new_num_data['ChestPainType_ATA'].astype(int)

In [6]:
X=new_num_data.drop('HeartDisease',axis='columns')
y=new_num_data['HeartDisease']

In [7]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
X_scaled

array([[-1.42815446,  0.46590022,  0.84963584, ..., -0.8229452 ,
        -0.99888827,  1.13469459],
       [-0.47585532,  1.63471366, -0.16812204, ..., -0.8229452 ,
         1.00111297, -0.88129441],
       [-1.7455875 , -0.1185065 ,  0.79361247, ..., -0.8229452 ,
        -0.99888827,  1.13469459],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ...,  1.21514774,
         1.00111297, -0.88129441],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ..., -0.8229452 ,
         1.00111297, -0.88129441],
       [-1.63977649,  0.34901888, -0.21480818, ..., -0.8229452 ,
        -0.99888827,  1.13469459]])

In [8]:
from sklearn.svm import SVC

In [9]:
from sklearn.model_selection import cross_val_score

In [10]:
score=cross_val_score(SVC(),X_scaled,y,cv=5)

In [11]:
score

array([0.86666667, 0.83888889, 0.83333333, 0.85555556, 0.76536313])

In [12]:
from sklearn.ensemble import BaggingClassifier

In [13]:
bag=BaggingClassifier(
    n_estimators=100,
    base_estimator=SVC(),
    max_samples=0.8,
    oob_score=True,
    random_state=0
)

TypeError: BaggingClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [None]:
bag.fit(X_scaled,y)
bag.oob_score_



0.8776418242491657

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2,random_state=10)

In [None]:
bag.fit(X_train,y_train)
bag.oob_score_



0.874826147426982

In [None]:
bag.score(X_test,y_test)

0.85

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
score=cross_val_score(DecisionTreeClassifier(),X_scaled,y,cv=5)

In [None]:
score

array([0.78888889, 0.76666667, 0.76111111, 0.67777778, 0.63128492])

In [None]:
bag_tree=BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=100,
    oob_score=True,
    random_state=0,
    max_samples=0.8
)

In [None]:
bag_tree.fit(X_train,y_train)



In [None]:
bag_tree.oob_score_

0.8525730180806675

In [None]:
bag_tree.score(X_test,y_test)

0.8388888888888889

In [None]:
bag_tree.score(X_train,y_train)

0.9972183588317107

In [None]:
bag_tree.score(X_scaled,y)

0.9655172413793104

In [None]:
bag.score(X_scaled,y)

0.899888765294772