In [12]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
import numpy as np 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [14]:
df=pd.read_csv("../data/heart.csv")

In [15]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [16]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [17]:
x=df.drop("target",axis=1)

In [18]:
x


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [19]:
y=df["target"]

In [20]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [21]:
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [22]:
num_column=x.select_dtypes(exclude="O").columns

In [23]:
num_column

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [24]:
cat_column=x.select_dtypes(include="O").columns

In [25]:
cat_column

Index([], dtype='object')

In [26]:
num_pipeline=Pipeline(
  
  steps= [
    ('imputer',SimpleImputer()),
    ('scaler',StandardScaler())
    
    
  ]
    
)

In [27]:
preprocessor = ColumnTransformer(
    [
    ('num_pipeline',num_pipeline,num_column)
    ])
    

In [28]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [29]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
132,42,1,1,120,295,0,1,162,0,0.0,2,0,2
202,58,1,0,150,270,0,0,111,1,0.8,2,0,3
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2
75,55,0,1,135,250,0,0,161,0,1.4,1,0,2
176,60,1,0,117,230,1,1,160,1,1.4,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,50,1,2,140,233,0,1,163,0,0.6,1,1,3
71,51,1,2,94,227,0,1,154,1,0.0,2,1,3
106,69,1,3,160,234,1,0,131,0,0.1,1,1,2
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3


In [30]:
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,69,1,2,140,254,0,0,146,0,2.0,1,3,3
104,50,1,2,129,196,0,1,163,0,0.0,2,0,2
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
193,60,1,0,145,282,0,0,142,1,2.8,1,2,3


In [31]:

preprocessor.fit_transform(x_train)


array([[-1.35679832,  0.72250438,  0.00809909, ...,  0.95390513,
        -0.68970073, -0.50904773],
       [ 0.38508599,  0.72250438, -0.97189094, ...,  0.95390513,
        -0.68970073,  1.17848036],
       [-0.92132724,  0.72250438,  0.98808912, ..., -0.69498803,
        -0.68970073, -0.50904773],
       ...,
       [ 1.58263146,  0.72250438,  1.96807914, ..., -0.69498803,
         0.32186034, -0.50904773],
       [-0.92132724,  0.72250438, -0.97189094, ...,  0.95390513,
        -0.68970073,  1.17848036],
       [ 0.92942484, -1.38407465,  0.00809909, ...,  0.95390513,
         1.33342142, -0.50904773]])

In [32]:

preprocessor.transform(x_test)


array([[ 2.76218225e-01,  7.22504380e-01, -9.71890936e-01,
         1.16949120e+00,  5.53408401e-01, -3.83300706e-01,
        -1.04610909e+00, -1.70875171e+00,  1.47790748e+00,
        -3.75556294e-01, -6.94988026e-01,  3.21860343e-01,
        -2.19657581e+00],
       [ 4.93953764e-01,  7.22504380e-01,  1.96807914e+00,
         2.36038903e+00,  7.81171723e-01, -3.83300706e-01,
        -1.04610909e+00,  3.98288831e-01, -6.76632341e-01,
        -7.39094787e-01, -6.94988026e-01, -6.89700735e-01,
         1.17848036e+00],
       [ 2.76218225e-01,  7.22504380e-01,  9.88089118e-01,
         1.16949120e+00, -2.29363312e+00,  2.60891771e+00,
         8.43132697e-01,  1.02591793e+00, -6.76632341e-01,
        -7.39094787e-01,  9.53905134e-01,  3.21860343e-01,
         1.17848036e+00],
       [ 1.67350456e-01, -1.38407465e+00, -9.71890936e-01,
         2.16772932e-01,  3.07778522e+00, -3.83300706e-01,
        -1.04610909e+00, -5.18701733e-03,  1.47790748e+00,
         8.05943807e-01, -6.94988026e

In [33]:

preprocessor.get_feature_names_out()

array(['num_pipeline__age', 'num_pipeline__sex', 'num_pipeline__cp',
       'num_pipeline__trestbps', 'num_pipeline__chol',
       'num_pipeline__fbs', 'num_pipeline__restecg',
       'num_pipeline__thalach', 'num_pipeline__exang',
       'num_pipeline__oldpeak', 'num_pipeline__slope', 'num_pipeline__ca',
       'num_pipeline__thal'], dtype=object)

In [34]:
x_train=pd.DataFrame(preprocessor.fit_transform(x_train),columns=preprocessor.get_feature_names_out())
x_test=pd.DataFrame(preprocessor.fit_transform(x_test),columns=preprocessor.get_feature_names_out())

In [35]:
x_train

Unnamed: 0,num_pipeline__age,num_pipeline__sex,num_pipeline__cp,num_pipeline__trestbps,num_pipeline__chol,num_pipeline__fbs,num_pipeline__restecg,num_pipeline__thalach,num_pipeline__exang,num_pipeline__oldpeak,num_pipeline__slope,num_pipeline__ca,num_pipeline__thal
0,-1.356798,0.722504,0.008099,-0.616856,0.914034,-0.383301,0.843133,0.532781,-0.676632,-0.920864,0.953905,-0.689701,-0.509048
1,0.385086,0.722504,-0.971891,1.169491,0.439527,-0.383301,-1.046109,-1.753582,1.477907,-0.193787,0.953905,-0.689701,1.178480
2,-0.921327,0.722504,0.988089,1.169491,-0.300704,-0.383301,0.843133,-0.139679,-0.676632,2.350982,-0.694988,-0.689701,-0.509048
3,0.058483,-1.384075,0.008099,0.276318,0.059921,-0.383301,-1.046109,0.487950,-0.676632,0.351521,-0.694988,-0.689701,-0.509048
4,0.602822,0.722504,-0.971891,-0.795490,-0.319684,2.608918,0.843133,0.443119,1.477907,0.351521,0.953905,1.333421,1.178480
...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,-0.485856,0.722504,0.988089,0.574042,-0.262744,-0.383301,0.843133,0.577611,-0.676632,-0.375556,-0.694988,0.321860,1.178480
238,-0.376988,0.722504,0.988089,-2.165023,-0.376625,-0.383301,0.843133,0.174136,1.477907,-0.920864,0.953905,0.321860,1.178480
239,1.582631,0.722504,1.968079,1.764940,-0.243763,2.608918,-1.046109,-0.856969,-0.676632,-0.829979,-0.694988,0.321860,-0.509048
240,-0.921327,0.722504,-0.971891,-0.616856,0.040941,-0.383301,-1.046109,-0.274171,-0.676632,-0.193787,0.953905,-0.689701,1.178480


In [36]:
x_test

Unnamed: 0,num_pipeline__age,num_pipeline__sex,num_pipeline__cp,num_pipeline__trestbps,num_pipeline__chol,num_pipeline__fbs,num_pipeline__restecg,num_pipeline__thalach,num_pipeline__exang,num_pipeline__oldpeak,num_pipeline__slope,num_pipeline__ca,num_pipeline__thal
0,0.351938,0.520416,-0.817224,0.692351,0.670699,-0.545777,-0.861892,-1.438221,1.285369,-0.399336,-0.485744,0.073151,-2.005978
1,0.585288,0.520416,2.004512,1.728753,0.921953,-0.545777,-0.861892,0.450638,-0.777987,-0.692823,-0.485744,-0.819288,0.942568
2,0.351938,0.520416,1.063933,0.692351,-2.469984,1.832251,1.160239,1.013277,-0.777987,-0.692823,1.073751,0.073151,0.942568
3,0.235263,-1.921538,-0.817224,-0.136771,3.455437,-0.545777,-0.861892,0.088942,1.285369,0.554499,-0.485744,0.965589,0.942568
4,1.985390,-1.921538,1.063933,-1.380454,0.440382,1.832251,-0.861892,-0.714828,-0.777987,-0.839567,1.073751,0.073151,-0.531705
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,1.752039,0.520416,1.063933,0.174150,0.210065,-0.545777,-0.861892,-0.071812,-0.777987,0.627871,-0.485744,1.858028,0.942568
57,-0.464788,0.520416,1.063933,-0.395872,-1.004332,-0.545777,1.160239,0.611392,-0.777987,-0.839567,1.073751,-0.819288,-0.531705
58,1.635364,0.520416,-0.817224,0.381430,-1.067146,1.832251,1.160239,-0.272755,-0.777987,1.655078,-0.485744,0.965589,0.942568
59,0.701963,0.520416,-0.817224,0.433250,0.796326,-0.545777,-0.861892,-0.232566,1.285369,1.214846,-0.485744,0.965589,0.942568


In [37]:
models = {
    'LinearRegression': {'model': LinearRegression(), 'metric': 'r2'},
    'Lasso': {'model': Lasso(), 'metric': 'r2'},
    'Ridge': {'model': Ridge(), 'metric': 'r2'},
    'Elasticnet': {'model': ElasticNet(), 'metric': 'r2'},
    'RandomForestRegressor': {'model': RandomForestRegressor(), 'metric': 'r2'},
    'XGBRegressor': {'model': XGBRegressor(), 'metric': 'r2'},
    'LogisticRegression': {'model': LogisticRegression(), 'metric': 'accuracy'},
    'DecisionTree': {'model': DecisionTreeClassifier(), 'metric': 'accuracy'},
    'KNN': {'model': KNeighborsClassifier(), 'metric': 'accuracy'},
    'NaiveBayes': {'model': GaussianNB(), 'metric': 'accuracy'}
}


In [38]:
performance_scores = []

In [39]:
def evaluate_model(true, pred, metric):
    if metric == 'r2':
        score = r2_score(true, pred)
        mse = mean_squared_error(true, pred)
        mae = mean_absolute_error(true, pred)
        precision = None
        recall = None
        f1 = None
    else:  # classification metrics
        score = accuracy_score(true, pred)
        mse = None
        mae = None
        precision = precision_score(true, pred)
        recall = recall_score(true, pred)
        f1 = f1_score(true, pred)
    return score, mse, mae, precision, recall, f1

In [40]:
# Iterate over each model
for model_name, model_info in models.items():
    model = model_info['model']
    metric = model_info['metric']
    
    # Fit the model on the training data
    model.fit(x_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(x_test)
    
    # Evaluate the model
    score, mse, mae, precision, recall, f1 = evaluate_model(y_test, y_pred, metric)
    
    # Print model performance
    print("Model:", model_name)
    print(f"{metric.upper()} Score:", score)
    if metric == 'r2':
        print("Mean Squared Error:", mse)
        print("Mean Absolute Error:", mae)
    else:
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 Score:", f1)
    print("=" * 40)
    print("\n")
    
    # Append performance score to the list
    performance_scores.append((model_name, score))

Model: LinearRegression
R2 Score: 0.5744031813619461
Mean Squared Error: 0.10614185640852299
Mean Absolute Error: 0.24503753420968327


Model: Lasso
R2 Score: -0.002505382410990631
Mean Squared Error: 0.25002015449540427
Mean Absolute Error: 0.4975613060560899


Model: Ridge
R2 Score: 0.5743204689437862
Mean Squared Error: 0.10616248449883538
Mean Absolute Error: 0.2451949273877132


Model: Elasticnet
R2 Score: -0.002505382410990631
Mean Squared Error: 0.25002015449540427
Mean Absolute Error: 0.4975613060560899


Model: RandomForestRegressor
R2 Score: 0.5177318965517241
Mean Squared Error: 0.12027540983606556
Mean Absolute Error: 0.24262295081967214


Model: XGBRegressor
R2 Score: -0.04541053203154011
Mean Squared Error: 0.2607204981793253
Mean Absolute Error: 0.34561597568089847


Model: LogisticRegression
ACCURACY Score: 0.8852459016393442
Precision: 0.8787878787878788
Recall: 0.90625
F1 Score: 0.8923076923076922


Model: DecisionTree
ACCURACY Score: 0.8032786885245902
Precision: 0.9

In [41]:
if all(metric == 'r2' for _, metric in models.values()):
    max_score_model = max(performance_scores, key=lambda x: x[1])
else:
    max_score_model = max(performance_scores, key=lambda x: x[1])

In [42]:
print("Model with Highest Score:", max_score_model[0])
print("Maximum Score:", max_score_model[1])

Model with Highest Score: KNN
Maximum Score: 0.9016393442622951
