In [1]:
from catboost import CatBoostClassifier
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


In [2]:
#data path 
file_path = 'D:\Python\CatBoost\heart_disease_uci.csv'

#read data with pandas from excel file
data = pd.read_csv(file_path)

#show first 5 lignes 
data.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [3]:
print(data.isnull().sum())

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [4]:
#remove none necessiry columns
data.drop(columns = ['dataset'], inplace= True)
data.drop(columns = ['id'], inplace= True)

In [5]:
#grant data to variable
features = data.drop('num', axis=1)

#target variable
target = data['num']

In [6]:
features

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,


In [7]:
target

0      0
1      2
2      1
3      0
4      0
      ..
915    1
916    0
917    2
918    0
919    1
Name: num, Length: 920, dtype: int64

In [8]:
#filling categorical NAN with a string so can catboost handle it correctly
features['fbs'] = features['fbs'].fillna('Missing')
features['restecg'] = features['restecg'].fillna('Missing')
features['exang'] = features['exang'].fillna('Missing')
features['slope'] = features['slope'].fillna('Missing')
features['thal'] = features['thal'].fillna('Missing')
#fill numeric NAN with the mean
features['trestbps'] = features['trestbps'].fillna(features['trestbps'].mean())
features['chol'] = features['chol'].fillna(features['chol'].mean())
features['thalch'] = features['thalch'].fillna(features['thalch'].mean())
features['oldpeak'] = features['oldpeak'].fillna(features['oldpeak'].mean())
features['ca'] = features['ca'].fillna(features['ca'].mean())

In [9]:
features

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,Male,typical angina,145.000000,233.0,True,lv hypertrophy,150.000000,False,2.300000,downsloping,0.000000,fixed defect
1,67,Male,asymptomatic,160.000000,286.0,False,lv hypertrophy,108.000000,True,1.500000,flat,3.000000,normal
2,67,Male,asymptomatic,120.000000,229.0,False,lv hypertrophy,129.000000,True,2.600000,flat,2.000000,reversable defect
3,37,Male,non-anginal,130.000000,250.0,False,normal,187.000000,False,3.500000,downsloping,0.000000,normal
4,41,Female,atypical angina,130.000000,204.0,False,lv hypertrophy,172.000000,False,1.400000,upsloping,0.000000,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.000000,333.0,True,st-t abnormality,154.000000,False,0.000000,Missing,0.676375,Missing
916,62,Male,typical angina,132.132404,139.0,False,st-t abnormality,137.545665,Missing,0.878788,Missing,0.676375,Missing
917,55,Male,asymptomatic,122.000000,223.0,True,st-t abnormality,100.000000,False,0.000000,Missing,0.676375,fixed defect
918,58,Male,asymptomatic,132.132404,385.0,True,lv hypertrophy,137.545665,Missing,0.878788,Missing,0.676375,Missing


In [10]:
#making sure all the values are full
print(features.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64


In [11]:
#For categorical features: correctly detected as object or category.
#For numerical features: they are float or int.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    int64  
 1   sex       920 non-null    object 
 2   cp        920 non-null    object 
 3   trestbps  920 non-null    float64
 4   chol      920 non-null    float64
 5   fbs       920 non-null    object 
 6   restecg   920 non-null    object 
 7   thalch    920 non-null    float64
 8   exang     920 non-null    object 
 9   oldpeak   920 non-null    float64
 10  slope     920 non-null    object 
 11  ca        920 non-null    float64
 12  thal      920 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 93.6+ KB


In [12]:
# List of categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
#getting their indices
categorical_features_indices = [features.columns.get_loc(col) for col in categorical_features]

In [13]:
print(categorical_features_indices)

[1, 2, 5, 6, 8, 10, 12]


In [14]:
#devide data between train 80% and test 20%
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [15]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

ValueError: could not convert string to float: 'Male'

In [None]:
model = CatBoostClassifier(iterations=2000, learning_rate=0.05, depth=6, verbose=100)
#verbose=100  # Print progress every 100 iterations
# Train the model
model.fit(X_train, y_train, cat_features = categorical_features_indices, eval_set=(X_test, y_test))

In [16]:
#model predict
y_pred = model.predict(X_test)

NameError: name 'model' is not defined

In [17]:
#See the results of the model
print("Accuracy:", accuracy_score(y_test, y_pred)*100 , "%")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

NameError: name 'y_pred' is not defined

In [18]:
import matplotlib.pyplot as plt

feature_importance = model.get_feature_importance()
plt.barh(features.columns, feature_importance)
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.show()

NameError: name 'model' is not defined