In [2]:
import pandas as pd
import streamlit as st
from pycaret.classification import setup, create_model, predict_model
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Step 1: Data Reading and EDA

In [3]:
def load_data():
    # Assuming your data is in a CSV file
    data = pd.read_csv('heart.csv')
    return data

In [4]:
data = load_data()

# Step 2: EDA (Exploratory Data Analysis)


In [6]:
st.subheader('Exploratory Data Analysis (EDA)')
st.write(data.head())  # Displaying the first few rows of the dataset

In [7]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [8]:
data.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

# Step 3: Handling Missing Values

In [9]:
st.subheader('Handling Missing Values')
missing_values = data.isnull().sum()
st.write(missing_values)

In [10]:
missing_values_counts =data.isnull().sum()
print(missing_values_counts)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


# Dropping columns with missing values

In [11]:
cols_to_drop = missing_values[missing_values > 0].index
data = data.drop(cols_to_drop, axis=1)
st.write('Updated dataset after dropping columns:')
st.write(data)

In [12]:
print(data.columns)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


# Step 4: Removing Columns

In [13]:
st.subheader('Removing Columns')
columns_to_remove = st.multiselect('exang', data.columns)
data = data.drop(columns_to_remove, axis=1)
st.write('Updated dataset after removing columns:')
st.write(data)

In [14]:
print(data.columns)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


# Step 5: Choosing X and Y Variables and Detecting Task Type

In [16]:
st.subheader('Choosing X and Y Variables and Detecting Task Type')
X_columns = st.multiselect('Select X variables', data.columns)
Y_column = st.selectbox('Select Y variable', data.columns)
task_type = st.selectbox('Select task type', ['Classification', 'Regression'])

# Step 6: Categorical Data Encoding

In [17]:
st.subheader('Categorical Data Encoding')

DeltaGenerator()

In [18]:
# Assuming you have categorical columns to encode
categorical_columns = []  # Add your categorical column names here

if categorical_columns:
    # One-Hot Encoding
    if task_type == 'Classification':
        encoder = OneHotEncoder(sparse=False, drop='first')
    else:
        encoder = OneHotEncoder(sparse=False)
    
    encoded_data = encoder.fit_transform(data[categorical_columns])
    encoded_columns = encoder.get_feature_names(categorical_columns)
    
    # Create a new DataFrame with encoded columns
    encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)
    
    # Concatenate the encoded DataFrame with the original data
    data = pd.concat([data, encoded_df], axis=1)
    
    # Drop the original categorical columns
    data = data.drop(categorical_columns, axis=1)
    
    st.write('Updated dataset after categorical encoding:')
    st.write(data)

In [19]:
def create_label_encoder():
    return LabelEncoder()

In [20]:
label_encoder = create_label_encoder()

In [21]:
data['sex']

0       1
1       1
2       1
3       1
4       0
       ..
1020    1
1021    1
1022    1
1023    0
1024    1
Name: sex, Length: 1025, dtype: int64

In [22]:
label_encoder.fit_transform(data['sex'])

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

In [23]:
# Step 7: PyCaret in Streamlit App
st.subheader('PyCaret in Streamlit App')


DeltaGenerator()

In [24]:
# Preprocess the data using PyCaret's setup function
preprocess_data = setup(data=data, target=Y_column)

Unnamed: 0,Description,Value
0,Session id,4668
1,Target,age
2,Target type,Multiclass
3,Target mapping,"29: 0, 34: 1, 35: 2, 37: 3, 38: 4, 39: 5, 40: 6, 41: 7, 42: 8, 43: 9, 44: 10, 45: 11, 46: 12, 47: 13, 48: 14, 49: 15, 50: 16, 51: 17, 52: 18, 53: 19, 54: 20, 55: 21, 56: 22, 57: 23, 58: 24, 59: 25, 60: 26, 61: 27, 62: 28, 63: 29, 64: 30, 65: 31, 66: 32, 67: 33, 68: 34, 69: 35, 70: 36, 71: 37, 74: 38, 76: 39, 77: 40"
4,Original data shape,"(1025, 14)"
5,Transformed data shape,"(1025, 14)"
6,Transformed train set shape,"(717, 14)"
7,Transformed test set shape,"(308, 14)"
8,Numeric features,13
9,Preprocess,True


In [26]:
# Create a classification model using PyCaret
model = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9722,0.0,0.9722,0.9896,0.9771,0.9712,0.9716
1,0.9583,0.0,0.9583,0.9815,0.963,0.9569,0.9574
2,0.9583,0.0,0.9583,0.9741,0.9562,0.9569,0.9578
3,0.9028,0.0,0.9028,0.9278,0.8931,0.8994,0.9017
4,0.9444,0.0,0.9444,0.9444,0.9381,0.9424,0.943
5,0.9722,0.0,0.9722,0.9778,0.9691,0.9712,0.9716
6,0.9306,0.0,0.9306,0.9236,0.9177,0.928,0.9291
7,0.9296,0.0,0.9296,0.9254,0.9168,0.927,0.9281
8,0.9296,0.0,0.9296,0.9667,0.9292,0.9271,0.9288
9,0.9437,0.0,0.9437,0.9554,0.9414,0.9415,0.9421


In [28]:
# Fit the model
trained_model = predict_model(model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9286,0.9833,0.9286,0.9299,0.9233,0.926,0.9266


In [29]:
# Display the model summary
st.subheader('Model Summary')
st.write(trained_model)

In [38]:
from pycaret.classification import compare_models

In [39]:
beast_model=compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9846,0.999,0.981,0.9893,0.9849,0.9693,0.9697,0.29
lightgbm,Light Gradient Boosting Machine,0.9833,0.9975,0.9918,0.9774,0.9841,0.9665,0.9675,0.259
et,Extra Trees Classifier,0.9791,0.9997,0.9782,0.9821,0.9797,0.9581,0.9591,0.295
dt,Decision Tree Classifier,0.9763,0.9766,0.9646,0.9891,0.9763,0.9525,0.9536,0.125
gbc,Gradient Boosting Classifier,0.9707,0.9902,0.9782,0.9662,0.9718,0.9414,0.9422,0.273
ada,Ada Boost Classifier,0.9024,0.9706,0.9105,0.9014,0.9053,0.8047,0.8059,0.218
ridge,Ridge Classifier,0.8509,0.9109,0.8832,0.8386,0.8591,0.701,0.7045,0.122
lda,Linear Discriminant Analysis,0.8509,0.9109,0.8832,0.8386,0.8591,0.701,0.7045,0.127
lr,Logistic Regression,0.8481,0.9134,0.886,0.832,0.8573,0.6953,0.6986,0.274
nb,Naive Bayes,0.8425,0.8847,0.8833,0.8266,0.8526,0.6841,0.6885,0.124


In [40]:
predict_model(beast_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,prediction_label,prediction_score
129,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1,1,0.85
750,55,1,1,130,262,0,1,155,0,0.0,2,0,2,1,1,0.96
138,47,1,2,138,257,0,0,156,0,0.0,2,0,2,1,1,0.91
107,62,1,1,120,281,0,0,103,0,1.4,1,1,3,0,0,0.99
910,50,1,2,140,233,0,1,163,0,0.6,1,1,3,0,0,0.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,44,1,0,120,169,0,1,144,1,2.8,0,0,1,0,0,0.96
927,42,1,2,120,240,1,1,194,0,0.8,0,0,3,1,1,0.73
322,45,1,0,142,309,0,0,147,1,0.0,1,3,3,0,0,0.95
406,58,1,2,140,211,1,0,165,0,0.0,2,0,2,1,1,0.98


In [41]:
predict_model(beast_model,data.tail())

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,prediction_label,prediction_score
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1,1,1.0
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0,0,1.0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0,0,1.0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1,1,0.9
1024,54,1,0,120,188,0,1,113,0,1.4,1,1,3,0,0,1.0


In [42]:
predict_model(beast_model,data.drop('target',axis=1).tail())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,prediction_label,prediction_score
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1,1.0
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0,1.0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0,1.0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1,0.9
1024,54,1,0,120,188,0,1,113,0,1.4,1,1,3,0,1.0


In [43]:
from pycaret.classification import save_model

In [44]:
save_model(beast_model,model_name='ridge.model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['age', 'trestbps', 'chol',
                                              'thalach', 'oldpeak', 'slope',
                                              'ca'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
            