## Utils

In [1]:
import pandas as pd
from catboost import CatBoostClassifier

### Functions

In [2]:
def display_info(data, data_name):
    """Print the shape and the columns of ``data``, labelled with ``data_name``.

    Args:
        data: Object exposing ``.shape`` and ``.columns``.
        data_name: Label shown in the printed summary.
    """
    summary = "Size Of {}: {}\n\nColumns: {}".format(
        data_name, data.shape, data.columns
    )
    print(summary)

## Code

In [3]:
# Load the training table, using the "id" column as the row index.
original_data = pd.read_csv(filepath_or_buffer="train.csv", index_col="id")
# Show only the first row as a quick look at the parsed columns.
original_data.head(n=1)

Unnamed: 0_level_0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence


### Split Data

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Work on a 100-row random subset of the full table. random_state is pinned
# so a fresh "Restart & Run All" draws the same rows and every downstream
# shape, split and metric is reproducible.
df_copy = original_data.sample(n=100, random_state=42)
print(f"Shape Of Sample: {df_copy.shape}\n\nColumns: {df_copy.columns}")

Shape Of Sample: (100, 14)

Columns: Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease'],
      dtype='object')


In [6]:
# Every column except the "Heart Disease" label.
FEATURES = df_copy.drop("Heart Disease", axis=1)
display_info(data=FEATURES, data_name="FEATURES")

Size Of FEATURES: (100, 13)

Columns: Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium'],
      dtype='object')


In [7]:
# The "Heart Disease" label column on its own; print its shape as a sanity check.
TARGET = df_copy.loc[:, "Heart Disease"]
print("Size: {}".format(TARGET.shape))

Size: (100,)


In [8]:
# Alias the inputs and labels under the conventional X / y names.
X = FEATURES
y = TARGET

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# repeatable; stratifying on y keeps the class proportions similar in the
# train and test parts, which matters with a test set this small.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Train Shape Rows x Columns :{X_train.shape, y_train.shape}\n\nTest Shape Rows x Columns:{X_test.shape, y_test.shape}")

Train Shape Rows x Colums :((80, 13), (80,))

Test Shape Rows x Colums:((20, 13), (20,))


### Train

In [9]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

In [11]:
# Hyperparameters for the random forest. random_state is pinned (matching the
# CatBoost settings below) so repeated runs build the same forest and report
# the same metrics.
randomforest_params = {
    'n_estimators': 100,
    'min_samples_split': 2,
    'random_state': 42,
}


# Hyperparameters for CatBoost; verbose=0 turns off its per-iteration logging.
catboost_params = {
    'iterations': 100,
    'depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'verbose': 0,
}

# Build both (still unfitted) estimators from their parameter dictionaries.
catboost_model = CatBoostClassifier(**catboost_params)
randomforest_model = RandomForestClassifier(**randomforest_params)

### MLflow

In [12]:
import mlflow
import mlflow.sklearn

In [15]:
# Point MLflow at the local tracking server and select the experiment that
# the run below is recorded under.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Kaggle Experiment")

with mlflow.start_run(run_name="RandomForest"):
    randomforest_model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of classes_[1], i.e. the
    # greater label; roc_auc_score scores the same class in the binary case.
    preds = randomforest_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, preds)

    # Log the estimator's class name plus its full hyperparameter set. The
    # estimator repr omits values left at their defaults, so logging the
    # object itself would not record the hyperparameters.
    mlflow.log_param("model", type(randomforest_model).__name__)
    mlflow.log_params(randomforest_model.get_params())
    mlflow.log_metric("roc_auc", auc)
    mlflow.sklearn.log_model(randomforest_model, "model")

  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


🏃 View run RandomForesst at: http://127.0.0.1:5000/#/experiments/294982726917469589/runs/b6891667f0d743e8b763cb7f4c746ccd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/294982726917469589
