In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
from sklearn.tree import DecisionTreeClassifier

    
# Optional: persist the fitted decision tree (run only after `tree` has been fitted below).
# with open("model.pkl", "wb") as file:
#     pickle.dump(tree, file)
   

In [2]:
#  1. Load Dataset

In [3]:
# Read the semicolon-delimited cardio dataset from the working directory.
df = pd.read_csv("cardio_train.csv", sep=";")

# The "id" column, when present, is removed before any further processing.
has_id_column = "id" in df.columns
df = df.drop(columns=["id"]) if has_id_column else df

In [4]:
# 2. Preprocessing

In [8]:

# Age is in DAYS -> convert to whole YEARS.
# The converted column is written to a new frame rather than back into `df`,
# so re-running this cell can never divide the age column by 365 twice.
# `df` itself keeps the original age in days.
DAYS_PER_YEAR = 365
df_years = df.assign(age=(df["age"] / DAYS_PER_YEAR).astype(int))

# Separate features & target ("cardio" is the label column).
X = df_years.drop("cardio", axis=1)
y = df_years["cardio"]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardised copies of the features. Tree-based models do not depend on
# feature scale; these copies exist because the random forest cell is fitted
# on them. The scaler is fitted on the training rows only and then applied to
# the test rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline decision tree on the unscaled features. A fixed seed makes the
# fitted tree reproducible across kernel restarts.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

In [9]:
# 3. Train Random Forest Model


In [10]:
# Hyperparameters of the random forest, collected in one place.
rf_settings = {
    "n_estimators": 200,
    "max_depth": 12,
    "min_samples_split": 4,
    "random_state": 42,
}
model = RandomForestClassifier(**rf_settings)

# Fit on the standardised training features.
model.fit(X_train_scaled, y_train)

In [11]:
# 4. Evaluate Model


In [32]:
# Predict on the standardised hold-out features and score against y_test.
pred = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, pred)
report_text = classification_report(y_test, pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

Accuracy: 0.7391428571428571

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.78      0.75      6988
           1       0.76      0.70      0.73      7012

    accuracy                           0.74     14000
   macro avg       0.74      0.74      0.74     14000
weighted avg       0.74      0.74      0.74     14000



In [12]:
from sklearn.model_selection import GridSearchCV


In [13]:
# Candidate hyperparameter values explored by the grid search.
param_grid = dict(
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
    criterion=["gini", "entropy"],
)


In [14]:
# Base estimator handed to the grid search. A fixed random_state makes the
# tree construction (and therefore the search results) reproducible across
# kernel restarts.
# NOTE(review): this rebinds `model`, which previously held the fitted random
# forest; re-running the forest evaluation cell after this one would fail or
# score the wrong estimator.
model = DecisionTreeClassifier(random_state=42)


In [19]:
# Exhaustive search over `param_grid` for the estimator bound to `model`.
search_settings = {
    "estimator": model,
    "param_grid": param_grid,
    "cv": 5,                # 5-fold cross validation
    "scoring": "accuracy",
    "n_jobs": -1,           # use all CPU cores (faster)
}
grid = GridSearchCV(**search_settings)


In [20]:
# Run the cross-validated search on the unscaled training split.
grid.fit(X=X_train, y=y_train)


In [21]:
# Report the winning parameter combination and its cross-validated score.
for label, value in (
    ("Best Parameters:", grid.best_params_),
    ("Best Accuracy:", grid.best_score_),
):
    print(label, value)


Best Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best Accuracy: 0.7258571428571429


In [22]:
# Score the estimator selected by the grid search on the held-out test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

test_accuracy_best = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy_best)


Test Accuracy: 0.7283571428571428
