In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [3]:
data = pd.read_excel('/Clean_data.xlsx')

In [4]:
data = data[data['DIABETE3'].isin([1, 3])]

In [5]:
data = data.dropna()

In [6]:
X = data.drop(columns=['DIABETE3'])
y = data['DIABETE3']

In [7]:
y.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
DIABETE3,Unnamed: 1_level_1
3,0.904326
1,0.095674


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [17]:
rf = RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample')

In [18]:
param_dist = {
    'n_estimators': np.arange(100, 1001, 100),
    'max_depth': [None] + list(np.arange(10, 101, 10)),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [31]:
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=1000,
    cv=skf,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [32]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits




KeyboardInterrupt: 

In [None]:
print(f"Best Hyperparameters: {random_search.best_params_}")

In [None]:
y_pred_proba = random_search.best_estimator_.predict_proba(X_test)[:, 1]

In [None]:
precision_scores = []
recall_scores = []

In [None]:
thresholds = np.arange(0.2, 1.1, 0.01)

In [None]:
for t in thresholds:
    y_pred_threshold = np.where(y_pred_proba > t, 1, 0)

    precision = precision_score(y_test, y_pred_threshold)
    recall = recall_score(y_test, y_pred_threshold)

    precision_scores.append((t, precision))
    recall_scores.append((t, recall))

    print(f"Threshold: {t:.2f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(thresholds, [p[1] for p in precision_scores], label='Precision')
plt.plot(thresholds, [r[1] for r in recall_scores], label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs. Threshold')
plt.legend()
plt.show()