In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# CSV loaded later in this notebook is read from a folder on that drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# List the project folder on the mounted drive to confirm it is visible
# from the current working directory (the path is relative, not absolute).
!ls "drive/MyDrive/DSGP(Diabetes)"

Data  Python


In [4]:
# Read the health-indicators CSV from the mounted drive into a DataFrame.
# The path is relative to the current working directory, so the drive must
# already be mounted and the working directory must be the one that
# contains the `drive/` mount folder.
csv_path = 'drive/MyDrive/DSGP(Diabetes)/Data/diabetes_012_health_indicators_BRFSS2015.csv'
diabetes = pd.read_csv(csv_path)

In [5]:
# Display the loaded frame (pandas truncates the repr to the first/last rows
# and columns) as a quick check of column names and value ranges.
diabetes

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [6]:
# Feature matrix: a hand-picked subset of the columns in `diabetes`.
# Columns not listed here (and the target `Diabetes_012`) are left out.
feature_columns = [
    'HighBP', 'HighChol', 'BMI', 'Smoker', 'PhysActivity', 'Fruits',
    'Veggies', 'MentHlth', 'DiffWalk', 'Sex', 'Age', 'Income',
]
X = diabetes[feature_columns]

In [7]:
# Display the selected feature columns to confirm the selection.
X

Unnamed: 0,HighBP,HighChol,BMI,Smoker,PhysActivity,Fruits,Veggies,MentHlth,DiffWalk,Sex,Age,Income
0,1.0,1.0,40.0,1.0,0.0,0.0,1.0,18.0,1.0,0.0,9.0,3.0
1,0.0,0.0,25.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0
2,1.0,1.0,28.0,0.0,0.0,1.0,0.0,30.0,1.0,0.0,9.0,8.0
3,1.0,0.0,27.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,11.0,6.0
4,1.0,1.0,24.0,0.0,1.0,1.0,1.0,3.0,0.0,0.0,11.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1.0,1.0,45.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,5.0,7.0
253676,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0,4.0
253677,0.0,0.0,28.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0
253678,1.0,0.0,23.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,7.0,1.0


In [8]:
# Target vector: the `Diabetes_012` column, which holds three class labels
# (0.0, 1.0 and 2.0 appear in the class counts printed further down).
# NOTE(review): presumably 0 = no diabetes, 1 = prediabetes, 2 = diabetes —
# confirm against the dataset's documentation.
y = diabetes.loc[:, 'Diabetes_012']

In [9]:
# Display the target series to confirm it was extracted as expected.
y

Unnamed: 0,Diabetes_012
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
253675,0.0
253676,2.0
253677,0.0
253678,0.0


In [10]:
# Count rows per target class to see how imbalanced the labels are;
# this imbalance is why the training data is oversampled later on.
class_counts = y.value_counts()
print("Class Distribution:\n", class_counts)

Class Distribution:
 Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64


In [11]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

In [13]:
# Oversample the minority classes with SMOTE on the TRAINING split only;
# the test split is left at its original class proportions.
# The fixed random_state keeps the synthetic samples reproducible.
smote = SMOTE(random_state=42)
resampled_training_set = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_training_set

In [14]:
# Train a random forest (default hyperparameters, fixed seed) on the
# SMOTE-balanced training data, then predict labels for the test split.
# `fit` returns the estimator itself, so the chained form binds the same
# fitted object to `rf_clf`.
rf_clf = RandomForestClassifier(random_state=42).fit(
    X_train_resampled, y_train_resampled
)
rf_predictions = rf_clf.predict(X_test)

In [15]:
# Train a single decision tree (default hyperparameters, fixed seed) on the
# same SMOTE-balanced training data as a baseline, then predict labels for
# the test split. `fit` returns the estimator itself.
dt_clf = DecisionTreeClassifier(random_state=42).fit(
    X_train_resampled, y_train_resampled
)
dt_predictions = dt_clf.predict(X_test)

In [16]:
# Score the random forest on the test split: overall accuracy first, then
# per-class precision / recall / F1. With classes this imbalanced, read the
# per-class rows rather than the headline accuracy.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)
print("\nRandom Forest Accuracy:", rf_accuracy)
rf_report = classification_report(y_true=y_test, y_pred=rf_predictions)
print(rf_report)


Random Forest Accuracy: 0.7942552296856933
              precision    recall  f1-score   support

         0.0       0.88      0.89      0.88     64111
         1.0       0.03      0.02      0.02      1389
         2.0       0.34      0.35      0.34     10604

    accuracy                           0.79     76104
   macro avg       0.42      0.42      0.42     76104
weighted avg       0.79      0.79      0.79     76104



In [17]:
# Score the decision tree on the test split: overall accuracy first, then
# per-class precision / recall / F1. With classes this imbalanced, read the
# per-class rows rather than the headline accuracy.
dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_predictions)
print("\nDecision Tree Accuracy:", dt_accuracy)
dt_report = classification_report(y_true=y_test, y_pred=dt_predictions)
print(dt_report)


Decision Tree Accuracy: 0.7717991169977925
              precision    recall  f1-score   support

         0.0       0.87      0.87      0.87     64111
         1.0       0.03      0.03      0.03      1389
         2.0       0.29      0.30      0.29     10604

    accuracy                           0.77     76104
   macro avg       0.40      0.40      0.40     76104
weighted avg       0.78      0.77      0.77     76104

