In [1]:
# Cell 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import fetch_openml
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
# Cell 2: Fetch the Pima Indians Diabetes classification dataset from OpenML
diabetes = fetch_openml(
    name='diabetes',
    version=1,
    as_frame=True,  # ask for pandas output; the frame is read from the result below
)
df = diabetes['frame']

# Preview the top five rows
df.head(5)


Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


In [4]:
# Cell 3: Quick look at the dataset's dimensions and class balance
shape_of_data = df.shape
print("Dataset shape:", shape_of_data)

# Number of rows per target label
print("\nTarget distribution:")
target_counts = df['class'].value_counts()
print(target_counts)


Dataset shape: (768, 9)

Target distribution:
class
tested_negative    500
tested_positive    268
Name: count, dtype: int64


In [7]:
# Cell 4: Preprocessing

# Encode the string target labels as integers (tested_positive -> 1,
# tested_negative -> 0).
#
# The mapping is guarded so that re-running this cell is a no-op. Without the
# guard, a second run would push the already-encoded 0/1 values through the
# label dictionary, and because 0 and 1 are not keys of that dictionary every
# label would silently become NaN.
class_label_map = {'tested_positive': 1, 'tested_negative': 0}
if df['class'].isin(list(class_label_map)).all():
    # astype(int) leaves a plain integer column whatever dtype map() returns.
    df['class'] = df['class'].map(class_label_map).astype(int)

# Fail here, with a clear message, instead of deep inside model fitting if the
# column holds anything other than the two encoded labels (for example NaN
# left behind by an earlier un-guarded run of this cell).
if not df['class'].isin([0, 1]).all():
    raise ValueError(
        "df['class'] must contain only 0/1 after encoding; "
        "re-run the data loading cell to restore the original labels."
    )

# Separate features and labels
X = df.drop('class', axis=1)
y = df['class']

# Train-Test Split: 70% train / 30% test, seeded for reproducibility.
# NOTE(review): the target is imbalanced (500 negative vs 268 positive in the
# distribution printed above); consider passing stratify=y so both splits keep
# that ratio. Left unchanged here so existing results are not altered.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [9]:
# Cell 5: Fit a decision tree on the training split
# (fit() hands back the estimator itself, so construction and training chain)
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out test rows
y_pred_dt = dt_model.predict(X_test)


In [11]:
# Cell 6: Fit a random forest on the training split
rf_params = {'n_estimators': 100, 'random_state': 42}  # forest size and seed
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Class predictions for the held-out test rows
y_pred_rf = rf_model.predict(X_test)
