In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score

# ---------------------------------------------------------------------------
# Data loading and first-look exploration
# ---------------------------------------------------------------------------
# Read the dataset from the current working directory.
df = pd.read_csv('diabetes.csv')

# Quick textual overview: leading rows, per-column summary statistics,
# and the number of null entries in each column.
leading_rows = df.head()
print(leading_rows)

summary_stats = df.describe()
print(summary_stats)

null_counts = df.isna().sum()
print(null_counts)

# Pairwise plots of every column, with points coloured by the 'Outcome' column.
sns.pairplot(data=df, hue='Outcome')
plt.show()

# Heatmap of the pairwise column correlations, annotated to two decimals.
plt.figure(figsize=(10, 8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Preprocessing: mark missing values, separate features/target, split, impute.
#
# In these measurement columns a value of 0 is assumed to be a placeholder for
# "missing" rather than a real reading, so it is converted to NaN first.
# This is a row-wise, deterministic transform and is safe to do before splitting.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

# Split categorical variable if necessary (none in this case)

# Separate the predictors from the target column.
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Standardize the features if necessary

# Hold out 20% of the rows for testing. The split happens BEFORE imputation so
# that statistics of the test rows can never influence the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values with column means learned from the training split only,
# then apply those same training means to the test split. Computing the means on
# the full dataset (before splitting) would leak test-set information into training.
# Note: `df` and `X` intentionally keep NaN at the missing entries; only the
# train/test feature frames are imputed.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)


In [None]:
# Create a decision tree with a fixed random seed and train it on the training
# split in one step (`fit` returns the fitted estimator itself).
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# 5-fold cross-validation of the classifier on the training split; report the
# per-fold scores followed by their average.
cv_scores = cross_val_score(dt_classifier, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


In [None]:
# Class predictions for the held-out test split.
y_pred = dt_classifier.predict(X_test)

# Compute the summary metrics first, then report them in a fixed order.
conf_mat = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_mat)

print("\nClassification Report:")
print(class_report)

print("\nAccuracy Score:", test_accuracy)

# ROC analysis uses the second probability column (index 1) --
# presumably the probability of the positive class (Outcome == 1); confirm
# against dt_classifier.classes_.
y_pred_proba = dt_classifier.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Draw the classifier's ROC curve against the diagonal reference line.
ax = plt.gca()
ax.plot(fpr, tpr, label='Decision Tree Classifier')
ax.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

# Area under the ROC curve, computed from the same probabilities.
auc_value = roc_auc_score(y_test, y_pred_proba)
print("\nAUC Score:", auc_value)


In [None]:
# Tabulate the fitted tree's importance score for each feature column and
# rank the features from most to least important.
feature_importance = (
    pd.DataFrame({
        'Feature': X.columns,
        'Importance': dt_classifier.feature_importances_,
    })
    .sort_values(by='Importance', ascending=False)
)
print("\nFeature Importance:")
print(feature_importance)

# Optional: the decision tree itself can be visualized (not done here),
# for example with a tool such as graphviz.

# Interpretation: inspect the tree's splits, branches, and leaves to identify
# the important variables and their thresholds, and use the splits together
# with the importance scores above to judge which variables are most critical
# for predicting diabetes.
