# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

# Step 1: Load the dataset
# The CSV is fetched over the network from the Google Drive URL below.
url = "https://drive.google.com/uc?id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2"
diabetes_data = pd.read_csv(url)

# Step 2: Examine the dataset
print("Data Preview:")
print(diabetes_data.head())

print("\nDescriptive Statistics:")
print(diabetes_data.describe())

print("\nInfo about the dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
diabetes_data.info()

# Step 3: Visualizations to understand distributions
# Histogram grid of the numerical columns (20 bins each)
diabetes_data.hist(figsize=(15, 10), bins=20)
plt.tight_layout()
plt.show()

# Heatmap of the pairwise correlation matrix, annotated to two decimals
corr = diabetes_data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# Pairwise feature plots coloured by Outcome, with histograms on the diagonal
sns.pairplot(
    diabetes_data,
    hue='Outcome',
    diag_kind='hist',
)
plt.show()

# Step 4: Handle missing values
# Count the nulls in each column and report them
missing_counts = diabetes_data.isnull().sum()
print("\nMissing Values in the dataset:")
print(missing_counts)

# Impute in place: each null is replaced by the median of its own column
column_medians = diabetes_data.median()
diabetes_data.fillna(value=column_medians, inplace=True)

# Step 5: Remove outliers using IQR method
# The IQR fences are computed on the feature columns only. The binary target
# 'Outcome' must be excluded: if one class makes up fewer than 25% of the rows,
# its Q1 and Q3 coincide, the IQR is 0, and every row of the minority class
# would be flagged as an "outlier" and silently dropped.
outlier_features = diabetes_data.drop('Outcome', axis=1)
Q1 = outlier_features.quantile(0.25)
Q3 = outlier_features.quantile(0.75)
IQR = Q3 - Q1

# A row is discarded when any of its feature values lies outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for that feature.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (outlier_features < lower_fence) | (outlier_features > upper_fence)
filtered_data = diabetes_data[~is_outlier.any(axis=1)]

# Step 6: Split the dataset into training and testing sets
X = filtered_data.drop('Outcome', axis=1)
y = filtered_data['Outcome']

# Split the data: 20% is held out for testing, with a fixed seed for
# reproducibility. stratify=y keeps the Outcome class proportions the same in
# both partitions, so the test metrics are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")

# Step 7: Hyperparameter tuning using GridSearchCV
# Candidate settings; every combination is scored with 5-fold cross-validation
param_grid = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Base estimator, seeded so tree construction is repeatable
dt_model = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(dt_model, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Report the combination with the best mean cross-validated accuracy
print(f"\nBest Hyperparameters: {grid_search.best_params_}")

# Step 8: Train the model with the best hyperparameters
# GridSearchCV was created with its default refit=True, so after the search it
# has already retrained the winning configuration on the whole training set.
# best_estimator_ is therefore ready to use; fitting it again would only repeat
# the same work.
best_dt_model = grid_search.best_estimator_

# Step 9: Evaluate the model on the test set
y_pred = best_dt_model.predict(X_test)

# Step 10: Calculate and display performance metrics
# Each scorer compares the held-out labels with the predictions above
accuracy, precision, recall, f1 = [
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f"\nModel Performance Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Confusion matrix
# Rows are the true labels, columns the predicted labels
class_labels = ['Non-Diabetic', 'Diabetic']
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
cm_ax.set_title('Confusion Matrix')
plt.show()

# ROC Curve
# Scores are taken from column 1 of predict_proba on the test features
positive_scores = best_dt_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

# Plot ROC Curve
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc='lower right')
plt.show()

# Step 11: Visualize the decision tree
plt.figure(figsize=(20, 10))
# The column names are passed as a plain list rather than a pandas Index:
# some scikit-learn releases validate feature_names strictly and raise on an
# Index, while a list is accepted by all of them.
plot_tree(
    best_dt_model,
    filled=True,
    feature_names=list(X.columns),
    class_names=['Non-Diabetic', 'Diabetic'],
    fontsize=10,
)
plt.title('Decision Tree Visualization')
plt.show()

# Step 12: Check feature importance
# Pair every feature name with the importance the fitted tree assigned to it
importance = best_dt_model.feature_importances_
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': importance})

# Rank the features from most to least important
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance_df)

# Step 13: Sensitivity analysis to test model robustness
# Perturb only the Glucose column of a copy of the test set with zero-mean
# Gaussian noise (standard deviation 0.1) and re-score the model. The noise
# comes from a seeded generator so the reported figure is reproducible, in
# line with the random_state=42 used for the split and the classifier.
rng = np.random.default_rng(42)
X_test_noisy = X_test.copy()
X_test_noisy['Glucose'] += rng.normal(0, 0.1, size=X_test_noisy.shape[0])
# NOTE(review): the noise scale is absolute, not relative to the spread of
# Glucose - confirm 0.1 is large enough to be a meaningful perturbation.

# Predict on the noisy test set
y_pred_noisy = best_dt_model.predict(X_test_noisy)

# Evaluate the performance on the noisy data: only the FPR/TPR arrays returned
# by roc_curve are needed for the area computation.
noisy_scores = best_dt_model.predict_proba(X_test_noisy)[:, 1]
fpr_noisy, tpr_noisy = roc_curve(y_test, noisy_scores)[:2]
roc_auc_noisy = auc(fpr_noisy, tpr_noisy)
print(f"\nROC AUC with noisy data: {roc_auc_noisy}")
