In [None]:
Q1. Import the dataset and examine the variables

Import necessary libraries:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Load the dataset:


# Location of the CSV file (a Google Drive direct-download style link).
url = 'https://drive.google.com/uc?id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2'

# Fetch and parse the CSV into a DataFrame used by every later step.
diabetes_df = pd.read_csv(filepath_or_buffer=url)
Examine the dataset:


# Preview the first rows. Printed explicitly because a bare expression is only
# echoed when it is the last statement of a notebook cell (and never in a script).
print(diabetes_df.head())

# Column names, dtypes and non-null counts; info() prints its report itself.
diabetes_df.info()

# Summary statistics for the numeric columns.
print(diabetes_df.describe())
Visualize the data:


# Pairwise plots of all columns, with points coloured by the 'Outcome' column.
sns.pairplot(data=diabetes_df, hue='Outcome')
plt.show()

# Annotated heatmap of the pairwise correlation matrix of the columns.
correlation_matrix = diabetes_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm')
plt.show()



Q2. Preprocess the data

Handle missing values:

# Count explicit missing values (NaN) per column. Printed so the result is
# visible even when this is not the last statement of a notebook cell.
print(diabetes_df.isnull().sum())

# NOTE(review): NaN counts alone may understate missing data. In diabetes
# datasets with these column names, a value of 0 in the columns below is often
# a placeholder for "not measured" -- confirm against the data dictionary
# before concluding that nothing is missing.
zero_check_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print((diabetes_df[zero_check_columns] == 0).sum())

Remove outliers (rows where any of the selected columns has an absolute z-score of 3 or more are dropped; other columns can be added to the list):

from scipy import stats

# Columns screened for outliers.
outlier_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Standardise each screened column and take the magnitude of every z-score.
z_scores = stats.zscore(diabetes_df[outlier_columns])
abs_z_scores = np.abs(z_scores)

# Keep a row only when every screened column has |z| strictly below 3.
filtered_entries = (abs_z_scores < 3).all(axis=1)
diabetes_df = diabetes_df.loc[filtered_entries]

Transform categorical variables (if necessary):

There are no categorical variables to transform into dummy variables in this dataset.



Q3. Split the dataset into training and test sets

Split the data:

from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is 'Outcome'.
X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of 'Outcome' the same in the training and test sets, so the test metrics are
# not distorted by an unlucky class mix; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Q4. Train a decision tree model

Train the decision tree:

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the model. A fixed random_state makes tree construction (and therefore
# the grid-search winner) reproducible, matching the seeded train/test split.
dt = DecisionTreeClassifier(random_state=42)

# Hyperparameter grid: every combination below is evaluated.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 10-fold cross-validation on the training set only,
# using all available cores (n_jobs=-1) and per-fit progress output (verbose=2).
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# The estimator refit on the full training set with the best parameters found.
best_dt = grid_search.best_estimator_
Q5. Evaluate the model

Evaluate using metrics:

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    auc,
)

# Hard class predictions for the held-out test set.
y_pred = best_dt.predict(X_test)

# Report each headline metric, in the same order and format as before.
metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}
for metric_name, metric_function in metric_functions.items():
    print(f"{metric_name}: {metric_function(y_test, y_pred)}")

# Confusion matrix drawn as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(data=cm, annot=True, fmt='d')
plt.show()

# ROC curve built from the scores in the second predict_proba column.
y_pred_proba = best_dt.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc_value = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f"AUC={auc_value}")
plt.legend(loc=4)  # matplotlib location code 4 is 'lower right'
plt.show()
Q6. Interpret the decision tree

Interpret the tree:

from sklearn.tree import plot_tree

# Draw the fitted tree on a wide canvas so node text stays legible.
plt.figure(figsize=(20, 10))
# Pass the feature names as a plain list: some scikit-learn releases validate
# this argument as a list and reject a pandas Index such as X.columns.
plot_tree(best_dt, filled=True, feature_names=list(X.columns))
plt.show()
Q7. Validate the decision tree model

Validate with new data (if available):

# new_data is expected to be a DataFrame with the same features as the training
# data. It is not defined in the code shown here, so look it up defensively:
# calling predict unconditionally would raise NameError when it is absent.
new_data = globals().get('new_data')
if new_data is not None:
    new_data_predictions = best_dt.predict(new_data)
else:
    # Nothing to validate against; leave a well-defined placeholder.
    new_data_predictions = None
    print("No new_data provided; skipping prediction on new data.")
Sensitivity analysis:

# Example sensitivity analysis: how the model's output responds to glucose.
# Every test row has its 'Glucose' value overwritten with one candidate level
# while all other features keep their observed values.
import numpy as np

# 100 evenly spaced glucose levels spanning the range seen in the feature table.
glucose_levels = np.linspace(X['Glucose'].min(), X['Glucose'].max(), 100)
sensitivity_results = []

for glucose in glucose_levels:
    # Work on a copy so X_test itself is never modified.
    temp_data = X_test.copy()
    temp_data['Glucose'] = glucose
    # Use predicted probabilities (second predict_proba column, as in the ROC
    # step) rather than hard 0/1 labels, so the plotted quantity matches the
    # 'Predicted Diabetes Probability' axis label.
    # NOTE(review): assumes that column corresponds to the diabetic class -- confirm.
    preds = best_dt.predict_proba(temp_data)[:, 1]
    sensitivity_results.append(preds.mean())

plt.plot(glucose_levels, sensitivity_results)
plt.xlabel('Glucose Levels')
plt.ylabel('Predicted Diabetes Probability')
plt.show()