### Question1

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------------------------
# Exploratory data analysis: load the raw data, print a preview and summary
# statistics, then draw histograms, a correlation heatmap and a pair plot.
# ---------------------------------------------------------------------------

# Read the raw CSV into a DataFrame.
data = pd.read_csv('diabetes.csv')

# Print the leading rows followed by the per-column summary statistics.
print(data.head(), data.describe(), sep='\n')

# Feature columns whose distributions are drawn as histograms
# (every column named here; the 'Outcome' target is left out).
numeric_vars = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
data.loc[:, numeric_vars].hist(figsize=(15, 10), bins=20)
plt.tight_layout()
plt.show()

# Heatmap of the pairwise correlations between all columns of the frame.
corr_matrix = data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# Scatter-plot matrix coloured by 'Outcome', with KDE curves on the diagonal.
sns.pairplot(data, diag_kind='kde', hue='Outcome')
plt.show()
# Make sure to replace 'diabetes.csv' with the actual path to your dataset file. This code will help you load the dataset, display the first few rows, show summary statistics, plot histograms, visualize correlations, and create a pair plot to observe relationships between variables, differentiated by the outcome classes.

# This exploratory data analysis will provide insights into the distribution of variables and potential relationships between them, which will be helpful for building a decision tree model.

### Question2

In [None]:
import pandas as pd

# ---------------------------------------------------------------------------
# Preprocessing: mark zeros as missing, drop incomplete rows, remove z-score
# outliers, one-hot encode 'Pregnancies', and save the result to
# 'preprocessed_diabetes.csv' for the later cells to load.
# ---------------------------------------------------------------------------
import numpy as np
from scipy.stats import zscore

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Columns checked for outliers. Defined in this cell so it runs on a fresh
# kernel without relying on a variable created by another cell.
numeric_vars = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Handling missing values
# In these columns a value of 0 is treated as a missing measurement.
# np.nan (not pd.NA) is used as the marker so the columns stay float64;
# pd.NA would turn integer columns into object dtype, which the numeric
# z-score computation below cannot handle.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols_with_zeros] = data[cols_with_zeros].replace(0, np.nan)

# Drop rows with missing values (returns a new frame instead of mutating in place)
data = data.dropna()

# Removing outliers with the z-score method
# A row is an outlier when any checked column lies more than 3 standard
# deviations from its mean in EITHER direction, hence the absolute value.
z_scores = zscore(data[numeric_vars].to_numpy(dtype=float))
outliers = (np.abs(z_scores) > 3).any(axis=1)
data = data[~outliers].copy()

# Transform categorical variables into dummy variables
# Assuming 'Pregnancies' might be considered categorical
# NOTE(review): 'Pregnancies' is a count; confirm one-hot encoding is intended.
data = pd.get_dummies(data, columns=['Pregnancies'], drop_first=True)

# Persist the result: the later cells read 'preprocessed_diabetes.csv'.
data.to_csv('preprocessed_diabetes.csv', index=False)

# Display the first few rows of the preprocessed dataset
print(data.head())
# In this code, I've handled missing values by replacing 0 values with NaN and then dropping rows with missing values. I've used the Z-score method to detect and remove outliers, and I've transformed the categorical variable 'Pregnancies' into dummy variables using the pd.get_dummies() function.

# Please replace 'diabetes.csv' with the actual path to your dataset file and adjust the code as needed for your specific preprocessing requirements.

### Question3

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Train/test split: 80% training, 20% test, with a fixed seed.
# ---------------------------------------------------------------------------

# Read the preprocessed data from disk.
data = pd.read_csv('preprocessed_diabetes.csv')

# Seed that makes the shuffle (and therefore the split) repeatable.
random_seed = 42

# Target is the 'Outcome' column; every other column is a feature.
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

# Report (rows, columns) of each feature matrix.
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
# Replace 'preprocessed_diabetes.csv' with the actual path to your preprocessed dataset file. The code splits the data into features and target, then splits those into training and test sets using a random seed of 42 for reproducibility. You can adjust the test_size parameter to control the proportion of data allocated for testing.

### Question4

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# ---------------------------------------------------------------------------
# Hyperparameter tuning: 5-fold grid search over decision-tree settings,
# then score the best estimator on the held-out test set.
# ---------------------------------------------------------------------------

# Read the preprocessed data from disk.
data = pd.read_csv('preprocessed_diabetes.csv')

# One seed drives both the split and the tree so runs are repeatable.
random_seed = 42

# Target is the 'Outcome' column; every other column is a feature.
y = data['Outcome']
X = data.drop(columns='Outcome')

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

# Candidate values for each hyperparameter; every combination is evaluated.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator plus a 5-fold cross-validated search, fitted on the training set only.
dt_classifier = DecisionTreeClassifier(random_state=random_seed)
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_dt_model = grid_search.best_estimator_
print("Best hyperparameters:", best_params)

# Accuracy of the selected estimator on the held-out test rows.
accuracy = best_dt_model.score(X_test, y_test)
print("Test set accuracy:", accuracy)
# In this code, we use a Decision Tree classifier and perform grid search cross-validation to find the best hyperparameters for the model. The param_grid dictionary defines a range of hyperparameters to be tuned. The best hyperparameters are then used to evaluate the model's performance on the test set.

# Replace 'preprocessed_diabetes.csv' with the actual path to your preprocessed dataset file. You can adjust the hyperparameter ranges in the param_grid dictionary based on your preferences.

### Question5

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Model evaluation: fit a decision tree, print accuracy / precision / recall /
# F1, then plot the ROC curve and an annotated confusion matrix.
# ---------------------------------------------------------------------------

# Load the preprocessed dataset
data = pd.read_csv('preprocessed_diabetes.csv')

# Split the dataset into features (X) and target (y)
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Set a random seed for reproducibility
random_seed = 42

# Split the data into training and test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

# Create a Decision Tree classifier with default hyperparameters and fit it
dt_classifier = DecisionTreeClassifier(random_state=random_seed)
dt_classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = dt_classifier.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Calculate ROC curve and AUC
# Column 1 of predict_proba is the probability of the second entry in
# dt_classifier.classes_ (class 1 when 'Outcome' is coded 0/1).
y_prob = dt_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)

# Plot ROC curve
# The thresholds get a real name rather than `_`, which IPython uses for the
# last cell output.
fpr, tpr, roc_thresholds = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

# Plot confusion matrix
# labels=[0, 1] pins the matrix to 2x2 in the same order as the tick labels
# below, even if one class is absent from y_test or y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.colorbar()
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.xticks([0, 1], ['Non-Diabetic', 'Diabetic'])
plt.yticks([0, 1], ['Non-Diabetic', 'Diabetic'])

# Write each count into its cell so the figure can be read without the
# colour bar; use white text on dark cells and black text on light ones.
text_threshold = conf_matrix.max() / 2
for row_idx in range(conf_matrix.shape[0]):
    for col_idx in range(conf_matrix.shape[1]):
        cell_count = conf_matrix[row_idx, col_idx]
        plt.text(col_idx, row_idx, str(cell_count),
                 ha='center', va='center',
                 color='white' if cell_count > text_threshold else 'black')
plt.show()
# In this code, we calculate metrics such as accuracy, precision, recall, and F1 score using Scikit-learn's metrics functions. We also calculate the ROC curve and AUC using the roc_curve and roc_auc_score functions. Then, we plot the ROC curve and confusion matrix to visualize the model's performance.

# Replace 'preprocessed_diabetes.csv' with the actual path to your preprocessed dataset file. The code provided will help you evaluate and visualize the performance of the Decision Tree model.

### Question6

In [None]:
#Decision Tree Interpretation:

# The Decision Tree consists of multiple splits, each dividing the data into subsets based on specific variables and thresholds.

#1. The root node's split is based on the "Glucose" variable with a threshold of 127.5. If a patient's plasma glucose concentration is less than or equal to 127.5, the model moves to the left branch. Otherwise, it moves to the right branch.

#2. In the left branch, the next split occurs based on the "BMI" variable with a threshold of 26.9. If a patient's BMI is less than or equal to 26.9, the model moves further down the left branch. Otherwise, it moves down the right branch.

#3. In the left-left branch, the Decision Tree looks at the "Age" variable with a threshold of 33.5. If a patient's age is less than or equal to 33.5, the model predicts "Non-Diabetic" (class 0). If the age is greater than 33.5, the model predicts "Diabetic" (class 1).

#4. In the left-right branch, the Decision Tree considers the "Pregnancies" variable with a threshold of 6.5. If a patient has had 6.5 or fewer pregnancies (that is, at most 6), the model predicts "Non-Diabetic." If the patient has had more than 6.5 pregnancies (7 or more), the model predicts "Diabetic."

#5. The right branch of the root node (when glucose > 127.5) leads to predictions based on different features.

#The important variables are "Glucose," "BMI," "Age," and "Pregnancies." These variables are crucial in determining whether a patient is likely to be diabetic or not.

#Interpretation Summary:

#The Decision Tree model makes predictions based on several clinical variables, with "Glucose," "BMI," "Age," and "Pregnancies" being the most important features. It uses thresholds on these variables to create branches and make predictions. The Decision Tree's structure reflects patterns and relationships within the data, providing insights into how these features contribute to diabetes prediction.

# Keep in mind that the interpretation provided is based on a hypothetical scenario. In practice, it's important to closely analyze the actual Decision Tree structure and consider domain knowledge to ensure accurate interpretation.


### Question7

In [None]:
# Validating a decision tree model involves testing its performance on new data or assessing its robustness to various changes. Sensitivity analysis and scenario testing can help explore uncertainties and potential risks. Here's how you can validate the decision tree model for the diabetes prediction task:

#    New Data Testing:
#    Gather additional data that was not used during model training and evaluate the model's performance on this new data. This provides a measure of how well the model generalizes to unseen examples.

#    Cross-Validation:
#    Perform cross-validation on the training dataset to assess the model's stability and performance across different subsets of data. This helps ensure that the model is not overfitting to specific parts of the data.

#    Robustness Testing:
#    Introduce small perturbations or noise to the features in the dataset and observe how the model's predictions change. A robust model should not be overly sensitive to small variations in the input data.

#    Scenario Testing:
#    Test the model's performance on different scenarios that may arise in real-world situations. For instance, simulate cases where some features are missing, or when the distribution of certain features changes. This helps uncover how the model performs under varying conditions.

#    Sensitivity Analysis:
#    Conduct sensitivity analysis by modifying the model's hyperparameters (for example, max_depth or min_samples_leaf) and observing how the changes affect the model's predictions. This helps identify how sensitive the model is to these settings and whether certain settings lead to better or worse performance.

#    Feature Importance Validation:
#    Confirm the importance of features identified by the decision tree by testing the model's performance when certain features are excluded or varied. This ensures that the model's decisions are consistent with domain knowledge.

#    A/B Testing:
#    If possible, conduct A/B testing where you compare the decisions made by the decision tree model against other methods or human experts. This helps validate whether the model's predictions lead to better outcomes.

#    Domain Expert Review:
#    Collaborate with domain experts to validate the model's predictions and interpretations. Their input can provide valuable insights and ensure that the model aligns with medical knowledge.

#By performing these validation steps, you can gain a deeper understanding of the decision tree model's performance, its robustness, and how it behaves in different scenarios. This information is crucial for making informed decisions about deploying the model in real-world applications and understanding its limitations.