In [None]:
#Q1--
# Answer--
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the diabetes CSV (expected in the working directory) into a DataFrame
diabetes_data = pd.read_csv("diabetes.csv")

# Preview the leading rows
preview_rows = diabetes_data.head()
print(preview_rows)

# Descriptive statistics (count, mean, std, quartiles, ...) per column
summary_stats = diabetes_data.describe()
print(summary_stats)

# One histogram per column, drawn on a single 12x10 figure
diabetes_data.hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

# Heatmap of the pairwise correlations between columns
correlations = diabetes_data.corr()
heatmap_options = {
    'annot': True,        # write the coefficient inside each cell
    'fmt': ".2f",         # two decimals is enough to compare strengths
    'cmap': 'coolwarm',   # diverging palette: negative vs. positive correlation
    'linewidths': 0.5,
}
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()


In [None]:
#Q2--
# Answer----
# Check for missing values
print(diabetes_data.isnull().sum())

# Remove outliers using the IQR method.
# The fences are computed on the feature columns only. 'Outcome' is a binary
# class label, not a measurement: if fewer than 25% of the rows were positive,
# its Q1 and Q3 would both be 0 (IQR == 0) and every positive row would be
# discarded as an "outlier".
feature_columns = diabetes_data.columns.drop('Outcome')
features = diabetes_data[feature_columns]

Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is dropped when at least one of its feature values lies outside the fences
is_outlier_row = ((features < lower_fence) | (features > upper_fence)).any(axis=1)
diabetes_data_cleaned = diabetes_data[~is_outlier_row]

# Encode categorical variables into dummy variables if necessary
# No categorical variables are present in this dataset

# Display the shape of the cleaned dataset
print("Shape of cleaned dataset:", diabetes_data_cleaned.shape)


In [None]:
#Q3--
# Answer--
# NOTE: this import used to be fused into the "# Answer--" comment line, which
# left train_test_split undefined on a fresh kernel.
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y)
X = diabetes_data_cleaned.drop('Outcome', axis=1)
y = diabetes_data_cleaned['Outcome']

# Split the dataset into training and test sets (70% train, 30% test).
# random_state pins the shuffle so re-running the cell yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the shapes of the train and test sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


In [None]:
#Q4--
# Answer--
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter tuning
# (2 criteria x 4 depths x 3 split sizes x 3 leaf sizes = 72 candidates)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the decision tree classifier (fixed seed keeps tie-breaking reproducible)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Perform GridSearchCV (5-fold cross-validation on the training set only)
# to find the best hyperparameters
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV refits the winning configuration on the whole training set
# (refit=True is the default), so the tuned model is already trained and
# available as best_estimator_; building and fitting a second, identical tree
# would only repeat that work.
best_dt_model = grid_search.best_estimator_
best_dt_model


In [None]:
#Q5--
# Answer--
# NOTE: this import used to be fused into the "# Answer--" comment line, which
# left every metric function below undefined on a fresh kernel.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Predict on the test set
y_pred = best_dt_model.predict(X_test)

# Calculate evaluation metrics (all computed on the held-out test set)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Plot confusion matrix (rows = true labels, columns = predicted labels)
conf_mat = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

# Plot ROC curve.
# The curve needs scores rather than hard labels, so use the predicted
# probability of the positive class (second column of predict_proba).
y_proba = best_dt_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = roc_auc_score(y_test, y_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
# Diagonal = performance of a random classifier, drawn for reference
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
#Q6--
# Answer--
from sklearn.tree import plot_tree

# Plot the decision tree.
# feature_names is passed as a plain list: X.columns is a pandas Index, and
# some scikit-learn releases validate this parameter strictly as a list and
# reject an Index. A list is accepted by every version.
plt.figure(figsize=(20, 10))
plot_tree(
    best_dt_model,
    feature_names=list(X.columns),
    class_names=['Non-diabetic', 'Diabetic'],  # order follows the sorted labels 0, 1
    filled=True,
    fontsize=10,
)
plt.show()
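## This code plots the decision tree, displaying its splits, branches, and leaves.
## Each internal node represents a decision based on a feature and a threshold; each leaf gives the predicted class.
## Variables that appear higher up in the tree and are used for early splits tend to be the most important,
## as those splits affect the most samples and therefore have the largest impact on the predictions.
## Position in the tree is only a heuristic; best_dt_model.feature_importances_ gives a quantitative ranking.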

In [None]:
#Q7--
# Answer--
# Apply the Model to New Data:
#
# - Load a new dataset containing the same clinical variables in VS Code.
# - Use the trained decision tree model to predict outcomes for the new data.
# - Evaluate the model's performance metrics, such as accuracy, precision, recall, and F1 score, using Python code within VS Code.
# Sensitivity Analysis:
#
# - Write Python code in VS Code to vary one or more input variables within a plausible range.
# - Use the trained decision tree model to predict outcomes for the varied inputs.
# - Analyze how the predictions change and assess the model's sensitivity to changes in each input variable.
# Scenario Testing:
#
# - Define different scenarios representing potential changes in the dataset or environment.
# - Implement Python code in VS Code to apply the decision tree model to each scenario.
# - Evaluate the model's predictions under the different conditions and assess its consistency and reliability across scenarios.
# Cross-Validation:
#
# - Use Python libraries such as scikit-learn within VS Code to perform cross-validation on multiple subsets of the original dataset.
# - Assess the stability and generalization ability of the model by comparing performance metrics across the different folds of the data.
# Out-of-Sample Testing:
#
# - Load additional datasets or test sets in VS Code.
# - Apply the trained decision tree model to these unseen datasets.
# - Evaluate the model's performance on the new data and assess its ability to generalize to unseen data.