In [18]:
## Q1.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('diabetes.csv')

# Inspect the data: first rows, per-column dtypes / non-null counts, and summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() would add a stray "None" line after the report.
df.info()
print(df.describe())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [12]:
## Q2.
# Missing values: in the columns listed below a 0 is treated as a missing reading,
# so every 0 is swapped for NaN and shows up in the null counts printed at the end.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zeros] = df[cols_with_zeros].replace(to_replace=0, value=np.nan)

# Outliers: nothing is applied here. If it becomes necessary, winsorization
# (scipy.stats.mstats.winsorize) or outlier removal are the candidate approaches.

# Categorical variables: none in this dataset as per its description, so no dummy
# encoding (pd.get_dummies) is performed.

# Report how many missing values remain in each column
print(df.isna().sum())

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [19]:
## Q3.
from sklearn.model_selection import train_test_split

# Split into features (X) and target (y)
X = df.drop(columns='Outcome')
y = df['Outcome']

# Hold out 20% of the rows for testing. The fixed seed makes the split reproducible,
# and stratifying on y keeps the Outcome class proportions the same in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The preprocessing step turns 0 readings into NaN but does not fill them in, and
# older scikit-learn releases reject NaN inputs to tree estimators. Fill the gaps
# with per-column medians computed on the training rows only, so that no statistic
# of the test set leaks into training. This is a no-op when there are no NaNs.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [None]:
## Q4.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Candidate values for max_depth (1 through 9)
depth_range = range(1, 10)

# Mean 5-fold cross-validated accuracy on the training set for each candidate depth.
# cv_scores[i] corresponds to depth_range[i].
cv_scores = [
    cross_val_score(
        DecisionTreeClassifier(max_depth=depth, random_state=42),
        X_train,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    for depth in depth_range
]

# np.argmax returns the first maximum, so ties resolve to the shallowest tree
optimal_depth = depth_range[np.argmax(cv_scores)]
print(f'Optimal depth: {optimal_depth}')

# Train the decision tree with the optimal depth on the full training set
clf = DecisionTreeClassifier(max_depth=optimal_depth, random_state=42)
clf.fit(X_train, y_train)

In [None]:
## Q5.
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
)

# Test-set outputs of the fitted tree: hard class labels, plus the predicted
# probability of the second class (column 1 of predict_proba) for the ROC analysis.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Label-based metrics and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Probability-based metrics: ROC curve points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)

# ROC curve with the chance diagonal drawn as a dashed black reference line
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
plt.show()

# Metric summary, one metric per line
print(
    f'Accuracy: {accuracy:.2f}',
    f'Precision: {precision:.2f}',
    f'Recall: {recall:.2f}',
    f'F1 Score: {f1:.2f}',
    sep='\n',
)

# Confusion matrix under its heading
print('Confusion Matrix:', cm, sep='\n')

In [None]:
## Q6.
# Map each feature column name to the importance the fitted tree assigned to it
feature_importance = {
    name: score for name, score in zip(X.columns, clf.feature_importances_)
}

# Rank the features from most to least important
sorted_importance = sorted(
    feature_importance.items(), key=lambda pair: pair[1], reverse=True
)

print('Feature Importance:')
for feature, importance in sorted_importance:
    print(f'{feature}: {importance:.4f}')

# Optional: the fitted tree can also be drawn with sklearn.tree.plot_tree
# (passing feature_names=X.columns) on a large matplotlib figure.

In [None]:
## Q7.
# Example: apply the model to new data (if available) or simulate changes in existing data.
# Assess sensitivity to variations in data inputs or parameters.
from pathlib import Path

new_data_path = Path('new_data.csv')
if new_data_path.exists():
    new_data = pd.read_csv(new_data_path)
    # Pass exactly the feature columns the model was trained on, in training order.
    # Extra columns in the file are ignored; a missing feature column raises KeyError.
    new_predictions = clf.predict(new_data[list(X.columns)])
else:
    # The new-data file is an example input that may not exist. Skip the step
    # instead of aborting a full notebook run with FileNotFoundError.
    new_data = None
    new_predictions = None
    print(f"'{new_data_path}' not found; skipping predictions on new data.")
# Perform sensitivity analysis or scenario testing based on domain expertise.