In [None]:
''' Q1. Import the Dataset and Examine the Variables'''
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ---- Load ----------------------------------------------------------------
# Read the CSV (path is relative to the notebook's working directory).
df = pd.read_csv('diabetes.csv')

# ---- Textual overview ----------------------------------------------------
# In order: first rows, summary statistics, NaN count per column.
# NOTE(review): isnull() only counts NaN. If this dataset encodes missing
# measurements as 0 (common in diabetes datasets), they will not show up
# here -- confirm against the describe() minimums.
for summary in (df.head(), df.describe(), df.isnull().sum()):
    print(summary)

# ---- Visualizations ------------------------------------------------------
# Each plot gets its own explicit figure/axes instead of relying on the
# implicit "current axes" of the pyplot state machine.

# Distribution of Glucose with a KDE overlay
fig, ax = plt.subplots()
sns.histplot(df['Glucose'], kde=True, ax=ax)
plt.show()

# BMI spread for each Outcome class
fig, ax = plt.subplots()
sns.boxplot(data=df, x='Outcome', y='BMI', ax=ax)
plt.show()

# ---- Correlation matrix --------------------------------------------------
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()


In [None]:
"Q2. Preprocess the Data"
# Preprocessing: impute missing feature values, then drop IQR outliers.
#
# 'Outcome' is the prediction target (it is separated out as `y` later in the
# notebook), so it is deliberately excluded from both steps below.
#
# NOTE: `df` is reassigned from itself, so re-running this cell filters the
# already-filtered data again. Re-run from the load cell for a clean result.
TARGET_COL = 'Outcome'

# A row without a label cannot be used for supervised training; drop it
# instead of inventing a label from the column median.
df = df.dropna(subset=[TARGET_COL])
feature_cols = [col for col in df.columns if col != TARGET_COL]

# Handling missing values
# fillna() with a Series fills column-by-column, so passing only the feature
# medians leaves the target column untouched.
# NOTE(review): this only fills NaN; if missing readings are stored as 0 in
# this dataset they are not imputed here -- confirm.
df = df.fillna(df[feature_cols].median())  # or use more sophisticated imputation methods

# Detecting outliers using IQR (feature columns only).
# Including a binary target in this rule is dangerous: when one class makes up
# less than 25% of the rows, Q1 == Q3, the IQR is 0, and every row of the
# minority class is flagged as an "outlier" and removed.
features = df[feature_cols]
Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is dropped if ANY of its feature values falls outside the fences.
is_outlier = ((features < lower_bound) | (features > upper_bound)).any(axis=1)
df = df[~is_outlier]

# No categorical variables, so no need for one-hot encoding


In [None]:
"Q3. Split the Dataset into a Training Set and a Test Set"
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the target column.
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Hold out 30% of the rows for testing. random_state fixes the shuffle so the
# split is reproducible. stratify=y keeps the class ratio of 'Outcome' the
# same in the train and test sets; without it a random split of a
# modest-sized, imbalanced dataset can leave the two sets with noticeably
# different class proportions, which distorts the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [None]:
"Q4. Train a Decision Tree Model"
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Base estimator; the fixed seed makes tree construction repeatable.
dt = DecisionTreeClassifier(random_state=42)

# Search space: 4 depths x 3 split thresholds x 3 leaf sizes = 36 combinations.
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training split only.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and keep the corresponding estimator.
print(f'Best Parameters: {grid_search.best_params_}')
best_model = grid_search.best_estimator_
