# Explore here

In [None]:
# Diabetes dataset: exploration, cleaning, and random forest classification
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the diabetes dataset directly from the tutorial repository.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv'
diabetes_data = pd.read_csv(DATA_URL)

# First look at the data: sample rows, schema/dtypes, summary statistics.
print(diabetes_data.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
diabetes_data.info()
print(diabetes_data.describe())

# Remove exact duplicate rows, but only rebuild the frame when some exist
has_duplicates = diabetes_data.duplicated().any()
if has_duplicates:
    diabetes_data = diabetes_data.drop_duplicates()

# Zeros in these columns are treated as invalid readings: convert them to NaN
# and then fill every gap with that column's median.
invalid_zero_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zeros_as_missing = diabetes_data[invalid_zero_columns].replace(0, np.nan)
column_medians = zeros_as_missing.median()
diabetes_data[invalid_zero_columns] = zeros_as_missing.fillna(column_medians)

# Overwrite the single extreme insulin reading (846) with the column median,
# then print the affected row(s) as they look after the replacement.
highest_insulin = diabetes_data[diabetes_data['Insulin'] == 846]
insulin_median = diabetes_data["Insulin"].median()
outlier_rows = highest_insulin.index
diabetes_data.loc[outlier_rows, 'Insulin'] = insulin_median
print(f'🔹 Highest insulin level patient: {diabetes_data.loc[outlier_rows]}')

# Report the number of missing values per column, then the summary statistics
missing_per_column = diabetes_data.isnull().sum()
print(missing_per_column)
summary_statistics = diabetes_data.describe()
print(summary_statistics)

# Univariate analysis
# One row per feature: histogram with KDE on the left, boxplot on the right.
# Each entry is (column name, subject used in both titles, histogram x-label).
univariate_specs = [
    ('Pregnancies', 'Pregnancies', 'Number of Pregnancies'),
    ('Glucose', 'Glucose Levels', 'Glucose Level'),
    ('BloodPressure', 'Blood Pressure', 'Blood Pressure'),
    ('SkinThickness', 'Skin Thickness', 'Skin Thickness'),
    ('Insulin', 'Insulin Levels', 'Insulin Level'),
    ('BMI', 'BMI', 'BMI'),
    ('DiabetesPedigreeFunction', 'Diabetes Pedigree Function', 'Diabetes Pedigree Function'),
    ('Age', 'Age', 'Age'),
]

fig, axis = plt.subplots(len(univariate_specs), 2, figsize=(14, 40))
for row, (column, subject, x_label) in enumerate(univariate_specs):
    hist_ax, box_ax = axis[row, 0], axis[row, 1]

    sns.histplot(ax=hist_ax, data=diabetes_data, x=column, kde=True)
    hist_ax.set_title(f"Distribution of {subject}")
    hist_ax.set_xlabel(x_label)
    hist_ax.set_ylabel("Count")

    sns.boxplot(ax=box_ax, data=diabetes_data, x=column)
    box_ax.set_title(f"Boxplot of {subject}")

# Keep the titles and x-labels of vertically stacked axes from overlapping.
fig.tight_layout()

# Class balance of the target variable, drawn on its own figure
outcome_fig, outcome_ax = plt.subplots()
sns.countplot(data=diabetes_data, x='Outcome', ax=outcome_ax)
outcome_ax.set_title("Diabetes Outcome Count")
outcome_ax.set_xlabel("Outcome (0 = No Diabetes, 1 = Diabetes)")
outcome_ax.set_ylabel("Count")

# Multivariate analysis
# sns.pairplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure() call would only leave an empty extra figure behind.

# Pairwise relationships between all columns, KDE on the diagonal.
sns.pairplot(diabetes_data, diag_kind="kde")
plt.show()

# Same grid, with points split by Outcome (circle vs. square markers).
sns.pairplot(diabetes_data, hue="Outcome", diag_kind="kde", markers=["o", "s"])
plt.show()

# Annotated correlation matrix of every column, including Outcome
correlation_matrix = diabetes_data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5, ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

# Compare each feature's distribution between the two outcome classes
features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']

# 3 x 2 grid, filled row by row in the order the features are listed
by_outcome_fig, by_outcome_axes = plt.subplots(3, 2, figsize=(14, 12))
for panel, feature in zip(by_outcome_axes.flat, features):
    sns.boxplot(x="Outcome", y=feature, data=diabetes_data, ax=panel)
    panel.set_title(f"{feature} Distribution by Outcome")

plt.tight_layout()
plt.show()

# Pairwise view restricted to four features, split by Outcome.
# pairplot builds its own figure, so no plt.figure() call precedes it (one
# would only produce an additional empty figure).
sns.pairplot(diabetes_data, vars=['Glucose', 'BMI', 'Age', 'Insulin'], hue="Outcome", markers=["o", "s"])

plt.show()

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Target is the Outcome column; every other column is a predictor
y = diabetes_data['Outcome']
X = diabetes_data.drop(columns=['Outcome'])

# Hold out 20% of the rows for evaluation (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=117
)

# Baseline random forest with default hyperparameters
model = RandomForestClassifier(random_state=117)
model.fit(X_train, y_train)

# Score the baseline on the held-out rows
y_pred = model.predict(X_test)
model_accuracy = accuracy_score(y_test, y_pred)

print(f'Model accuracy: {model_accuracy}')

from sklearn import tree

# Draw the first two fitted trees of the forest, one per subplot row
fig, axis = plt.subplots(2, 1, figsize = (15, 15))

tree_feature_names = list(X_train.columns)
for tree_ax, fitted_tree in zip(axis, model.estimators_[:2]):
    tree.plot_tree(fitted_tree, ax = tree_ax, feature_names = tree_feature_names, filled = True)

plt.show()

from pickle import dump

# Persist the fitted default model. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open("random_forest_classifier_default_117.sav", "wb") as model_file:
    dump(model, model_file)

from sklearn.model_selection import GridSearchCV

# Search space for tuning the random forest with GridSearchCV.
# 7 * 5 * 4 * 4 * 3 = 1680 parameter combinations; with cv=5 (as in the
# disabled code below) that amounts to 8400 model fits.
hyperparams = {
    'n_estimators': [15,20,25,50,100,150,200,],
    'max_depth': [None, 5, 10, 15, 20], 
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10], 
    'max_features': [None, 'sqrt', 'log2'],
    'random_state': [117]  # single value: keeps every candidate on the same seed
}
# NOTE: everything between the triple quotes below is a bare string literal,
# not live code. The grid search and the "optimized" model never run, and
# `grid`, `opt_model`, `opt_pred` and `opt_accuracy` are never defined.
# NOTE(review) before re-enabling it:
#   - `warnings.warn = warn` replaces the function for the whole process, so
#     every warning raised through it is silenced, not just the ones from fit().
#   - The recorded "best" parameters list 'criterion' and 'splitter', which
#     looks like output from a decision-tree search rather than this grid;
#     confirm where it came from.
#   - opt_model uses min_samples_leaf=15 and min_samples_split=4, neither of
#     which appears in the search space above.
"""
grid = GridSearchCV(model, hyperparams, scoring='accuracy', cv=5)

# Suppress warnings due to incopatibilities or converges
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

grid.fit(X_train, y_train)
print(f'🔹 Best hyperparameters: {grid.best_params_}')

# 'class_weight': None, 'criterion': 'entropy', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2, 'random_state': 117, 'splitter': 'best'


opt_model = RandomForestClassifier(max_depth= None, max_features= None, min_samples_leaf= 15, min_samples_split= 4, random_state= 117)

opt_model.fit(X_train, y_train)
opt_pred = opt_model.predict(X_test)

opt_accuracy = accuracy_score(y_test, opt_pred)
print(f'Optimized model accuracy: {opt_accuracy}')

"""

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768