In [22]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [23]:
# Load the dataset from the CSV file that sits next to the notebook.
data = pd.read_csv('./diabetes_prediction_dataset - diabetes_prediction_dataset.csv')

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Preview the first few rows of the frame.
data.head()

In [None]:
# Summary of the frame: column dtypes and non-null counts.
data.info()

In [None]:
# List the column labels.
data.columns

In [None]:
# Count rows that are exact duplicates of an earlier row.
print("Number of duplicate rows :", data.duplicated().sum())

In [None]:
# Drop exact duplicate rows, then confirm that none remain.
# `data` is rebound to the de-duplicated frame rather than mutated with
# inplace=True, so the cell reads as "new frame derived from the old one".
data = data.drop_duplicates()
print("Number of duplicate rows :", data.duplicated().sum())

In [None]:
# Count missing values in each column.
data.isnull().sum()

In [None]:
# Inspect the two columns that are label-encoded in the next cell:
# first how many distinct values each holds, then what those values are.
for col in ['gender', 'smoking_history']:
    print(f'Number of unique values in column {col} is : ', data[col].nunique())
    print(f'Unique values in column {col} are : ', data[col].unique())

In [32]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes.
# Every column gets its own fitted encoder, stored in `label_encoders`, so the
# code -> original label mapping stays recoverable via
# `label_encoders[col].classes_` / `.inverse_transform(...)`. Re-fitting one
# shared encoder would discard the mapping of the first column.
# `label` is left bound to the last fitted encoder (smoking_history), exactly
# as it was when a single encoder was reused.
label_encoders = {}
for col in ['gender', 'smoking_history']:
    label = LabelEncoder()
    data[col] = label.fit_transform(data[col])
    label_encoders[col] = label

In [None]:
# Distinct values stored in `gender` after label encoding.
data['gender'].unique()


In [None]:
# Distinct values stored in `smoking_history` after label encoding.
data['smoking_history'].unique()


In [None]:
# Preview the first few encoded `gender` values.
data['gender'].head()


In [None]:
# Preview the first few encoded `smoking_history` values.
data['smoking_history'].head()

Histogram showing the distribution of the diabetes target


In [None]:

# Histogram of the `diabetes` column, drawn through an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(data['diabetes'], bins=10)
ax.set_title("Diagnosed with diabetes.", color='b')
plt.show()

In [None]:
# Re-check the frame's (rows, columns) dimensions at this point in the notebook.
data.shape

line charts


In [None]:
# Mean blood glucose level within each `diabetes` group.
avg_blood_glucose = data.groupby('diabetes')['blood_glucose_level'].mean()

# Draw the group means as a marked line on an explicit Axes.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(
    avg_blood_glucose.index,
    avg_blood_glucose.values,
    marker='o',
    label='Average Blood Glucose Level',
)
ax.set_title('Average Blood Glucose Level by Diabetes', fontsize=14)
ax.set_xlabel('Diabetes', fontsize=12)
ax.set_ylabel('Average Blood Glucose Level', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend()
plt.show()


In [None]:
# Mean BMI within each `diabetes` group.
bmi_mean = data.groupby('diabetes')['bmi'].mean()

# Draw the group means as a marked line on an explicit Axes.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(bmi_mean.index, bmi_mean.values, marker='o', label='Average BMI')
ax.set_title('Average BMI by Diabetes ', fontsize=14)
ax.set_xlabel('Diabetes (0 = No, 1 = Yes)', fontsize=12)
ax.set_ylabel('Average BMI', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.3)
ax.legend()
plt.show()


In [None]:
# Mean of `diabetes` per `smoking_history` value, scaled by 100 to a percentage.
smoking_diabetes_percentage = data.groupby('smoking_history')['diabetes'].mean() * 100

# Draw the percentages as a marked line on an explicit Axes.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(
    smoking_diabetes_percentage.index,
    smoking_diabetes_percentage.values,
    marker='o',
    label='Diabetes Percentage',
)
ax.set_title('Diabetes Percentage by Smoking History ', fontsize=14)
ax.set_xlabel('Smoking History', fontsize=12)
ax.set_ylabel('Diabetes Percentage (%)', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.3)
ax.legend()
plt.show()


In [None]:
# Mean of `heart_disease` per `diabetes` group, scaled by 100 to a percentage.
heart_disease_percentage = data.groupby('diabetes')['heart_disease'].mean() * 100

# Draw the percentages as a marked blue line on an explicit Axes.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(
    heart_disease_percentage.index,
    heart_disease_percentage.values,
    marker='o',
    color='blue',
    label='Heart Disease Percentage',
)
ax.set_title('Heart Disease Percentage by Diabetes (All Data)', fontsize=14)
ax.set_xlabel('Diabetes (0 = No, 1 = Yes)', fontsize=12)
ax.set_ylabel('Heart Disease Percentage (%)', fontsize=12)
ax.grid(True, linestyle='--', alpha=1)
ax.legend()
plt.show()


pie chart


In [None]:


# Keep only the rows where diabetes == 1 and count them per gender code.
diabetes_data = data.loc[data['diabetes'] == 1]
gender_counts = diabetes_data['gender'].value_counts()

# Offset only the first wedge from the centre; every other wedge stays put.
wedge_offsets = [0] * len(gender_counts)
if wedge_offsets:
    wedge_offsets[0] = 0.1

fig, ax = plt.subplots(figsize=(6, 4))
ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['skyblue', 'lightcoral'],
    explode=wedge_offsets,
)
ax.set_title('Gender Distribution (Diabetes = 1)', fontsize=14)
plt.show()



Using the Seaborn library


In [None]:

# Bar per smoking-history value showing blood glucose level, with
# standard-deviation error bars requested through ci='sd'.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of
# `errorbar='sd'` -- confirm the installed version before switching.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=data,
    x='smoking_history',
    y='blood_glucose_level',
    ci='sd',
    palette='viridis',
    ax=ax,
)
ax.set_title('Comparison of Blood Glucose Levels by Smoking History', fontsize=14)
ax.set_xlabel('Smoking History', fontsize=12)
ax.set_ylabel('Average Blood Glucose Level', fontsize=12)
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


Subplots: drawing several plots side by side for comparison.


In [None]:

# One row of three panels: age by diabetes, heart disease by diabetes,
# and the overall blood glucose distribution.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))
age_ax, heart_ax, glucose_ax = axes

# Panel 1: stacked age histogram split by the diabetes flag.
sns.histplot(data=data, x='age', hue='diabetes', multiple='stack', kde=True, palette='Blues', ax=age_ax)
age_ax.set(title='Age Distribution by Diabetes', xlabel='Age', ylabel='Count')

# Panel 2: stacked heart-disease histogram split by the diabetes flag.
sns.histplot(data=data, x='heart_disease', hue='diabetes', multiple='stack', kde=True, palette='Greens', ax=heart_ax)
heart_ax.set(title='Heart Disease by Diabetes', xlabel='Heart Disease', ylabel='Count')

# Panel 3: blood glucose histogram over all rows.
sns.histplot(data['blood_glucose_level'], kde=True, color='orange', ax=glucose_ax)
glucose_ax.set_title('Blood Glucose Level Distribution')

fig.tight_layout()
plt.show()



Bubble plot


In [None]:



# Marker size comes from the BMI column.
bubble_size = data['bmi']

# Age against blood glucose; colour encodes smoking_history, size encodes BMI.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='age',
    y='blood_glucose_level',
    hue='smoking_history',
    size=bubble_size,
    sizes=(20, 200),
    palette='cool',
    ax=ax,
)
ax.set_title('Bubble Plot: Age vs Blood Glucose Level (Bubble Size = BMI)', fontsize=14)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Blood Glucose Level', fontsize=12)
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Smoking History', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(alpha=0.5)
plt.show()


Using Matplotlib to draw a scatter plot with a regression line


In [None]:
from scipy.stats import linregress

# Scatter of blood glucose level against age with a least-squares line on top.
plt.figure(figsize=(10, 6))
plt.scatter(data['age'], data['blood_glucose_level'], color='blue', alpha=0.6, label='Data Points')

# Fit blood_glucose_level = slope * age + intercept.
slope, intercept, r_value, p_value, std_err = linregress(data['age'], data['blood_glucose_level'])
# Per-row fitted values, kept under the original name.
regression_line = slope * data['age'] + intercept

# A straight line is fully determined by two points, so draw it between the
# smallest and largest age instead of threading a path through every row in
# file order. The picture is the same with far fewer segments to render.
age_ends = data['age'].agg(['min', 'max'])
plt.plot(
    age_ends.values,
    (slope * age_ends + intercept).values,
    color='red',
    label=f'Best Fit Line (R²={r_value**2:.2f})',
)

plt.title('Scatter Plot with Regression Line', fontsize=14)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Blood Glucose Level', fontsize=12)
plt.legend()
plt.grid(alpha=0.5)
plt.show()


In [None]:
# List the column labels.
data.columns

Bar chart


In [None]:
# Aggregate before plotting: one bar per distinct age showing the share of
# rows with heart disease. Passing the raw columns to plt.bar draws one bar per
# row, all stacked on top of each other at the same x positions, which is slow
# and hides how common heart disease is at each age.
# Assumes `heart_disease` is a 0/1 flag, as the percentage plot earlier in the
# notebook already does -- TODO confirm.
heart_disease_rate_by_age = data.groupby('age')['heart_disease'].mean() * 100

plt.bar(heart_disease_rate_by_age.index, heart_disease_rate_by_age.values)
plt.xlabel('Age')
plt.ylabel('Heart Disease Rate (%)')
plt.title('Heart Disease Rate by Age')
plt.show()

Machine Learning


In [50]:
# Features: every column except the target. Target: the `diabetes` column.
x = data.drop(columns=["diabetes"])
y = data["diabetes"]

In [51]:
from sklearn.model_selection import train_test_split

# Half of the rows go to the test set. The split is stratified on the target
# and seeded so it is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    stratify=y,
    random_state=0,
)

In [None]:
# Shapes of the training features and the training target.
x_train.shape  , y_train.shape

In [None]:
# Shapes of the test features and the test target.
x_test.shape  , y_test.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Model 1: k-nearest neighbours (k=3).
# KNN classifies by distance between rows, so features measured on large
# scales would dominate the small-range ones. Standardising inside a pipeline
# puts every feature on a comparable scale, and the scaler is fitted on the
# training split only. The pipeline exposes the same fit / predict / score
# methods the following cells call on `knnmodel`.
knnmodel = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knnmodel.fit(x_train, y_train)
print('KNNModel Train Score is :', knnmodel.score(x_train, y_train))
print('KNNModel Test Score is :', knnmodel.score(x_test, y_test))

In [None]:
# Predict the test split with the KNN model, then show the first ten true
# labels next to the first ten predictions.
y_pred1 = knnmodel.predict(x_test)
print('actual Output : ', y_test[:10].tolist())
print('predict Output : ', y_pred1[:10].tolist())

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Confusion matrix for the KNN predictions. With (y_true, y_pred) argument
# order, rows are the actual classes and columns are the predicted classes.
cm = confusion_matrix(y_test , y_pred1)
cm

In [None]:
import seaborn as sns

# Heatmap of the confusion matrix built by confusion_matrix(y_test, y_pred1).
# That call puts actual classes on the rows (y-axis) and predicted classes on
# the columns (x-axis), so the axis labels must follow that orientation.
sns.heatmap(cm, annot=True, annot_kws={"size": 12}, fmt="g", cmap="YlGnBu")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Text report of classification metrics for the KNN predictions (y_pred1).
print ( classification_report (y_test , y_pred1))

### Naive Bayes


In [None]:
# Model 2: Gaussian Naive Bayes, fitted on the training split and scored
# on both the training and the test split.
from sklearn.naive_bayes import GaussianNB

NBModel = GaussianNB()
NBModel.fit(x_train, y_train)

nb_train_score = NBModel.score(x_train, y_train)
nb_test_score = NBModel.score(x_test, y_test)
print('NBModel Train Score is : ', nb_train_score)
print('NBModel Test Score is : ', nb_test_score)

In [None]:
# Predict the test split with the Naive Bayes model, then show the first ten
# true labels next to the first ten predictions.
y_pred2 = NBModel.predict(x_test)
print('actual Output : ', y_test[:10].tolist())
print('predict Output : ', y_pred2[:10].tolist())

In [None]:
from sklearn.metrics import classification_report , confusion_matrix
# Confusion matrix for the Naive Bayes predictions. With (y_true, y_pred)
# argument order, rows are the actual classes and columns are the predicted classes.
cm = confusion_matrix(y_test , y_pred2)

In [None]:
import seaborn as sns

# Heatmap of the confusion matrix built by confusion_matrix(y_test, y_pred2).
# That call puts actual classes on the rows (y-axis) and predicted classes on
# the columns (x-axis), so the axis labels must follow that orientation.
sns.heatmap(cm, annot=True, annot_kws={"size": 12}, fmt="g", cmap="YlGnBu")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Text report of classification metrics for the Naive Bayes predictions (y_pred2).
print ( classification_report (y_test , y_pred2))

### Decision Tree


In [None]:
# Model 3: decision tree with the Gini impurity criterion.
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so the fitted tree and the scores below are the same on
# every run. It uses the same seed value as the train/test split.
DTModel = DecisionTreeClassifier(criterion='gini', random_state=0)
DTModel.fit(x_train, y_train)
print('DTModel Train Score is : ', DTModel.score(x_train, y_train))
print('DTModel Test Score is : ', DTModel.score(x_test, y_test))

In [None]:
# Predict the test split with the decision tree, then show the first five
# true labels next to the first five predictions.
y_pred = DTModel.predict(x_test)
print('actual Output  : ', y_test[:5].tolist())
print('predict output : ', y_pred[:5].tolist())

In [None]:
from sklearn.metrics import classification_report ,confusion_matrix
# Confusion matrix for the decision-tree predictions. With (y_true, y_pred)
# argument order, rows are the actual classes and columns are the predicted classes.
cm=confusion_matrix(y_test,y_pred)
cm

In [None]:
import seaborn as sns

# Heatmap of the confusion matrix built by confusion_matrix(y_test, y_pred).
# Rows (y-axis) are the actual classes and columns (x-axis) the predicted
# ones. The axes are labelled so the figure can be read on its own.
sns.heatmap(cm, annot=True, annot_kws={"size": 12}, fmt="g", cmap="crest")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Text report of classification metrics for the decision-tree predictions (y_pred).
print(classification_report(y_test,y_pred))