In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
# cross_val_score is imported for optional cross-validation
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [None]:
# Exploratory data analysis: preview the first five rows of the frame.
print("First 5 rows of the dataset:\n", df.head(n=5))

In [None]:
# Show the last ten rows of the frame.
df.tail(n=10)


In [None]:
# Inspect ten randomly chosen rows. The fixed seed keeps the displayed sample
# identical on every run (same seed value the train/test split uses).
df.sample(10, random_state=42)


In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Concise summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics as computed by DataFrame.describe().
summary_stats = df.describe()
print("\nDataset statistics:\n", summary_stats)

In [None]:
# Data Cleaning
# drop_duplicates() returns a NEW frame; without the assignment the cleaned
# result is thrown away and `df` still contains any duplicate rows.
df = df.drop_duplicates()
# Show the resulting dimensions so the effect of de-duplication is visible.
df.shape

In [None]:
# Number of missing (null) entries in each column.
df.isnull().sum(axis=0)

In [None]:
# NOTE(review): DataFrame.isna() and DataFrame.isnull() are aliases in pandas, so this
# reports the same per-column missing-value counts as the isnull() cell above.
df.isna().sum()

In [None]:
# Count the rows whose Insulin reading is recorded as exactly zero.
insulin_is_zero = df['Insulin'].eq(0)
print("No of zero  values in Insulin", len(df.loc[insulin_is_zero]))

In [None]:
# Count the rows whose Pregnancies value is exactly zero.
pregnancies_is_zero = df['Pregnancies'].eq(0)
print("No of zero  values in Pregnancies", len(df.loc[pregnancies_is_zero]))

In [None]:
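# Replace zero values with mean (currently disabled)
# NOTE(review): the line below targets 'EstimatedSalary', a column not referenced anywhere
# else in this notebook; it looks left over from another dataset. Confirm the intended
# column(s) before enabling it.
# df['EstimatedSalary'].replace(0,df['EstimatedSalary'].mean(),inplace=True)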

In [None]:
# Split the frame into the label column and the feature matrix.
target_name = 'Outcome'
# Features: every column except the label (no other column is removed here).
data = df.drop(target_name, axis=1)
# Label: the 'Outcome' column on its own.
target = df[target_name]

In [None]:
# Scale selected numerical features
scaler = StandardScaler()
data[['Glucose', 'Insulin', 'BMI', 'Age']] = scaler.fit_transform(data[['Glucose', 'Insulin', 'BMI', 'Age']])


In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
nb = GaussianNB().fit(X_train, y_train)
nb

In [None]:
# Predict labels for the held-out rows and show the shape of the prediction array.
nb_pred = nb.predict(X_test)
np.shape(nb_pred)

In [None]:
# Model evaluation: mean accuracy of the fitted classifier on each split.
train_accuracy = nb.score(X_train, y_train)
test_accuracy = nb.score(X_test, y_test)

print("Model Training Complete")
print("Train set accuracy:", train_accuracy)
print("Test set accuracy:", test_accuracy)

In [None]:
# Compare the test-set predictions with the true labels.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=nb_pred)
report = classification_report(y_true=y_test, y_pred=nb_pred)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)
