In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Load the dataset (using a more flexible CSV reading approach)
try:
    data = pd.read_csv("pima-indians-diabetes.csv", header=None, on_bad_lines='skip')
    # If the dataset does not have headers, you can manually assign column names
    data.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 
                    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")

# Step 2: Data Pre-processing
# Check for missing values
print("Missing values in the dataset:")
print(data.isnull().sum())

# Handle missing values by filling them with the mean (this is one way; you can adjust it based on your needs)
data.fillna(data.mean(), inplace=True)

# Step 3: Split the data into features (X) and target (y)
X = data.drop('Outcome', axis=1)  # Features
y = data['Outcome']  # Target variable (Outcome: 0 or 1)

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train a Naive Bayes classifier (GaussianNB)
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Step 6: Evaluate the model's performance

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 7: Visualize the confusion matrix
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['No Diabetes', 'Diabetes'], yticklabels=['No Diabetes', 'Diabetes'])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


Error loading dataset: Length mismatch: Expected axis has 1 elements, new values have 9 elements
Missing values in the dataset:
0    0
dtype: int64


TypeError: Could not convert ['# 1. Number of times pregnant# 2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test# 3. Diastolic blood pressure (mm Hg)# 4. Triceps skin fold thickness (mm)# 5. 2-Hour serum insulin (mu U/ml)# 6. Body mass index (weight in kg/(height in m)^2)# 7. Diabetes pedigree function# 8. Age (years)# 9. Class variable (0 or 1)'] to numeric