In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, accuracy_score,
    precision_score, recall_score, f1_score, roc_curve
)
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv('diabetes.csv')

# Replace zero values in key columns with NaN so they can be imputed later.
# NOTE: `df` and `X` keep these NaN markers; imputed values live only in the
# train/test partitions built below.
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

# Split features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Train-test split comes FIRST. Everything fitted afterwards (imputation
# means, scaling statistics, SMOTE neighbours, PCA axes) must be learned from
# the training rows only; otherwise information from the test rows leaks into
# training and the reported metrics are optimistic. Stratifying keeps the
# original class ratio in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# Work on explicit copies so the column assignments below never write
# through to (or warn about) the parent frame.
X_train_raw = X_train_raw.copy()
X_test_raw = X_test_raw.copy()

# Apply mean imputation: means are computed on the training rows and then
# reused unchanged for the test rows.
imputer = SimpleImputer(strategy='mean')
X_train_raw[cols_with_zero] = imputer.fit_transform(X_train_raw[cols_with_zero])
X_test_raw[cols_with_zero] = imputer.transform(X_test_raw[cols_with_zero])

# Normalize data with statistics taken from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Address class imbalance using SMOTE on the training partition only, so the
# test set contains no synthetic samples.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Apply PCA (axes fitted on the training data, then applied to the test data)
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Logistic Regression Model (L1-penalised, liblinear solver).
# random_state is pinned so repeated runs give the same coefficients.
log_reg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

# Decision Tree Model.
# random_state is pinned so the fitted tree (and the plot below) is the same
# on every run.
dt = DecisionTreeClassifier(max_depth=5, min_samples_split=5, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Visualizing Decision Tree
plt.figure(figsize=(12, 8))
plot_tree(
    dt,
    filled=True,
    # Pass a plain list: some scikit-learn releases reject a pandas Index
    # for `feature_names` during parameter validation.
    feature_names=list(X.columns),
    class_names=['Non-diabetic', 'Diabetic'],
    rounded=True,
)
plt.title('Decision Tree Visualization')
plt.show()

# Neural Network Model: two hidden ReLU layers and a sigmoid output unit
# for binary classification.
model = Sequential([
    Dense(16, input_dim=X_train.shape[1], activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The test partition is passed as validation data for monitoring only; keep
# the returned history so the per-epoch metrics are not thrown away.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=10,
    validation_data=(X_test, y_test),
    verbose=0,
)

# Voting Classifier: soft voting over the logistic regression and the tree.
voting_clf = VotingClassifier(estimators=[('log_reg', log_reg), ('dt', dt)], voting='soft')
voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)

# Model Evaluation
# Logistic regression and the decision tree share one report layout:
# header, four label-based scores, ROC-AUC, then the confusion matrix.
for header, fitted_model, predicted in (
    ("\nLogistic Regression Metrics:", log_reg, y_pred_log_reg),
    ("\nDecision Tree Metrics:", dt, y_pred_dt),
):
    print(header)
    accuracy = accuracy_score(y_test, predicted)
    print(f"Accuracy: {accuracy}")
    precision = precision_score(y_test, predicted)
    print(f"Precision: {precision}")
    recall = recall_score(y_test, predicted)
    print(f"Recall: {recall}")
    f1 = f1_score(y_test, predicted)
    print(f"F1 Score: {f1}")
    # ROC-AUC is scored on the second predict_proba column rather than on
    # the hard label predictions.
    second_column_proba = fitted_model.predict_proba(X_test)[:, 1]
    print(f"ROC-AUC: {roc_auc_score(y_test, second_column_proba)}")
    print(confusion_matrix(y_test, predicted))

# The ensemble gets a shorter summary plus the full classification report.
print("\nVoting Classifier Metrics:")
voting_accuracy = accuracy_score(y_test, y_pred_voting)
print(f"Accuracy: {voting_accuracy}")
voting_proba = voting_clf.predict_proba(X_test)[:, 1]
print(f"ROC-AUC: {roc_auc_score(y_test, voting_proba)}")
print(classification_report(y_test, y_pred_voting))


ModuleNotFoundError: No module named 'tensorflow'