In [1]:
# Testing using Logistic Regression

In [2]:
#Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn import tree

In [10]:
# Read the dataset

df = pd.read_csv('/Users/ashleysmacbook/Desktop/Project-4/diabetes_binary_health_indicators_BRFSS2015.csv')

# Display the first few rows of the dataset
print(df.head())

# Explore basic statistics and information about the dataset
print(df.describe())
print(df.info())


FileNotFoundError: [Errno 2] No such file or directory: '/Users/ashleysmacbook/Desktop/Project-4/diabetes_binary_health_indicators_BRFSS2015.csv'

In [None]:
# Drop any missing values
df = df.dropna()

# Split the dataset into features (X) and target variable (y)
X = df.drop('Diabetes_binary', axis=1)  
y = df['Diabetes_binary']


In [None]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Standardize using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
#Logistic regression model 
model = LogisticRegression(random_state=42)

#train the model
model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = model.predict(X_test_scaled)


In [None]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred)
print('Classification Report:\n', class_report)

# ROC-AUC
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1])
print(f'ROC-AUC Score: {roc_auc}')

# Mean squared error (MSE) and R-squared (R2)
from sklearn.metrics import mean_squared_error, r2_score

# Predictions on the test set
y_pred = model.predict(X_test_scaled)

# Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# R-squared (R2)
r2 = r2_score(y_test, y_pred)
print(f'R-squared (R2): {r2}')


In [None]:
from sklearn.metrics import roc_curve

# Calculate ROC curve
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, model.predict_proba(X_test_scaled)[:, 1])

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_lr, tpr_lr, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Linear Regression)')
plt.legend()
plt.show()



In [None]:
# # Visualize the confusion matrix as a heatmap using seaborn
import seaborn as sns
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix Heatmap')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()



In [None]:
# Create a DataFrame to organize the coefficients of the logistic regression model
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_[0]})

# Sort the DataFrame based on coefficient values in descending order
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

# Visualize feature importance by plotting coefficients as a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=coef_df, palette='viridis')
plt.title('Feature Importance - Logistic Regression Coefficients')
plt.show()




In [None]:
#Testing another model using the Decision Tree

In [None]:
# Instantiate the Decision Tree classifier
decision_tree = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Train the model
decision_tree.fit(X_train_scaled, y_train)

In [None]:
# Make predictions on the test set
y_pred_dt = decision_tree.predict(X_test_scaled)

In [None]:
# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {accuracy_dt}')

In [None]:
# Confusion Matrix
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
print('Confusion Matrix (Decision Tree):\n', conf_matrix_dt)

In [None]:
# Classification Report
class_report_dt = classification_report(y_test, y_pred_dt)
print('Classification Report (Decision Tree):\n', class_report_dt)

In [None]:
# ROC-AUC Score
roc_auc_dt = roc_auc_score(y_test, decision_tree.predict_proba(X_test_scaled)[:, 1])
print(f'Decision Tree ROC-AUC Score: {roc_auc_dt}')

In [None]:

# Calculate ROC curve
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, decision_tree.predict_proba(X_test_scaled)[:, 1])

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_dt, tpr_dt, label=f'AUC = {roc_auc_dt:.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Decision Tree)')
plt.legend()
plt.show()



In [None]:
# Calculate the confusion matrix
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_dt, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix (Decision Tree)')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()



In [4]:
# Create a DataFrame to organize feature importance of the decision tree model
feat_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': decision_tree.feature_importances_})

# Sort the DataFrame based on feature importance values in descending order
feat_importance_df = feat_importance_df.sort_values(by='Importance', ascending=False)

# Visualize feature importance by plotting importance as a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feat_importance_df, palette='viridis')
plt.title('Feature Importance - Decision Tree')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()



NameError: name 'X' is not defined

In [5]:
import matplotlib.pyplot as plt

# Plot decision tree
plt.figure(figsize=(25, 20))
_ = tree.plot_tree(decision_tree, 
                   feature_names=X.columns,  
                   class_names=["0", "1"], 
                   filled=True)
plt.show()


NameError: name 'decision_tree' is not defined

<Figure size 2500x2000 with 0 Axes>

In [6]:
# Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate the Random Forest classifier
random_forest = RandomForestClassifier(random_state=42)

# Train the model
random_forest.fit(X_train_scaled, y_train)

# Make predictions
y_pred_rf = random_forest.predict(X_test_scaled)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf}')

conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
print('Confusion Matrix (Random Forest):\n', conf_matrix_rf)

class_report_rf = classification_report(y_test, y_pred_rf)
print('Classification Report (Random Forest):\n', class_report_rf)

roc_auc_rf = roc_auc_score(y_test, random_forest.predict_proba(X_test_scaled)[:, 1])
print(f'Random Forest ROC-AUC Score: {roc_auc_rf}')


NameError: name 'X_train_scaled' is not defined

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Assuming X_train and y_train are your training features and target
# Train a Random Forest classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Retrieve feature importances from the trained model
importances = rf_model.feature_importances_

# Create a DataFrame containing feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})

# Sort the features based on their importances
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Visualize feature importances using a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Variable Importance Plot - Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


NameError: name 'X_train' is not defined

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image

# Assuming X_train and y_train are your training features and target
# Train a Random Forest classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Extract one of the decision trees from the Random Forest model
tree_index = 0  # Change this index to extract a different tree from the Random Forest
tree_estimator = rf_model.estimators_[tree_index]

# Export the decision tree as a DOT file
dot_data = export_graphviz(tree_estimator, out_file=None, feature_names=X_train.columns, class_names=['0', '1'], filled=True)

# Create a graph from the DOT data
graph = pydotplus.graph_from_dot_data(dot_data)

# Display the decision tree graph
Image(graph.create_png())


NameError: name 'X_train' is not defined

In [9]:
import numpy as np
import matplotlib.pyplot as plt

# Assuming X_train and y_train are your training features and target
# Train a Random Forest classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Get feature importances from the trained model
feature_importances = rf_model.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(feature_importances)[::-1]

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.title("Feature Importance - Random Forest")
plt.bar(range(X_train.shape[1]), feature_importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.show()


NameError: name 'X_train' is not defined