In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler




# Read the BRFSS 2015 diabetes health-indicators table from disk.
data = pd.read_csv("dataset/diabetes_012_health_indicators_BRFSS2015.csv")

# The label lives in "Diabetes_012"; every remaining column is a feature.
y = data["Diabetes_012"]
X = data.drop("Diabetes_012", axis=1)

In [2]:
# Display the (rows, columns) tuple of the loaded frame as the cell output.
data.shape

(253680, 22)

In [7]:
# Target vector, taken straight from the loaded frame.
y = data['Diabetes_012']

# Feature matrix: remove the target plus the three predictors that this
# experiment leaves out (Education, Income, DiffWalk).
X = data.drop(['Diabetes_012', 'Education', 'Income', 'DiffWalk'], axis=1)


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit both tree models on the raw (unscaled) training features.
# Each estimator carries its own seed, so the fitting order does not matter.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows with each fitted model.
rf_y_pred = rf_classifier.predict(X_test)
dt_y_pred = dt_classifier.predict(X_test)

In [9]:
# Print accuracy and the per-class precision/recall/F1 table for each model.
# The leading "\n" on the second title separates the two reports in the output.
for model_title, model_pred in (
    ("Random Forest Classifier:", rf_y_pred),
    ("\nDecision Tree Classifier:", dt_y_pred),
):
    print(model_title)
    print("Accuracy:", accuracy_score(y_test, model_pred))
    print("Classification Report:\n", classification_report(y_test, model_pred))

Random Forest Classifier:
Accuracy: 0.8412172816146326
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.97      0.91     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.47      0.20      0.28      6997

    accuracy                           0.84     50736
   macro avg       0.44      0.39      0.40     50736
weighted avg       0.79      0.84      0.81     50736


Decision Tree Classifier:
Accuracy: 0.7673446862188584
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.86      0.87     42795
         1.0       0.04      0.05      0.04       944
         2.0       0.29      0.33      0.31      6997

    accuracy                           0.77     50736
   macro avg       0.40      0.41      0.41     50736
weighted avg       0.78      0.77      0.77     50736



In [4]:
# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the per-feature mean/std on the training rows only, then apply that
# same transform to both partitions (tree models do not require this step).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest on the standardized training matrix.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Decision tree on the same matrix; the bare fit() call stays last so the
# cell still displays the fitted estimator.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train_scaled, y_train)

In [6]:
# Score the held-out (scaled) rows with both fitted models.
rf_y_pred = rf_classifier.predict(X_test_scaled)
dt_y_pred = dt_classifier.predict(X_test_scaled)

# Print accuracy and the per-class report for each model in turn.
for model_title, model_pred in (
    ("Random Forest Classifier:", rf_y_pred),
    ("\nDecision Tree Classifier:", dt_y_pred),
):
    print(model_title)
    print("Accuracy:", accuracy_score(y_test, model_pred))
    print("Classification Report:\n", classification_report(y_test, model_pred))

Random Forest Classifier:
Accuracy: 0.8410990223904131
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.96      0.91     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.47      0.20      0.28      6997

    accuracy                           0.84     50736
   macro avg       0.44      0.39      0.40     50736
weighted avg       0.79      0.84      0.81     50736


Decision Tree Classifier:
Accuracy: 0.7678768527278461
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.86      0.87     42795
         1.0       0.04      0.05      0.04       944
         2.0       0.29      0.33      0.31      6997

    accuracy                           0.77     50736
   macro avg       0.40      0.41      0.41     50736
weighted avg       0.78      0.77      0.77     50736



In [10]:
from sklearn.neighbors import KNeighborsClassifier

# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5-nearest-neighbours on the raw features (no scaling step in this cell,
# so neighbour distances are computed on the original feature values).
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows.
knn_y_pred = knn_classifier.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
print("K-Nearest Neighbors Classifier:")
print("Accuracy:", accuracy_score(y_test, knn_y_pred))
print("Classification Report:\n", classification_report(y_test, knn_y_pred))


K-Nearest Neighbors Classifier:
Accuracy: 0.8333530432040366
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.96      0.91     42795
         1.0       0.02      0.00      0.00       944
         2.0       0.41      0.20      0.27      6997

    accuracy                           0.83     50736
   macro avg       0.43      0.38      0.39     50736
weighted avg       0.79      0.83      0.80     50736



In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the standardizer on the training rows only and reuse it for the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the three classifiers on the standardized features.
# KNN uses the library defaults; both tree models are seeded.
knn_classifier = KNeighborsClassifier().fit(X_train_scaled, y_train)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out rows with each model.
knn_y_pred = knn_classifier.predict(X_test_scaled)
rf_y_pred = rf_classifier.predict(X_test_scaled)
dt_y_pred = dt_classifier.predict(X_test_scaled)

# Print accuracy and the per-class report for each model, in the order
# KNN, random forest, decision tree.
for model_title, model_pred in (
    ("K-Nearest Neighbors Classifier:", knn_y_pred),
    ("\nRandom Forest Classifier:", rf_y_pred),
    ("\nDecision Tree Classifier:", dt_y_pred),
):
    print(model_title)
    print("Accuracy:", accuracy_score(y_test, model_pred))
    print("Classification Report:\n", classification_report(y_test, model_pred))


K-Nearest Neighbors Classifier:
Accuracy: 0.8302980132450332
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     42795
         1.0       0.03      0.00      0.00       944
         2.0       0.40      0.21      0.27      6997

    accuracy                           0.83     50736
   macro avg       0.43      0.39      0.39     50736
weighted avg       0.78      0.83      0.80     50736


Random Forest Classifier:
Accuracy: 0.8410990223904131
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.96      0.91     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.47      0.20      0.28      6997

    accuracy                           0.84     50736
   macro avg       0.44      0.39      0.40     50736
weighted avg       0.79      0.84      0.81     50736


Decision Tree Classifier:
Accuracy: 0.7678768527278461
Classification Re

In [17]:
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# RobustScaler is fitted on the training rows only, then applied to both partitions.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train both tree models on the robust-scaled features.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out rows.
rf_y_pred = rf_classifier.predict(X_test_scaled)
dt_y_pred = dt_classifier.predict(X_test_scaled)

# Print accuracy and the per-class report for each model.
for model_title, model_pred in (
    ("Random Forest Classifier:", rf_y_pred),
    ("\nDecision Tree Classifier:", dt_y_pred),
):
    print(model_title)
    print("Accuracy:", accuracy_score(y_test, model_pred))
    print("Classification Report:\n", classification_report(y_test, model_pred))


Random Forest Classifier:
Accuracy: 0.8412369914853358
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.97      0.91     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.47      0.20      0.28      6997

    accuracy                           0.84     50736
   macro avg       0.44      0.39      0.40     50736
weighted avg       0.79      0.84      0.81     50736


Decision Tree Classifier:
Accuracy: 0.7674432355723746
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.86      0.87     42795
         1.0       0.04      0.05      0.04       944
         2.0       0.29      0.33      0.31      6997

    accuracy                           0.77     50736
   macro avg       0.40      0.41      0.41     50736
weighted avg       0.78      0.77      0.77     50736



In [3]:
# Echo the (rows, columns) tuple of the loaded frame.
print(f"Shape of the dataset: {data.shape}")

Shape of the dataset: (253680, 22)


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Reload the dataset so this cell does not depend on earlier feature selection;
# this experiment keeps every predictor column.
data = pd.read_csv("dataset/diabetes_012_health_indicators_BRFSS2015.csv")

# Separate features and target variable
X = data.drop(columns=["Diabetes_012"])
y = data["Diabetes_012"]

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both partitions
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3 x 3 x 3 = 27 candidate configurations, each evaluated with 5-fold CV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Seeded base estimator; GridSearchCV clones it for every candidate
rf_classifier = RandomForestClassifier(random_state=42)

# NOTE(review): plain accuracy is dominated by class 0.0 (about 84% of the
# held-out rows per the reports above); consider a macro-averaged scorer if
# minority-class recall matters.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True). That estimator is a clone of rf_classifier (same seed)
# with best_params applied, so training a second identical forest is redundant.
best_rf_classifier = grid_search.best_estimator_

# Predictions for the best Random Forest Classifier
best_rf_y_pred = best_rf_classifier.predict(X_test_scaled)

# Evaluate the best Random Forest Classifier.
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value the default produces) without emitting UndefinedMetricWarning.
print("Best Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, best_rf_y_pred))
print("Classification Report:\n", classification_report(y_test, best_rf_y_pred, zero_division=0))


Best Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 300}
Best Random Forest Classifier:
Accuracy: 0.8500473036896878
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.56      0.16      0.25      6997

    accuracy                           0.85     50736
   macro avg       0.47      0.38      0.39     50736
weighted avg       0.80      0.85      0.81     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
from xgboost import XGBClassifier


# NOTE(review): this cell reads X_train_scaled, X_test_scaled, y_train and
# y_test from whichever preprocessing cell ran last in the kernel; confirm the
# intended feature set before comparing its scores with other cells.

# Seeded base estimator; GridSearchCV clones it for every candidate
xgb_classifier = XGBClassifier(random_state=42)

# 3 x 3 x 3 = 27 candidate configurations, each evaluated with 5-fold CV
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Perform GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(xgb_classifier, param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train_scaled, y_train)

# Get the best parameters for XGBoost
best_params_xgb = grid_search_xgb.best_params_
print("Best Parameters for XGBoost:", best_params_xgb)

# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True). That estimator is a clone of xgb_classifier (same seed)
# with best_params_xgb applied, so a second identical fit is redundant.
best_xgb_classifier = grid_search_xgb.best_estimator_

# Predictions for the best XGBoost Classifier
best_xgb_y_pred = best_xgb_classifier.predict(X_test_scaled)

# Evaluate the best XGBoost Classifier.
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value the default produces) without emitting UndefinedMetricWarning.
print("Best XGBoost Classifier:")
print("Accuracy:", accuracy_score(y_test, best_xgb_y_pred))
print("Classification Report:\n", classification_report(y_test, best_xgb_y_pred, zero_division=0))


Best Parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best XGBoost Classifier:
Accuracy: 0.850618889940082
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.56      0.18      0.27      6997

    accuracy                           0.85     50736
   macro avg       0.48      0.39      0.40     50736
weighted avg       0.81      0.85      0.81     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:



# Reload the dataset so this cell is self-contained
data = pd.read_csv("dataset/diabetes_012_health_indicators_BRFSS2015.csv")

# Features exclude the target and three predictors left out of this experiment
X = data.drop(columns=['Diabetes_012', 'Education', 'Income', 'DiffWalk'])
y = data['Diabetes_012']

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both partitions
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seeded base estimator; GridSearchCV clones it for every candidate
lgb_classifier = LGBMClassifier(random_state=42)

# 3 x 3 x 3 = 27 candidate configurations, each evaluated with 5-fold CV
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# A hand-tuned LGBMClassifier used to be fitted at this point, but it was
# overwritten by the grid-search winner before it was ever used for
# prediction. That dead training run has been removed.

# Perform GridSearchCV for LightGBM
grid_search_lgb = GridSearchCV(lgb_classifier, param_grid_lgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lgb.fit(X_train_scaled, y_train)

# Get the best parameters for LightGBM
best_params_lgb = grid_search_lgb.best_params_
print("Best Parameters for LightGBM:", best_params_lgb)

# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True). That estimator is a clone of lgb_classifier (same seed)
# with best_params_lgb applied, so a second identical fit is redundant.
best_lgb_classifier = grid_search_lgb.best_estimator_

# Predictions for the best LightGBM Classifier
best_lgb_y_pred = best_lgb_classifier.predict(X_test_scaled)

# Evaluate the best LightGBM Classifier.
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value the default produces) without emitting UndefinedMetricWarning.
print("Best LightGBM Classifier:")
print("Accuracy:", accuracy_score(y_test, best_lgb_y_pred))
print("Classification Report:\n", classification_report(y_test, best_lgb_y_pred, zero_division=0))


Best Parameters for LightGBM: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200}
Best LightGBM Classifier:
Accuracy: 0.8510525070955535
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.56      0.18      0.27      6997

    accuracy                           0.85     50736
   macro avg       0.48      0.39      0.40     50736
weighted avg       0.81      0.85      0.81     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Read the raw table from disk.
data = pd.read_csv("dataset/diabetes_012_health_indicators_BRFSS2015.csv")

# Target vector, and features with the target and three predictors removed.
y = data['Diabetes_012']
X = data.drop(['Diabetes_012', 'Education', 'Income', 'DiffWalk'], axis=1)

# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 1 - mean imputation: column means are learned from the training rows
# only and reused for the test rows.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Step 2 - one-hot encoding of the imputed matrix. Categories that appear only
# in the test rows are encoded as all-zero (handle_unknown='ignore').
# NOTE(review): the encoder is applied to every column, so any continuous
# predictor is expanded into one indicator per distinct value - confirm this
# is intended.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train_imputed)
X_test_encoded = encoder.transform(X_test_imputed)

# Step 3 - robust scaling of the encoded output. Centering is switched off
# because the encoder output is sparse and RobustScaler cannot center sparse input.
scaler = RobustScaler(with_centering=False).fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# X_train_scaled / X_test_scaled / y_train / y_test are the modeling inputs from here on.


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Seeded random forest with library-default hyperparameters, fitted on the
# preprocessed training matrix.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Score the held-out rows.
rf_y_pred = rf_classifier.predict(X_test_scaled)

# Report accuracy and the per-class precision/recall/F1 table.
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, rf_y_pred))
print("Classification Report:\n", classification_report(y_test, rf_y_pred))


Random Forest Classifier:
Accuracy: 0.8357773573005362
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.96      0.91     42795
         1.0       0.03      0.00      0.01       944
         2.0       0.43      0.19      0.27      6997

    accuracy                           0.84     50736
   macro avg       0.44      0.39      0.39     50736
weighted avg       0.79      0.84      0.80     50736

