In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the diabetes prediction dataset from its hard-coded CSV location
# and preview the first rows as the cell's output.
df = pd.read_csv(
    filepath_or_buffer='/content/diabetes_prediction_dataset.csv',
)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Check for missing values (per-column count of nulls)
print(df.isnull().sum())

# Fill missing values (if any) in the categorical columns with each column's
# most frequent value (its mode).
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form acts on an
# intermediate object, emits a pandas FutureWarning, and will silently stop
# updating `df` in pandas 3.0.
for col in ['gender', 'smoking_history']:
  df[col] = df[col].fillna(df[col].mode()[0])

# Convert categorical columns to numerical using one-hot encoding.
# drop_first=True omits one indicator column per feature, so that category is
# represented by all of the feature's remaining indicators being 0/False.
df = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=True)

# Normalize numerical features to the [0, 1] range using min-max scaling
from sklearn.preprocessing import MinMaxScaler

numerical_cols = ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level']
scaler = MinMaxScaler()
# NOTE(review): the scaler is fit on every row of `df`. If the data is split
# into train/test afterwards, the test rows have already influenced the
# min/max used for scaling (data leakage) - consider fitting on the training
# split only. The fitted `scaler` must be kept to transform new samples.
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Convert target variable to numerical (if needed)
# Example: If 'diabetes' is represented as 'yes'/'no'
# df['diabetes'] = df['diabetes'].map({'yes': 1, 'no': 0})

print(df.head())

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64
        age  hypertension  heart_disease       bmi  HbA1c_level  \
0  1.000000           0.0            1.0  0.177171     0.563636   
1  0.674675           0.0            0.0  0.202031     0.563636   
2  0.349349           0.0            0.0  0.202031     0.400000   
3  0.449449           0.0            0.0  0.156863     0.272727   
4  0.949950           1.0            1.0  0.118231     0.236364   

   blood_glucose_level  diabetes  gender_Male  gender_Other  \
0             0.272727         0        False         False   
1             0.000000         0        False         False   
2             0.354545         0         True         False   
3             0.340909         0        False         False   
4             0.340909         0         Tru

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

# Define features (X) and target (y)
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Split data into training and testing sets (20% held out, fixed seed so the
# split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize classifiers (seeded where the estimator accepts random_state)
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

# Train and evaluate classifiers.
# `results` maps classifier name -> {'accuracy': float, 'classification_report': str}.
# NOTE(review): the positive class is a small minority of the test set, so
# accuracy alone is flattering - read the per-class recall/F1 in the reports.
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Build the report once and reuse it for both storage and printing
    # (it was previously recomputed for the print statement).
    report = classification_report(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'classification_report': report
    }
    print(f"{name} Accuracy: {accuracy}")
    print(f"{name} Classification Report:\n{report}")


# Linear Regression (for comparison, though not ideal for classification)
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_lr = linear_reg.predict(X_test)

# Threshold the regression output at 0.5 to obtain class labels (0 or 1).
# These are unbounded continuous scores, not probabilities: a linear
# regression can predict values below 0 or above 1.
y_pred_lr_class = np.where(y_pred_lr > 0.5, 1, 0)

# MSE is measured on the raw continuous output; accuracy and the report use
# the thresholded labels.
mse = mean_squared_error(y_test, y_pred_lr)
accuracy_lr = accuracy_score(y_test, y_pred_lr_class)
print("\nLinear Regression:")
print(f"Mean Squared Error: {mse}")
print(f"Accuracy: {accuracy_lr}")
print(classification_report(y_test, y_pred_lr_class))

Random Forest Accuracy: 0.96995
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.95      0.69      0.80      1708

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000

K-Nearest Neighbors Accuracy: 0.96095
K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.90      0.61      0.73      1708

    accuracy                           0.96     20000
   macro avg       0.93      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000





AdaBoost Accuracy: 0.97245
AdaBoost Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     18292
           1       0.97      0.70      0.81      1708

    accuracy                           0.97     20000
   macro avg       0.97      0.85      0.90     20000
weighted avg       0.97      0.97      0.97     20000

Decision Tree Accuracy: 0.9517
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     18292
           1       0.71      0.74      0.72      1708

    accuracy                           0.95     20000
   macro avg       0.84      0.86      0.85     20000
weighted avg       0.95      0.95      0.95     20000


Linear Regression:
Mean Squared Error: 0.051179168858889484
Accuracy: 0.9386
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18292
           1       0.99      0.2

In [None]:

# Describe the new person with RAW (unscaled) values, keyed by the feature
# names used in X_train. Any training feature not listed here defaults to 0.
new_person = {
    'age': 50,  # Replace with the actual age
    'hypertension': 0,  # Replace with 1 for yes, 0 for no
    'heart_disease': 0,  # Replace with 1 for yes, 0 for no
    'bmi': 28,  # Replace with actual BMI
    'HbA1c_level': 6.5,  # Replace with the actual HbA1c level
    'blood_glucose_level': 140,  # Replace with actual glucose level
    'gender_Male': 1,  # Replace with 1 for male, 0 for female
    'gender_Other': 0,  # Set to 1 if gender is 'Other', 0 otherwise
    'smoking_history_never': 1,  # Replace with appropriate value from one-hot encoding
    'smoking_history_former': 0,
    'smoking_history_current': 0,
    'smoking_history_not current': 0,
    'smoking_history_ever': 0,
}

# Fail early on a misspelled feature name. Assigning an unknown name via .loc
# would silently add a new column and only fail later inside predict().
unknown_features = set(new_person) - set(X_train.columns)
if unknown_features:
    raise ValueError(
        f"Unknown feature(s) not present in X_train: {sorted(unknown_features)}"
    )

# Build a single-row frame with exactly the training columns, in the training
# order. Casting to float up front avoids the pandas dtype warning raised when
# a value such as 6.5 is written into a column initialised with integer zeros.
new_person_data = (
    pd.DataFrame([new_person])
    .reindex(columns=X_train.columns, fill_value=0)
    .astype(float)
)

# Scale the numerical features of the new data using the same fitted scaler
# (transform only - never re-fit on the new sample)
numerical_cols = ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level']
new_person_data[numerical_cols] = scaler.transform(new_person_data[numerical_cols])


# Reuse the Random Forest already fitted during evaluation rather than
# training an identical model again (same seed, same training data).
# NOTE(review): pick whichever classifier performed best in `results`.
best_classifier = classifiers["Random Forest"]

# Predict using the chosen classifier
prediction = best_classifier.predict(new_person_data)

# Print the prediction
if prediction[0] == 1:
    print("Prediction: The person is likely to have diabetes.")
else:
    print("Prediction: The person is likely not to have diabetes.")

  new_person_data.loc[0, 'HbA1c_level'] = 6.5  # Replace with the actual HbA1c level


Prediction: The person is likely not to have diabetes.
