In [None]:
import pandas as pd
df = pd.read_csv(".\diabetes_dataset.csv")

In [19]:
df.columns

Index(['Unnamed: 0', 'Age', 'Sex', 'Ethnicity', 'BMI', 'Waist_Circumference',
       'Fasting_Blood_Glucose', 'HbA1c', 'Blood_Pressure_Systolic',
       'Blood_Pressure_Diastolic', 'Cholesterol_Total', 'Cholesterol_HDL',
       'Cholesterol_LDL', 'GGT', 'Serum_Urate', 'Physical_Activity_Level',
       'Dietary_Intake_Calories', 'Alcohol_Consumption', 'Smoking_Status',
       'Family_History_of_Diabetes', 'Previous_Gestational_Diabetes'],
      dtype='object')

In [20]:
df['Outcome'] = ((df['HbA1c'] >= 6.5) | (df['Fasting_Blood_Glucose'] >= 126)).astype(int)

In [21]:
df['Sex'] = df['Sex'].str.strip().str.capitalize()  # clean it first
df['Sex'] = df['Sex'].map({'Male': 1, 'Female': 0})

In [22]:

# Ensure the column is in string format and remove any unwanted whitespaces
df['Smoking_Status'] = df['Smoking_Status'].astype(str).str.strip().str.capitalize()

# Check the unique values
print(df['Smoking_Status'].unique())

df = pd.get_dummies(df, dtype = int , columns=['Smoking_Status'], prefix='Smoke', drop_first=True)


['Never' 'Current' 'Former']


In [23]:
df['Alcohol_Consumption'].value_counts()

Alcohol_Consumption
Moderate    3373
Heavy       3307
Name: count, dtype: int64

In [24]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     10000 non-null  int64  
 1   Age                            10000 non-null  int64  
 2   Sex                            10000 non-null  int64  
 3   Ethnicity                      10000 non-null  object 
 4   BMI                            10000 non-null  float64
 5   Waist_Circumference            10000 non-null  float64
 6   Fasting_Blood_Glucose          10000 non-null  float64
 7   HbA1c                          10000 non-null  float64
 8   Blood_Pressure_Systolic        10000 non-null  int64  
 9   Blood_Pressure_Diastolic       10000 non-null  int64  
 10  Cholesterol_Total              10000 non-null  float64
 11  Cholesterol_HDL                10000 non-null  float64
 12  Cholesterol_LDL                10000 non-null  

In [25]:
mode_value = df['Alcohol_Consumption'].mode()[0]
df['Alcohol_Consumption'] = df['Alcohol_Consumption'].fillna(mode_value)

In [26]:
# One-hot encoding for 'Alcohol_Consumption' with drop_first=True
df = pd.get_dummies(df, columns=['Alcohol_Consumption'], dtype=int , prefix='Alcohol', drop_first=True)


In [27]:
df = pd.get_dummies(df, columns=['Physical_Activity_Level'], prefix='Physical_Activity', dtype=int , drop_first=True)

In [28]:
df = df.drop(columns=['Unnamed: 0', 'Ethnicity'])

In [29]:
import pandas as pd

# Step 1: Define age bins and labels
age_bins = [0, 25, 45, 65, 100]  # These are the bin edges (you can adjust the ranges if needed)
age_labels = ['Young', 'Middle-Aged', 'Older', 'Senior']  # Labels for each bin

# Step 2: Create a new 'Age_Group' column based on the age bins
df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)



In [30]:
df

Unnamed: 0,Age,Sex,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,...,Dietary_Intake_Calories,Family_History_of_Diabetes,Previous_Gestational_Diabetes,Outcome,Smoke_Former,Smoke_Never,Alcohol_Moderate,Physical_Activity_Low,Physical_Activity_Moderate,Age_Group
0,58,0,35.8,83.4,123.9,10.9,152,114,197.8,50.2,...,1538,0,1,1,0,1,1,0,1,Older
1,48,1,24.1,71.4,183.7,12.8,103,91,261.6,62.0,...,2653,0,1,1,0,0,1,0,1,Older
2,34,0,25.0,113.8,142.0,14.5,179,104,261.0,32.1,...,1684,1,0,1,1,0,0,1,0,Middle-Aged
3,62,1,32.7,100.4,167.4,8.8,176,118,183.4,41.1,...,3796,1,0,1,0,1,1,1,0,Older
4,27,0,33.5,110.8,146.4,7.1,122,97,203.2,53.9,...,3161,0,0,1,0,0,0,0,1,Middle-Aged
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,45,0,39.4,114.0,76.2,13.4,109,82,187.7,60.3,...,3582,1,1,1,1,0,1,1,0,Middle-Aged
9996,41,0,21.0,71.3,77.1,12.9,154,72,234.3,72.3,...,2206,1,1,1,0,1,1,1,0,Middle-Aged
9997,50,0,29.0,106.3,97.5,4.9,122,61,266.0,69.8,...,3175,1,1,0,1,0,0,0,0,Older
9998,62,0,27.3,119.9,89.0,11.5,99,115,172.3,74.2,...,3478,1,0,1,0,1,1,0,0,Older


In [31]:
df['Blood_Pressure_Systolic'].max()

179

In [32]:
df.columns

Index(['Age', 'Sex', 'BMI', 'Waist_Circumference', 'Fasting_Blood_Glucose',
       'HbA1c', 'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic',
       'Cholesterol_Total', 'Cholesterol_HDL', 'Cholesterol_LDL', 'GGT',
       'Serum_Urate', 'Dietary_Intake_Calories', 'Family_History_of_Diabetes',
       'Previous_Gestational_Diabetes', 'Outcome', 'Smoke_Former',
       'Smoke_Never', 'Alcohol_Moderate', 'Physical_Activity_Low',
       'Physical_Activity_Moderate', 'Age_Group'],
      dtype='object')

In [33]:
df = pd.get_dummies(df, dtype = int , columns=['Age_Group'], prefix='Age', drop_first=True)


In [34]:
df.drop(columns=['Age'])

Unnamed: 0,Sex,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,...,Previous_Gestational_Diabetes,Outcome,Smoke_Former,Smoke_Never,Alcohol_Moderate,Physical_Activity_Low,Physical_Activity_Moderate,Age_Middle-Aged,Age_Older,Age_Senior
0,0,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,...,1,1,0,1,1,0,1,0,1,0
1,1,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,...,1,1,0,0,1,0,1,0,1,0
2,0,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,...,0,1,1,0,0,1,0,1,0,0
3,1,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,...,0,1,0,1,1,1,0,0,1,0
4,0,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,...,0,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,39.4,114.0,76.2,13.4,109,82,187.7,60.3,88.7,...,1,1,1,0,1,1,0,1,0,0
9996,0,21.0,71.3,77.1,12.9,154,72,234.3,72.3,200.0,...,1,1,0,1,1,1,0,1,0,0
9997,0,29.0,106.3,97.5,4.9,122,61,266.0,69.8,156.1,...,1,0,1,0,0,0,0,0,1,0
9998,0,27.3,119.9,89.0,11.5,99,115,172.3,74.2,110.9,...,0,1,0,1,1,0,0,0,1,0


In [35]:
df

Unnamed: 0,Age,Sex,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,...,Previous_Gestational_Diabetes,Outcome,Smoke_Former,Smoke_Never,Alcohol_Moderate,Physical_Activity_Low,Physical_Activity_Moderate,Age_Middle-Aged,Age_Older,Age_Senior
0,58,0,35.8,83.4,123.9,10.9,152,114,197.8,50.2,...,1,1,0,1,1,0,1,0,1,0
1,48,1,24.1,71.4,183.7,12.8,103,91,261.6,62.0,...,1,1,0,0,1,0,1,0,1,0
2,34,0,25.0,113.8,142.0,14.5,179,104,261.0,32.1,...,0,1,1,0,0,1,0,1,0,0
3,62,1,32.7,100.4,167.4,8.8,176,118,183.4,41.1,...,0,1,0,1,1,1,0,0,1,0
4,27,0,33.5,110.8,146.4,7.1,122,97,203.2,53.9,...,0,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,45,0,39.4,114.0,76.2,13.4,109,82,187.7,60.3,...,1,1,1,0,1,1,0,1,0,0
9996,41,0,21.0,71.3,77.1,12.9,154,72,234.3,72.3,...,1,1,0,1,1,1,0,1,0,0
9997,50,0,29.0,106.3,97.5,4.9,122,61,266.0,69.8,...,1,0,1,0,0,0,0,0,1,0
9998,62,0,27.3,119.9,89.0,11.5,99,115,172.3,74.2,...,0,1,0,1,1,0,0,0,1,0


In [36]:
df.drop(columns=['Age'] , inplace = True)

In [37]:
from sklearn.model_selection import train_test_split

# Assuming 'df' is your full dataset
X = df.drop('Outcome', axis=1)  # Drop the 'Outcome' column (target column) from X
y = df['Outcome']  # The 'Outcome' column is the target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the splits
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")



Training data shape: (8000, 23)
Testing data shape: (2000, 23)


In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Logistic Regression model
log_reg_model = LogisticRegression(random_state=42)

# Train the Logistic Regression model on the scaled training data
log_reg_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = log_reg_model.predict(X_test)

# Evaluate the model
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Logistic Regression Accuracy: 94.60%

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.61      0.67       181
           1       0.96      0.98      0.97      1819

    accuracy                           0.95      2000
   macro avg       0.85      0.79      0.82      2000
weighted avg       0.94      0.95      0.94      2000


Confusion Matrix:
[[ 110   71]
 [  37 1782]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
from sklearn.preprocessing import StandardScaler

# Columns to scale
cols_to_scale = [
    'BMI',
    'Waist_Circumference',
    'Fasting_Blood_Glucose',
    'HbA1c',
    'Cholesterol_HDL',
    'Serum_Urate',
    'Blood_Pressure_Systolic',
    'Blood_Pressure_Diastolic',
    'Cholesterol_Total',
    'Dietary_Intake_Calories'
]

scaler = StandardScaler()

# Scale and update the DataFrame
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])


In [40]:
df.columns

Index(['Sex', 'BMI', 'Waist_Circumference', 'Fasting_Blood_Glucose', 'HbA1c',
       'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic',
       'Cholesterol_Total', 'Cholesterol_HDL', 'Cholesterol_LDL', 'GGT',
       'Serum_Urate', 'Dietary_Intake_Calories', 'Family_History_of_Diabetes',
       'Previous_Gestational_Diabetes', 'Outcome', 'Smoke_Former',
       'Smoke_Never', 'Alcohol_Moderate', 'Physical_Activity_Low',
       'Physical_Activity_Moderate', 'Age_Middle-Aged', 'Age_Older',
       'Age_Senior'],
      dtype='object')

In [41]:
from sklearn.model_selection import train_test_split

# Assuming 'df' is your full dataset
X = df.drop('Outcome', axis=1)  # Drop the 'Outcome' column (target column) from X
y = df['Outcome']  # The 'Outcome' column is the target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the splits
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")



Training data shape: (8000, 23)
Testing data shape: (2000, 23)


In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Logistic Regression model
log_reg_model = LogisticRegression(random_state=42)

# Train the Logistic Regression model on the scaled training data
log_reg_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = log_reg_model.predict(X_test)

# Evaluate the model
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Logistic Regression Accuracy: 95.85%

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77       181
           1       0.98      0.98      0.98      1819

    accuracy                           0.96      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.96      0.96      0.96      2000


Confusion Matrix:
[[ 139   42]
 [  41 1778]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
