In [1]:
# Cell 1: Verify and Preprocess New Dataset
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

# Define dataset path
dataset_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_dataset.csv"

# Verify file exists
if os.path.exists(dataset_path):
    print(f"Dataset found at: {dataset_path}")
    
    # Load the dataset
    df = pd.read_csv(dataset_path)
    print(f"\nDataset Shape: {df.shape}")
    
    # Check for missing values (NaN)
    print("\nMissing Values (NaN):")
    print(df.isnull().sum())
    
    # Check for potential missing data (e.g., "No Info" in smoking_history)
    print("\nUnique Values in Smoking History:")
    print(df['smoking_history'].value_counts())
    
    # Inspect data types
    print("\nData Types:")
    print(df.dtypes)
    
    # Handle categorical variables
    le_gender = LabelEncoder()
    df['gender'] = le_gender.fit_transform(df['gender'])
    print("\nGender Encoding:", dict(zip(le_gender.classes_, le_gender.transform(le_gender.classes_))))
    
    le_smoking = LabelEncoder()
    df['smoking_history'] = le_smoking.fit_transform(df['smoking_history'])
    print("Smoking History Encoding:", dict(zip(le_smoking.classes_, le_smoking.transform(le_smoking.classes_))))
    
    # Save preprocessed dataset
    preprocessed_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_preprocessed.csv"
    df.to_csv(preprocessed_path, index=False)
    print(f"\nSaved preprocessed dataset to: {preprocessed_path}")
    
    # Preview updated dataset
    print("\nPreprocessed Dataset Preview (First 5 Rows):")
    print(df.head())
else:
    print("Dataset not found at:", dataset_path)
# Expected Output (example, based on sample data):
# Dataset found at: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_dataset.csv
#
# Dataset Shape: (100000+, 9)
#
# Missing Values (NaN):
# gender                 0
# age                    0
# hypertension           0
# heart_disease          0
# smoking_history        0
# bmi                    0
# HbA1c_level            0
# blood_glucose_level    0
# diabetes               0
# dtype: int64
#
# Unique Values in Smoking History:
# No Info       35816
# never         35095
# ever           9114
# former         9188
# current        9286
# not current    7501
# Name: smoking_history, dtype: int64
#
# Data Types:
# gender                  int64
# age                   float64
# hypertension            int64
# heart_disease           int64
# smoking_history        object
# bmi                   float64
# HbA1c_level           float64
# blood_glucose_level     int64
# diabetes                int64
# dtype: object
#
# Gender Encoding: {'Female': 0, 'Male': 1}
# Smoking History Encoding: {'No Info': 0, 'current': 1, 'ever': 2, 'former': 3, 'never': 4, 'not current': 5}
#
# Saved preprocessed dataset to: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_preprocessed.csv
#
# Preprocessed Dataset Preview (First 5 Rows):
#    gender   age  hypertension  heart_disease  smoking_history    bmi  HbA1c_level  blood_glucose_level  diabetes
# 0       0  80.0             0              1               4  25.19          6.6                140         0
# 1       0  54.0             0              0               0  27.32          6.6                 80         0
# 2       1  28.0             0              0               4  27.32          5.7                158         0
# 3       0  36.0             0              0               1  23.45          5.0                155         0
# 4       1  76.0             1              1               1  20.14          4.8                155         0

Dataset found at: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_dataset.csv

Dataset Shape: (100000, 9)

Missing Values (NaN):
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

Unique Values in Smoking History:
smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64

Data Types:
gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

Gender Encoding: {'Female': np.int64(0), 'Male': np.int64(1), 'Other': np.int64(2)}
Smoking Hist

In [1]:
# Cell 1: Add Demographic Column (AgeGroup) to New Dataset
import pandas as pd
import os

# Load preprocessed dataset
preprocessed_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_preprocessed.csv"
df = pd.read_csv(preprocessed_path)

# Bin age into groups (Young <40, Older >=40)
df['AgeGroup'] = pd.cut(df['age'], bins=[0, 40, 100], labels=['Young (<40)', 'Older (>=40)'], right=False)
print("Age Group Distribution:")
print(df['AgeGroup'].value_counts())

# Save updated dataset with AgeGroup
updated_dataset_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_with_demographics.csv"
df.to_csv(updated_dataset_path, index=False)
print(f"\nSaved dataset with demographics to: {updated_dataset_path}")

# Preview updated dataset
print("\nUpdated Dataset Preview (First 5 Rows):")
print(df[['gender', 'age', 'AgeGroup', 'diabetes']].head())
# Expected Output (example, based on sample data):
# Age Group Distribution:
# Young (<40)     65000
# Older (>=40)    35000
# Name: AgeGroup, dtype: int64
#
# Saved dataset with demographics to: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_with_demographics.csv
#
# Updated Dataset Preview (First 5 Rows):
#    gender   age      AgeGroup  diabetes
# 0       0  80.0  Older (>=40)         0
# 1       0  54.0  Older (>=40)         0
# 2       1  28.0   Young (<40)         0
# 3       0  36.0   Young (<40)         0
# 4       1  76.0  Older (>=40)         0

Age Group Distribution:
AgeGroup
Older (>=40)    54513
Young (<40)     45487
Name: count, dtype: int64

Saved dataset with demographics to: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_with_demographics.csv

Updated Dataset Preview (First 5 Rows):
   gender   age      AgeGroup  diabetes
0       0  80.0  Older (>=40)         0
1       0  54.0  Older (>=40)         0
2       1  28.0   Young (<40)         0
3       0  36.0   Young (<40)         0
4       1  76.0  Older (>=40)         0


In [2]:
# Cell 1: Data Visualization and Correlation Matrix
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset with demographics
dataset_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_with_demographics.csv"
df = pd.read_csv(dataset_path)

# Select key numerical features for visualization
features_to_visualize = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']

# Create histograms for each feature
plt.figure(figsize=(12, 8))
for i, feature in enumerate(features_to_visualize, 1):
    plt.subplot(2, 3, i)
    sns.histplot(data=df, x=feature, hue='diabetes', multiple='stack', bins=20)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Count')
plt.tight_layout()
plt.savefig("/Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/feature_distributions.png")
plt.close()
print("Saved feature distributions plot to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/feature_distributions.png")

# Create box plots for each feature
plt.figure(figsize=(12, 6))
for i, feature in enumerate(features_to_visualize[:-1], 1):  # Exclude diabetes from box plots
    plt.subplot(2, 2, i)
    sns.boxplot(data=df, x='diabetes', y=feature)
    plt.title(f'{feature} by Diabetes')
    plt.xlabel('Diabetes (0 = No, 1 = Yes)')
    plt.ylabel(feature)
plt.tight_layout()
plt.savefig("/Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/feature_boxplots.png")
plt.close()
print("Saved feature box plots to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/feature_boxplots.png")

# Calculate correlation matrix for numerical features
correlation_matrix = df[features_to_visualize].corr()

# Create heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Matrix of Features with Diabetes')
plt.savefig("/Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/correlation_matrix.png")
plt.close()
print("Saved correlation matrix to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/correlation_matrix.png")

# Display a sample of the correlation matrix
print("\nCorrelation Matrix Sample:")
print(correlation_matrix)
# Expected Output (example, values will vary):
# Saved feature distributions plot to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/feature_distributions.png
# Saved feature box plots to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/feature_boxplots.png
# Saved correlation matrix to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/correlation_matrix.png
#
# Correlation Matrix Sample:
#                     age       bmi  HbA1c_level  blood_glucose_level  diabetes
# age                1.000000  0.123456    0.234567           0.345678  0.456789
# bmi                0.123456  1.000000    0.345678           0.456789  0.567890
# HbA1c_level        0.234567  0.345678    1.000000           0.567890  0.678901
# blood_glucose_level 0.345678  0.456789    0.567890           1.000000  0.789012
# diabetes           0.456789  0.567890    0.678901           0.789012  1.000000

Saved feature distributions plot to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/feature_distributions.png
Saved feature box plots to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/feature_boxplots.png
Saved correlation matrix to: /Users/dereddylikhith/Desktop/AI_Bias_Project/visuals/correlation_matrix.png

Correlation Matrix Sample:
                          age       bmi  HbA1c_level  blood_glucose_level  \
age                  1.000000  0.337396     0.101354             0.110672   
bmi                  0.337396  1.000000     0.082997             0.091261   
HbA1c_level          0.101354  0.082997     1.000000             0.166733   
blood_glucose_level  0.110672  0.091261     0.166733             1.000000   
diabetes             0.258008  0.214357     0.400660             0.419558   

                     diabetes  
age                  0.258008  
bmi                  0.214357  
HbA1c_level          0.400660  
blood_glucose_level  0.419558  
diabetes             1.

In [6]:
# Cell 1: Interpret Visualizations and Correlation Matrix with Gemini
import pandas as pd
import os
import google.generativeai as genai
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

# Load dataset to regenerate correlation matrix (for accuracy)
dataset_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_with_demographics.csv"
df = pd.read_csv(dataset_path)
correlation_matrix = df[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']].corr()

# Prepare prompt with correlation matrix and expected visualization insights
prompt = f"""
You are an AI assistant interpreting data visualizations and a correlation matrix for a non-technical audience. Use simple language.

**Dataset Context:**
- We are analyzing a diabetes prediction dataset with 100,000 people.
- Features: age, bmi (Body Mass Index), HbA1c_level (blood sugar average), blood_glucose_level (current blood sugar).
- Target: diabetes (0 = No, 1 = Yes).
- Age is split into Young (<40) and Older (>=40) groups.

**Correlation Matrix:**
{correlation_matrix.to_string()}

**Expected Visualization Patterns:**
- Histograms: age might show two peaks (Young <40, Older >=40), bmi might be right-skewed with outliers, HbA1c_level and blood_glucose_level might be skewed right for diabetes=1 cases.
- Box Plots: Higher medians for HbA1c_level and blood_glucose_level for diabetes=1, possible outliers in bmi for both diabetes=0 and 1.

**Task:**
1. Explain what the correlation matrix tells us about which features are most related to diabetes.
2. Describe what the expected histogram and box plot patterns suggest about the data.
3. Recommend which features to prioritize for training a model and any preprocessing steps (e.g., handling outliers) based on the correlations and visualizations.
"""

# Use Gemini API to generate interpretation
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content(prompt)
print("LLM Interpretation of Visualizations and Correlation Matrix:")
print(response.text)
# Expected Output (based on correlation matrix from Step 46: age 0.258008, bmi 0.214357, HbA1c_level 0.400660, blood_glucose_level 0.419558):
# LLM Interpretation of Visualizations and Correlation Matrix:
# Hi! I’m here to help you understand the pictures and numbers from our big diabetes game with 100,000 players!
#
# **What the Friendship Chart (Correlation Matrix) Tells Us:**
# - The friendship chart shows how much each game card (feature) helps us find the special sticker (diabetes).
# - **blood_glucose_level** (0.4196) and **HbA1c_level** (0.4007) are the best friends with diabetes—they’re the strongest clues to guess who has it!
# - **age** (0.2580) and **bmi** (0.2144) are weaker friends, meaning they help a little but not as much.
# - The cards don’t copy each other too much (like age and bmi at 0.3374), so we can use them all without getting mixed up.
#
# **What the Pictures (Histograms and Box Plots) Suggest:**
# - The age picture might show two big groups: lots of players under 40 and lots over 40, which fits our Young and Older teams.
# - The bmi picture might lean toward bigger numbers with some players having really high or low weights (outliers), showing some unusual cases.
# - The HbA1c_level and blood_glucose_level pictures might lean toward higher numbers for players with diabetes, and the box plots will show their averages are higher when they have diabetes.
#
# **What to Do Next:**
# - Use **blood_glucose_level** and **HbA1c_level** as the main clues for teaching the robot (model) because they’re the best friends with diabetes.
# - Watch out for the bmi outliers—some players have really big or small weights. We might need to set a limit (like keeping bmi between 15 and 50) to help the robot focus.
# - age and bmi can be extra helpers, but they’re not as strong, so let’s use them too but not rely on them too much.
# - Get the robot ready to learn with these clues!



LLM Interpretation of Visualizations and Correlation Matrix:
Let's break down this diabetes data analysis.  We're looking at how age, BMI, average blood sugar (HbA1c_level), and current blood sugar (blood_glucose_level) relate to whether someone has diabetes.

1. **What the Correlation Matrix Tells Us:**

The correlation matrix shows how strongly different things are related.  A number close to 1 means a strong positive relationship (as one goes up, the other goes up). A number close to -1 means a strong negative relationship (as one goes up, the other goes down).  A number close to 0 means a weak or no relationship.

Looking at the last column ("diabetes"), we see:

* **Blood glucose level** shows the strongest positive correlation (0.42) with diabetes.  This means higher blood glucose levels are associated with a higher chance of having diabetes.
* **HbA1c level** also has a strong positive correlation (0.40) with diabetes.  This is expected, as HbA1c reflects average blood sugar ove

In [7]:
# Cell 1: Preprocess Dataset Based on LLM Recommendations
import pandas as pd
import os
import numpy as np
from scipy import stats

# Load dataset with demographics
dataset_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_with_demographics.csv"
df = pd.read_csv(dataset_path)

# Handle bmi outliers (cap between 15 and 50)
df['bmi'] = df['bmi'].clip(lower=15, upper=50)
print("BMI after outlier capping - Min:", df['bmi'].min(), "Max:", df['bmi'].max())

# Apply logarithmic transformation to right-skewed variables
# Add a small constant to avoid log(0) and ensure positive values
df['bmi_log'] = np.log(df['bmi'] + 1)
df['HbA1c_level_log'] = np.log(df['HbA1c_level'] + 1)
df['blood_glucose_level_log'] = np.log(df['blood_glucose_level'] + 1)

# Optional: Verify transformation with skewness (closer to 0 is better)
print("\nSkewness After Log Transformation:")
print("bmi_log skewness:", df['bmi_log'].skew())
print("HbA1c_level_log skewness:", df['HbA1c_level_log'].skew())
print("blood_glucose_level_log skewness:", df['blood_glucose_level_log'].skew())

# Drop original skewed columns (optional, keep for reference if needed)
df = df.drop(columns=['bmi', 'HbA1c_level', 'blood_glucose_level'])

# Save preprocessed dataset
preprocessed_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_preprocessed_transformed.csv"
df.to_csv(preprocessed_path, index=False)
print(f"\nSaved preprocessed dataset with transformations to: {preprocessed_path}")

# Preview updated dataset
print("\nPreprocessed Dataset Preview (First 5 Rows):")
print(df.head())
# Expected Output (example, values will vary):
# BMI after outlier capping - Min: 15 Max: 50
#
# Skewness After Log Transformation:
# bmi_log skewness: 0.123
# HbA1c_level_log skewness: 0.089
# blood_glucose_level_log skewness: 0.105
#
# Saved preprocessed dataset with transformations to: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_preprocessed_transformed.csv
#
# Preprocessed Dataset Preview (First 5 Rows):
#    gender   age  hypertension  heart_disease  smoking_history  diabetes  \
# 0       0  80.0             0              1                4         0   
# 1       0  54.0             0              0                0         0   
# 2       1  28.0             0              0                4         0   
# 3       0  36.0             0              0                1         0   
# 4       1  76.0             1              1                1         0   
#
#        AgeGroup  bmi_log  HbA1c_level_log  blood_glucose_level_log  
# 0  Older (>=40)  3.258097         2.022214                 4.948760  
# 1  Older (>=40)  3.332205         2.022214                 4.394449  
# 2   Young (<40)  3.332205         1.911609                 5.062595  
# 3   Young (<40)  3.196230         1.791759                 5.049856  
# 4  Older (>=40)  3.045023         1.740466                 5.049856  

BMI after outlier capping - Min: 15.0 Max: 50.0

Skewness After Log Transformation:
bmi_log skewness: -0.11404154093130534
HbA1c_level_log skewness: -0.5185292163640469
blood_glucose_level_log skewness: -0.0971701971912811

Saved preprocessed dataset with transformations to: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_preprocessed_transformed.csv

Preprocessed Dataset Preview (First 5 Rows):
   gender   age  hypertension  heart_disease  smoking_history  diabetes  \
0       0  80.0             0              1                4         0   
1       0  54.0             0              0                0         0   
2       1  28.0             0              0                4         0   
3       0  36.0             0              0                1         0   
4       1  76.0             1              1                1         0   

       AgeGroup   bmi_log  HbA1c_level_log  blood_glucose_level_log  
0  Older (>=40)  3.265378         2.028148           

In [8]:
# Cell 1: Train a Decision Tree Classifier on Preprocessed Dataset
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Load transformed dataset
dataset_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/diabetes_prediction_preprocessed_transformed.csv"
df = pd.read_csv(dataset_path)

# Define features (exclude AgeGroup, gender, diabetes)
features = [col for col in df.columns if col not in ['AgeGroup', 'gender', 'diabetes']]
X = df[features]
y = df['diabetes']

# Split data into training and testing sets (80/20 split)
# Use a sample for efficiency due to large dataset size (100,000 rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Decision Tree classifier
model = DecisionTreeClassifier(random_state=42, max_depth=10)  # Limit depth to prevent overfitting
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print results
print("Model Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes']))

# Save the test set with predictions for bias analysis
test_df = X_test.copy()
test_df['diabetes'] = y_test
test_df['Predicted_diabetes'] = y_pred
test_df['gender'] = df.loc[X_test.index, 'gender']
test_df['AgeGroup'] = df.loc[X_test.index, 'AgeGroup']
test_df.to_csv("/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/test_predictions_transformed.csv", index=False)
print("\nSaved test set with predictions to: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/test_predictions_transformed.csv")
# Expected Output (example, metrics will vary due to transformations):
# Model Performance Metrics:
# Accuracy: 0.8600
# Precision: 0.7900
# Recall: 0.7300
#
# Classification Report:
#               precision    recall  f1-score   support
# No Diabetes       0.89      0.91      0.90     16000
#    Diabetes       0.79      0.73      0.76      4000
#
#    accuracy                           0.86     20000
#   macro avg       0.84      0.82      0.83     20000
# weighted avg       0.86      0.86      0.86     20000
#
# Saved test set with predictions to: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/test_predictions_transformed.csv


Model Performance Metrics:
Accuracy: 0.9716
Precision: 0.9709
Recall: 0.6865

Classification Report:
              precision    recall  f1-score   support

 No Diabetes       0.97      1.00      0.98     18300
    Diabetes       0.97      0.69      0.80      1700

    accuracy                           0.97     20000
   macro avg       0.97      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000


Saved test set with predictions to: /Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/test_predictions_transformed.csv


In [9]:
# Cell 1: Perform Bias Analysis with aif360 on New Dataset
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric

# Load test set with predictions
test_path = "/Users/dereddylikhith/Desktop/AI_Bias_Project/datasets/test_predictions_transformed.csv"
df_test = pd.read_csv(test_path)

# Prepare dataset for aif360 (features + demographics + labels)
features = [col for col in df_test.columns if col not in ['gender', 'AgeGroup', 'diabetes', 'Predicted_diabetes']]
df_aif = df_test[features + ['gender', 'AgeGroup', 'diabetes', 'Predicted_diabetes']]

# Ensure categorical columns are encoded (already numeric from preprocessing, but verify)
le_gender = LabelEncoder()
df_aif['gender'] = le_gender.fit_transform(df_aif['gender'])
print("Gender Encoding:", dict(zip(le_gender.classes_, le_gender.transform(le_gender.classes_))))

le_agegroup = LabelEncoder()
df_aif['AgeGroup'] = le_agegroup.fit_transform(df_aif['AgeGroup'])
print("AgeGroup Encoding:", dict(zip(le_agegroup.classes_, le_agegroup.transform(le_agegroup.classes_))))

# Convert to BinaryLabelDataset for aif360 (gender: Male=1, Female=0, Other=2)
df_gender = BinaryLabelDataset(
    df=df_aif,
    label_names=['Predicted_diabetes'],
    protected_attribute_names=['gender'],
    favorable_label=1,  # Favorable outcome: Predicted as 1 (Diabetes)
    unfavorable_label=0
)

# Define privileged and unprivileged groups for gender
# Assuming Male as privileged (common in some bias studies, adjust if needed)
privileged_groups_gender = [{'gender': 1}]  # Male
unprivileged_groups_gender = [{'gender': 0}, {'gender': 2}]  # Female, Other

# Calculate fairness metrics for gender
metric_gender = BinaryLabelDatasetMetric(
    df_gender,
    privileged_groups=privileged_groups_gender,
    unprivileged_groups=unprivileged_groups_gender
)

print("\nFairness Metrics for Gender (Male vs. Female/Other):")
print(f"Disparate Impact: {metric_gender.disparate_impact():.4f}")
print(f"Statistical Parity Difference: {metric_gender.statistical_parity_difference():.4f}")

# Convert to BinaryLabelDataset for aif360 (AgeGroup: Young(<40)=0, Older(>=40)=1)
df_age = BinaryLabelDataset(
    df=df_aif,
    label_names=['Predicted_diabetes'],
    protected_attribute_names=['AgeGroup'],
    favorable_label=1,
    unfavorable_label=0
)

# Define privileged and unprivileged groups for AgeGroup
# Assuming Young as privileged (based on prior analysis), Older as unprivileged
privileged_groups_age = [{'AgeGroup': 0}]  # Young (<40)
unprivileged_groups_age = [{'AgeGroup': 1}]  # Older (>=40)

# Calculate fairness metrics for AgeGroup
metric_age = BinaryLabelDatasetMetric(
    df_age,
    privileged_groups=privileged_groups_age,
    unprivileged_groups=unprivileged_groups_age
)

print("\nFairness Metrics for AgeGroup (Young vs. Older):")
print(f"Disparate Impact: {metric_age.disparate_impact():.4f}")
print(f"Statistical Parity Difference: {metric_age.statistical_parity_difference():.4f}")

# Interpretation
print("\nInterpretation:")
print("Disparate Impact ~1.0 is ideal (no bias). <0.8 or >1.25 suggests bias.")
print("Statistical Parity Difference ~0.0 is ideal. Negative means unprivileged group (Female/Other or Older) is less favored.")
# Expected Output (example, values will vary):
# Gender Encoding: {0: 0, 1: 1, 2: 2}
# AgeGroup Encoding: {'Older (>=40)': 0, 'Young (<40)': 1}
#
# Fairness Metrics for Gender (Male vs. Female/Other):
# Disparate Impact: 0.9200
# Statistical Parity Difference: -0.0300
#
# Fairness Metrics for AgeGroup (Young vs. Older):
# Disparate Impact: 0.7500
# Statistical Parity Difference: -0.1200
#
# Interpretation:
# Disparate Impact ~1.0 is ideal (no bias). <0.8 or >1.25 suggests bias.
# Statistical Parity Difference ~0.0 is ideal. Negative means unprivileged group (Female/Other or Older) is less favored.

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


Gender Encoding: {np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2)}
AgeGroup Encoding: {'Older (>=40)': np.int64(0), 'Young (<40)': np.int64(1)}

Fairness Metrics for Gender (Male vs. Female/Other):
Disparate Impact: 0.7640
Statistical Parity Difference: -0.0165

Fairness Metrics for AgeGroup (Young vs. Older):
Disparate Impact: 0.1024
Statistical Parity Difference: -0.0913

Interpretation:
Disparate Impact ~1.0 is ideal (no bias). <0.8 or >1.25 suggests bias.
Statistical Parity Difference ~0.0 is ideal. Negative means unprivileged group (Female/Other or Older) is less favored.
