In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [42]:

# Generate synthetic data
np.random.seed(0)
n_samples = 1000
data = pd.DataFrame({
    'Sex': np.random.randint(0, 2, n_samples),  # 0: Male, 1: Female
    'Age': np.random.randint(18, 65, n_samples),
    'Income': np.random.randint(20000, 100000, n_samples),
    'Employed': np.random.randint(0, 2, n_samples), # 0: No, 1: Yes
    'GoodPayer': np.random.randint(0, 2, n_samples)  # 0: Bad payer, 1: Good payer
})

# Introduce some bias (example: slightly more women are good payers)
for i in range(n_samples):
    if data['Sex'][i] == 1 and np.random.rand() < 0.1: # 10% chance for women to become good payers
        data['GoodPayer'][i] = 1

# Split data
X = data.drop('GoodPayer', axis=1)
y = data['GoodPayer']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data['GoodPayer'][i] = 1


In [43]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")



Model Accuracy: 0.46


In [44]:
data['predictions'] = model.predict(X)

In [45]:
def calculate_conditional_probability(df, sex_value):
    """Calculates Pr{Ŷ = 1 | A = sex_value, Y = 1}."""
    subset = df[(df['Sex'] == sex_value) & (df['True_Y'] == 1)]
    if subset.empty:
        return 0.0  # Handle empty subsets
    predicted_ones = subset['Predicted_Y'].sum()
    probability = predicted_ones / len(subset)
    return probability

def calculate_conditional_probabilities_groupby(df):
    """Calculates all conditional probabilities using groupby and pivot_table."""
    df_grouped = df[df['True_Y']==1].groupby(['Sex'])['Predicted_Y'].mean()
    return df_grouped

def calculate_conditional_probabilities_crosstab(df):
    """Calculates all conditional probabilities using crosstab."""
    return pd.crosstab(df['Sex'], df['True_Y'], df['Predicted_Y'], aggfunc='mean').loc[:,1]


# Calculate and print the conditional probabilities 
for sex_value in [0, 1]: # adjust [0, 1] according  the values of sex 
    probability = calculate_conditional_probability(df, sex_value)
    print(f"Pr{{Ŷ = 1 | A = {sex_value}, Y = 1}}: {probability:.3f}")

# Calculate and print the conditional probabilities (groupby and pivot_table)
probabilities_groupby = calculate_conditional_probabilities_groupby(df)
print("\nProbabilities using groupby:\n", probabilities_groupby)


Pr{Ŷ = 1 | A = 0, Y = 1}: 0.421
Pr{Ŷ = 1 | A = 1, Y = 1}: 0.480

Probabilities using groupby:
 Sex
0    0.421053
1    0.480000
Name: Predicted_Y, dtype: float64
