# A. Misspecification Metrics for the Dmochowski Datasets

#1. Magic

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import warnings
from scipy.stats import chi2
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Section banner for dataset 1; emitted as one call, with each piece on its
# own line so the output matches four consecutive print() calls.
print(
    "-----------------------------------------------------------------",
    "\n",
    "Dmochowski DATASET 1: MAGIC",
    "\n",
    sep="\n",
)

def summary_statistics(data):
    """
    Build a per-column overview of a dataframe.

    Parameters:
    data (DataFrame): Input dataframe.

    Returns:
    summary_df (DataFrame): One row per input column holding the output of
        ``describe(include='all')`` plus a trailing ``missing_values`` column
        with the number of null entries in that column.
    """
    # describe() yields one column per variable; flip it so variables are rows.
    described = data.describe(include='all').T
    null_counts = data.isna().sum()
    # assign() aligns the counts on the variable names, like item assignment.
    return described.assign(missing_values=null_counts)


def hosmer_lemeshow_test(y_true, y_pred_proba, group_bins=10):
    """
    Perform the Hosmer-Lemeshow goodness of fit test for logistic regression.

    Observations are grouped by fixed, equal-width cut-points of the predicted
    probability (``group_bins`` intervals covering [0, 1]); groups are NOT
    quantile-based. Groups that are empty, or whose expected variance is zero
    (every probability in the group is 0 or every one is 1), cannot contribute
    a finite term and are left out of the statistic.

    :param y_true: The true binary labels.
    :param y_pred_proba: The predicted probabilities from the logistic regression.
    :param group_bins: The number of bins to use for calculating the test statistic, default is 10.
    :return: The test statistic and the p-value. The p-value uses
        ``g - 2`` degrees of freedom, where ``g`` is the number of groups that
        actually entered the statistic; it is ``nan`` when fewer than three
        groups are usable.
    :raises ValueError: If the inputs differ in length, ``group_bins`` is not
        positive, or a probability lies outside [0, 1] (including NaN).
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred_proba = np.asarray(y_pred_proba, dtype=float).ravel()

    if group_bins < 1:
        raise ValueError("group_bins must be a positive integer.")
    if y_true.shape != y_pred_proba.shape:
        raise ValueError("y_true and y_pred_proba must have the same length.")
    # Written as a negated conjunction so that NaN is rejected as well.
    if np.any(~((y_pred_proba >= 0) & (y_pred_proba <= 1))):
        raise ValueError("y_pred_proba must contain probabilities in [0, 1].")

    # Digitize against the interior cut-points only: every probability in
    # [0, 1] then falls into one of exactly `group_bins` groups, including a
    # probability of exactly 0 (which would otherwise be dropped).
    interior_cutoffs = np.linspace(0, 1, group_bins + 1)[1:-1]
    bins = np.digitize(y_pred_proba, interior_cutoffs, right=True)

    # Group size, observed events and expected events per group
    bin_counts = np.bincount(bins, minlength=group_bins)
    obs_events_per_bin = np.bincount(bins, weights=y_true, minlength=group_bins)
    exp_events_per_bin = np.bincount(bins, weights=y_pred_proba, minlength=group_bins)

    # Keep only groups with a strictly positive variance n * p * (1 - p)
    usable = (
        (bin_counts > 0)
        & (exp_events_per_bin > 0)
        & (exp_events_per_bin < bin_counts)
    )
    obs_events_per_bin = obs_events_per_bin[usable]
    exp_events_per_bin = exp_events_per_bin[usable]
    bin_counts = bin_counts[usable]

    # Calculate the test statistic
    variance = exp_events_per_bin * (1 - exp_events_per_bin / bin_counts)
    hl_stat = float(np.sum((obs_events_per_bin - exp_events_per_bin) ** 2 / variance))

    # Degrees of freedom follow the number of groups actually used, not the
    # number requested, otherwise sparse data inflate the p-value.
    df = int(np.count_nonzero(usable)) - 2
    if df < 1:
        return hl_stat, float("nan")

    # sf() is the upper tail; it keeps precision where 1 - cdf() rounds to 0.
    p_value = float(chi2.sf(hl_stat, df))

    return hl_stat, p_value

def link_test_with_vif_reduction(model, X, y):
    """
    Fit an auxiliary logit on the features plus the model's predictions.

    The auxiliary design matrix is ``[X, y_pred, y_pred**2]`` where ``y_pred``
    is ``model.predict`` on ``X`` with a constant added. Feature columns whose
    variance inflation factor exceeds 10 are dropped one at a time, worst
    first, until none remains above the threshold. The two prediction columns
    are never dropped: they are the terms this test exists to examine.

    NOTE(review): the conventional link test regresses the outcome on the
    linear predictor and its square only; this variant keeps the original
    design (features plus predicted probabilities) - confirm that is intended.
    NOTE(review): the VIFs are computed on the matrix without an intercept
    column, as before - confirm against the statsmodels guidance.

    :param model: Fitted model exposing ``predict``; it must have been fitted
        on ``X`` with a leading constant column.
    :param X: Feature matrix without a constant column.
    :param y: Binary outcome aligned row-for-row with ``X``.
    :return: The statsmodels summary of the auxiliary logit. Rows are labelled
        with the surviving feature names followed by ``y_pred`` and
        ``y_pred_sq``.
    """
    vif_threshold = 10  # Threshold for VIF can be adjusted
    n_link_terms = 2    # y_pred and y_pred**2 occupy the last two columns

    # Ensure a constant term is included for the prediction
    X_const = sm.add_constant(X, has_constant='add')
    y_pred = np.asarray(model.predict(X_const), dtype=float)

    features = np.asarray(X, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)

    # Track a label per column so removals and the summary stay readable.
    if isinstance(X, pd.DataFrame):
        names = [str(col) for col in X.columns]
    else:
        names = [f"x{i + 1}" for i in range(features.shape[1])]
    names += ["y_pred", "y_pred_sq"]

    # Add predicted values and their squared terms to the features
    X_link_test = np.column_stack((features, y_pred, y_pred ** 2))

    # Drop the feature with the largest VIF until all are within the threshold
    while X_link_test.shape[1] > n_link_terms:
        n_candidates = X_link_test.shape[1] - n_link_terms
        vifs = np.array([
            variance_inflation_factor(X_link_test, i) for i in range(n_candidates)
        ])
        # A NaN VIF is never treated as "high", so rank it last.
        ranked = np.where(np.isnan(vifs), -np.inf, vifs)
        worst = int(np.argmax(ranked))
        if not ranked[worst] > vif_threshold:
            break
        print(f"Removing high VIF column: {names[worst]} (VIF={vifs[worst]:.2f})")
        X_link_test = np.delete(X_link_test, worst, axis=1)
        del names[worst]

    X_link_test = sm.add_constant(X_link_test)  # Re-add constant to ensure model specification
    # add_constant leaves the matrix unchanged if a constant column already
    # exists, so only label an intercept when one was actually prepended.
    if X_link_test.shape[1] == len(names) + 1:
        xname = ["const"] + names
    else:
        xname = names

    # Fit the new model with additional terms
    link_model = sm.Logit(y, X_link_test).fit(disp=0)
    return link_model.summary(xname=xname)



# Dataset 1 (MAGIC): load, fit a logit on standardized features, then report
# summary statistics, ROC AUC and the misspecification diagnostics.

# Load the dataset
data_path = '/content/magic04.data'
columns = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']
data_magic04 = pd.read_csv(data_path, header=None, names=columns)

# Encode the categorical target variable
label_encoder = LabelEncoder()
data_magic04['class_encoded'] = label_encoder.fit_transform(data_magic04['class'])

# Select features and target for analysis
X = data_magic04.drop(columns=['class', 'class_encoded'])
y = data_magic04['class_encoded']

# Split BEFORE scaling so the scaler's mean/variance come from training rows
# only and no information from the test set leaks into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# has_constant='add' guarantees an intercept column in both matrices, even if
# some feature happens to be constant within a split.
X_train_const = sm.add_constant(X_train, has_constant='add')  # Adding a constant column for bias
X_test_const = sm.add_constant(X_test, has_constant='add')

# Fit the logistic regression model using statsmodels
# NOTE: fit_regularized is called without `alpha`, whose default is 0, so no
# L1 penalty is actually applied.
warnings.simplefilter('ignore', ConvergenceWarning)
try:
    logit_model = sm.Logit(y_train, X_train_const).fit_regularized(method='l1', disp=0)
except Exception as e:
    print(f"An error occurred: {e}")
    # Everything below needs the fitted model; continuing would either fail
    # with a NameError or silently reuse a model left over from another cell.
    raise
print(logit_model.summary())

#Summary Statistics
print("-----------------------------------------------------------------")
print("\n")
print("General Summary Statistics:")
print(summary_statistics(data_magic04))
print("\n")

print("-----------------------------------------------------------------")
print("\n")

print("A. Overall performance Metric (1 metric):\n")

# Model evaluation using ROC AUC
y_pred_proba = logit_model.predict(X_test_const)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc}")

print("-----------------------------------------------------------------")
print("\n")
print("B. Model misspecification metrics:\n")

# Hosmer-Lemeshow Test: Evaluates the fit of the model across groups of
#     predicted risk, comparing observed and expected frequencies.
# AIC (Akaike's Information Criterion): Assesses model fit while penalizing
#     for the number of parameters, aiding in model selection.
# BIC (Bayesian Information Criterion): Similar to AIC but with a stronger
#     penalty for model complexity, useful in model selection.
# Link Test: Diagnoses misspecification of the functional form, checking if
#     the model is correctly specified.

# Hosmer-Lemeshow is evaluated on the held-out rows; AIC/BIC come from the
# training fit.
hl_stat, p_value = hosmer_lemeshow_test(y_test, y_pred_proba)
print(f"\t2) Hosmer-Lemeshow Test Statistic: {hl_stat}, P-value: {p_value}\n")
print(f"\t3) AIC: {logit_model.aic}, \n\t4) BIC: {logit_model.bic}\n")

# Use the function for the Link Test
link_test_result = link_test_with_vif_reduction(logit_model, X_test, y_test)
print(f"\n\t5) Link Test Result: {link_test_result}")  # Print the link_test_result)

print("-----------------------------------------------------------------")



-----------------------------------------------------------------


Dmochowski DATASET 1: MAGIC


                           Logit Regression Results                           
Dep. Variable:          class_encoded   No. Observations:                15216
Model:                          Logit   Df Residuals:                    15205
Method:                           MLE   Df Model:                           10
Date:                Wed, 24 Apr 2024   Pseudo R-squ.:                  0.2918
Time:                        09:27:28   Log-Likelihood:                -6984.9
converged:                       True   LL-Null:                       -9862.9
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.6429      0.022    -29.089      0.000      -0.686      -0.600
x1             1.2188      0.050 

#2. Adult

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
import statsmodels.api as sm
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2

# Dataset 2 (ADULT): load, label-encode, fit a logit on standardized features,
# then report summary statistics, ROC AUC and the misspecification diagnostics.
print("-----------------------------------------------------------------")
print("\n")
print("Dmochowski DATASET 2: ADULT")
print("\n")

# Load the Adult dataset
data_path_adult = '/content/adult.data'
data_adult = pd.read_csv(data_path_adult, header=None)

# Assuming the dataset has no header and assigning column names
columns_adult = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'income'
]
data_adult.columns = columns_adult

# Handle missing values if necessary (replace '?' with NaN and then impute/drop)
# NOTE(review): no imputation or dropping follows; after astype(str) below a
# NaN simply becomes its own 'nan' category. Also, if the file separates
# fields with ", " the raw values carry a leading space (' ?') and this
# replace matches nothing - confirm against the data file.
data_adult.replace('?', np.nan, inplace=True)

# Encode categorical variables
# The encoded integers overwrite the original columns, so the summary table
# printed below describes the codes, not the raw categories.
label_encoders = {}
for column in ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']:
    le = LabelEncoder()
    data_adult[column] = le.fit_transform(data_adult[column].astype(str))
    label_encoders[column] = le

# Encode target variable
label_encoder = LabelEncoder()
data_adult['income_encoded'] = label_encoder.fit_transform(data_adult['income'])

# Select features and target variable
X = data_adult.drop(['income', 'income_encoded'], axis=1)
y = data_adult['income_encoded']

# Split BEFORE scaling so the scaler's mean/variance come from training rows
# only and no information from the test set leaks into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# has_constant='add' guarantees an intercept column in both matrices, even if
# some feature happens to be constant within a split.
X_train_const = sm.add_constant(X_train, has_constant='add')  # Adding a constant column for bias
X_test_const = sm.add_constant(X_test, has_constant='add')

# Logistic Regression Model
# NOTE: fit_regularized is called without `alpha`, whose default is 0, so no
# L1 penalty is actually applied.
warnings.simplefilter('ignore', ConvergenceWarning)
try:
    logit_model = sm.Logit(y_train, X_train_const).fit_regularized(method='l1')
except Exception as e:
    print(f"An error occurred: {e}")
    # Everything below needs the fitted model; continuing would either fail
    # with a NameError or silently reuse a model left over from another cell.
    raise
print(logit_model.summary())

#Summary Statistics
print("-----------------------------------------------------------------")
print("\n")
print("General Summary Statistics:")
print(summary_statistics(data_adult))
print("\n")

print("-----------------------------------------------------------------")
print("\n")

print("A. Overall performance Metric (1 metric):\n")

# Model evaluation using ROC AUC
y_pred_proba = logit_model.predict(X_test_const)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc}")

print("-----------------------------------------------------------------")
print("\n")
print("B. Model misspecification metrics:\n")

# Hosmer-Lemeshow Test: Evaluates the fit of the model across groups of
#     predicted risk, comparing observed and expected frequencies.
# AIC (Akaike's Information Criterion): Assesses model fit while penalizing
#     for the number of parameters, aiding in model selection.
# BIC (Bayesian Information Criterion): Similar to AIC but with a stronger
#     penalty for model complexity, useful in model selection.
# Link Test: Diagnoses misspecification of the functional form, checking if
#     the model is correctly specified.

# Hosmer-Lemeshow is evaluated on the held-out rows; AIC/BIC come from the
# training fit.
hl_stat, p_value = hosmer_lemeshow_test(y_test, y_pred_proba)
print(f"\t2) Hosmer-Lemeshow Test Statistic: {hl_stat}, P-value: {p_value}\n")
print(f"\t3) AIC: {logit_model.aic}, \n\t4) BIC: {logit_model.bic}\n")

# Use the function for the Link Test
link_test_result = link_test_with_vif_reduction(logit_model, X_test, y_test)
print(f"\n\t5) Link Test Result: {link_test_result}")  # Print the link_test_result)

print("-----------------------------------------------------------------")



-----------------------------------------------------------------


Dmochowski DATASET 2: ADULT


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.38432559688086076
            Iterations: 43
            Function evaluations: 43
            Gradient evaluations: 43
                           Logit Regression Results                           
Dep. Variable:         income_encoded   No. Observations:                26048
Model:                          Logit   Df Residuals:                    26033
Method:                           MLE   Df Model:                           14
Date:                Wed, 24 Apr 2024   Pseudo R-squ.:                  0.3036
Time:                        09:28:00   Log-Likelihood:                -10011.
converged:                       True   LL-Null:                       -14376.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|     

#3. Haberman

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
import statsmodels.api as sm
import warnings
from scipy.stats import chi2
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Dataset 3 (HABERMAN): load, binarize the outcome, fit a logit on
# standardized features, then report summary statistics, ROC AUC and the
# misspecification diagnostics.
print("-----------------------------------------------------------------")
print("\n")
print("Dmochowski DATASET 3: HABERMAN")
print("\n")

# Load the Haberman dataset
data_path_haberman = '/content/haberman.data'
columns_haberman = ['age', 'operation_year', 'positive_nodes', 'survival_status']
data_haberman = pd.read_csv(data_path_haberman, header=None, names=columns_haberman)

# Convert survival status to binary (1: survived 5 years or longer, 2: died within 5 years)
# Assuming '1' is survival, '2' is death, and we need '1' as survived (positive class) and '0' as died
# Vectorized comparison: 1 stays 1, every other value becomes 0.
data_haberman['survival_status'] = (data_haberman['survival_status'] == 1).astype(int)

# Encode categorical variables if needed, here only 'operation_year' might be considered if too varied
# For simplicity, we treat all features as numeric in this case

X_raw = data_haberman.drop('survival_status', axis=1)
y = data_haberman['survival_status']

# Split BEFORE scaling so the scaler's mean/variance come from training rows
# only and no information from the test set leaks into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# has_constant='add' guarantees an intercept column in both matrices, even if
# some feature happens to be constant within a split.
X_train_const = sm.add_constant(X_train, has_constant='add')
X_test_const = sm.add_constant(X_test, has_constant='add')

# Logistic Regression Model
warnings.simplefilter('ignore', ConvergenceWarning)
try:
    logit_model = sm.Logit(y_train, X_train_const).fit()
except Exception as e:
    print(f"An error occurred: {e}")
    # Everything below needs the fitted model; continuing would either fail
    # with a NameError or silently reuse a model left over from another cell.
    raise
print(logit_model.summary())

#Summary Statistics
print("-----------------------------------------------------------------")
print("\n")
print("General Summary Statistics:")
print(summary_statistics(data_haberman))
print("\n")

print("-----------------------------------------------------------------")
print("\n")

print("A. Overall performance Metric (1 metric):\n")

# Model evaluation using ROC AUC
y_pred_proba = logit_model.predict(X_test_const)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc}")

print("-----------------------------------------------------------------")
print("\n")
print("B. Model misspecification metrics:\n")

# Hosmer-Lemeshow Test: Evaluates the fit of the model across groups of
#     predicted risk, comparing observed and expected frequencies.
# AIC (Akaike's Information Criterion): Assesses model fit while penalizing
#     for the number of parameters, aiding in model selection.
# BIC (Bayesian Information Criterion): Similar to AIC but with a stronger
#     penalty for model complexity, useful in model selection.
# Link Test: Diagnoses misspecification of the functional form, checking if
#     the model is correctly specified.

# Hosmer-Lemeshow is evaluated on the held-out rows; AIC/BIC come from the
# training fit.
hl_stat, p_value = hosmer_lemeshow_test(y_test, y_pred_proba)
print(f"\t2) Hosmer-Lemeshow Test Statistic: {hl_stat}, P-value: {p_value}\n")
print(f"\t3) AIC: {logit_model.aic}, \n\t4) BIC: {logit_model.bic}\n")

# Use the function for the Link Test
link_test_result = link_test_with_vif_reduction(logit_model, X_test, y_test)
print(f"\n\t5) Link Test Result: {link_test_result}")  # Print the link_test_result)

print("-----------------------------------------------------------------")

-----------------------------------------------------------------


Dmochowski DATASET 3: HABERMAN


Optimization terminated successfully.
         Current function value: 0.514452
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:        survival_status   No. Observations:                  244
Model:                          Logit   Df Residuals:                      240
Method:                           MLE   Df Model:                            3
Date:                Wed, 24 Apr 2024   Pseudo R-squ.:                 0.09929
Time:                        09:28:21   Log-Likelihood:                -125.53
converged:                       True   LL-Null:                       -139.36
Covariance Type:            nonrobust   LLR p-value:                 4.250e-06
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const  

#4. Transfusion

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import warnings
from scipy.stats import chi2
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Dataset 4 (TRANSFUSION): load, fit a logit on standardized features, then
# report summary statistics, ROC AUC and the misspecification diagnostics.
print("-----------------------------------------------------------------")
print("\n")
print("Dmochowski DATASET 4: TRANSFUSION")
print("\n")

# Load the transfusion dataset
data_path_transfusion = '/content/transfusion.data'
data_transfusion = pd.read_csv(data_path_transfusion)

# Check the first few rows to understand the dataset structure
print(data_transfusion.head())

# Assume the target variable is the last column and all others are features
# NOTE(review): in the rows printed by head(), "Monetary" equals 250 x
# "Frequency". If that holds for every row the two columns are perfectly
# collinear, which would explain a divide-by-zero warning from the VIF
# computation and makes their individual coefficients unidentifiable -
# consider dropping one of them.
X = data_transfusion.iloc[:, :-1]
y = data_transfusion.iloc[:, -1]

# Split BEFORE scaling so the scaler's mean/variance come from training rows
# only and no information from the test set leaks into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Adding a constant column for statsmodels
# has_constant='add' guarantees an intercept column in both matrices, even if
# some feature happens to be constant within a split.
X_train_const = sm.add_constant(X_train, has_constant='add')
X_test_const = sm.add_constant(X_test, has_constant='add')

# Fit the logistic regression model using statsmodels
warnings.simplefilter('ignore', ConvergenceWarning)
try:
    logit_model = sm.Logit(y_train, X_train_const).fit()
except Exception as e:
    print(f"An error occurred: {e}")
    # Everything below needs the fitted model; continuing would either fail
    # with a NameError or silently reuse a model left over from another cell.
    raise
print(logit_model.summary())


#Summary Statistics
print("-----------------------------------------------------------------")
print("\n")
print("General Summary Statistics:")
print(summary_statistics(data_transfusion))
print("\n")

print("-----------------------------------------------------------------")
print("\n")

print("A. Overall performance Metric (1 metric):\n")

# Model evaluation using ROC AUC (reuses the test design matrix built above)
y_pred_proba = logit_model.predict(X_test_const)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc}")

print("-----------------------------------------------------------------")
print("\n")
print("B. Model misspecification metrics:\n")

# Hosmer-Lemeshow Test: Evaluates the fit of the model across groups of
#     predicted risk, comparing observed and expected frequencies.
# AIC (Akaike's Information Criterion): Assesses model fit while penalizing
#     for the number of parameters, aiding in model selection.
# BIC (Bayesian Information Criterion): Similar to AIC but with a stronger
#     penalty for model complexity, useful in model selection.
# Link Test: Diagnoses misspecification of the functional form, checking if
#     the model is correctly specified.

# Hosmer-Lemeshow is evaluated on the held-out rows; AIC/BIC come from the
# training fit.
hl_stat, p_value = hosmer_lemeshow_test(y_test, y_pred_proba)
print(f"\t2) Hosmer-Lemeshow Test Statistic: {hl_stat}, P-value: {p_value}\n")
print(f"\t3) AIC: {logit_model.aic}, \n\t4) BIC: {logit_model.bic}\n")

# Use the function for the Link Test
link_test_result = link_test_with_vif_reduction(logit_model, X_test, y_test)
print(f"\n\t5) Link Test Result: {link_test_result}")  # Print the link_test_result)

print("-----------------------------------------------------------------")

-----------------------------------------------------------------


Dmochowski DATASET 4: TRANSFUSION


   Recency (months)  Frequency (times)  Monetary (c.c. blood)  Time (months)  \
0                 2                 50                  12500             98   
1                 0                 13                   3250             28   
2                 1                 16                   4000             35   
3                 2                 20                   5000             45   
4                 1                 24                   6000             77   

   whether he/she donated blood in March 2007  
0                                           1  
1                                           1  
2                                           1  
3                                           1  
4                                           0  
Optimization terminated successfully.
         Current function value: 0.470528
         Iterations 11
                        

  vif = 1. / (1. - r_squared_i)
