In [1]:
import pandas as pd
import numpy as np

# Function to normalize based on category bounds
def normalize_category(value, bounds):
    """Convert a metric value into a stepped score using category bounds.

    The bounds are read from best (``bounds[0]``) to worst (``bounds[-1]``);
    the callers in this file pass them in descending order, for which the
    result lies between 0.0 and 1.0.

    Args:
        value: The metric value to score.
        bounds: Sequence of category thresholds.

    Returns:
        1.0 when ``value >= bounds[0]``, 0.0 when ``value <= bounds[-1]``.
        Otherwise ``(len(bounds) - 1 - i) / (len(bounds) - 1)``, where ``i``
        is the index of the first bound strictly smaller than ``value``.
        0.0 is also returned when no bound compares smaller (e.g. NaN).
    """
    if value >= bounds[0]:
        return 1.0
    if value <= bounds[-1]:
        return 0.0

    steps = len(bounds) - 1
    # Position of the first threshold the value strictly exceeds; None when
    # every comparison fails.
    first_exceeded = next(
        (
            position
            for position, threshold in enumerate(bounds)
            if value > threshold
        ),
        None,
    )
    if first_exceeded is None:
        return 0.0
    return (steps - first_exceeded) / steps

# Category thresholds, listed from best to worst, for the Hosmer-Lemeshow
# and Link Test p-values and for the pseudo R-squared.
hosmer_lemeshow_bounds = [0.05, 0.01, 0.001, 0.0001, 0]
link_test_bounds = [0.05, 0.01, 0.001, 0.0001, 0]
pseudo_r_squared_bounds = [0.4, 0.3, 0.2, 0.1, 0]

# Raw fit metrics; position k in every list belongs to data['Dataset'][k].
data = {
    'Dataset': ['MAGIC', 'ADULT', 'HABERMAN', 'TRANSFUSION'],
    'Hosmer-Lemeshow P-value': [
        0.008555023891818947,
        0.018650468284383104,
        0.19664262393908305,
        0.9180867891329492,
    ],
    'BIC': [
        14075.734568215932,
        20174.341739137355,
        273.0410347735347,
        588.3260145415767,
    ],
    'Pseudo R-squared': [0.2918, 0.3036, 0.09929, 0.1385],
    'Link Test P-value': [5.485e-160, 1.735e-19, 0.3449, 0.0001366],
}

# One row per dataset
df = pd.DataFrame(data)

# Stepped scores for the two p-values, each against its own bounds
df['Normalized Hosmer-Lemeshow'] = df['Hosmer-Lemeshow P-value'].map(
    lambda p_value: normalize_category(p_value, hosmer_lemeshow_bounds)
)
df['Normalized Link Test'] = df['Link Test P-value'].map(
    lambda p_value: normalize_category(p_value, link_test_bounds)
)

# Non-positive pseudo R-squared values score 0 without consulting the bounds
df['Normalized Pseudo R-squared'] = df['Pseudo R-squared'].map(
    lambda r_squared: (
        normalize_category(r_squared, pseudo_r_squared_bounds)
        if r_squared > 0
        else 0
    )
)

# BIC is min-max scaled and inverted: the smallest BIC scores 1, the largest 0
bic = df['BIC']
bic_lowest = bic.min()
bic_highest = bic.max()
df['Normalized BIC'] = 1 - (bic - bic_lowest) / (bic_highest - bic_lowest)

# Weight applied to each normalized column; the dict order fixes both the
# column order fed to the dot product and the order of the weight vector.
metric_weights = {
    'Normalized Hosmer-Lemeshow': 1,
    'Normalized BIC': 0.9,
    'Normalized Pseudo R-squared': 0.4,
    'Normalized Link Test': 0.2,
}
weights = np.array(list(metric_weights.values()))
total_weight = weights.sum()

# Weighted average of the normalized metrics, expressed on a 0-100 scale
weighted_sum = df[list(metric_weights)].dot(weights)
df['Overall Model Specification Score'] = (weighted_sum / total_weight) * 100

# Best-scoring dataset first
df = df.sort_values(by='Overall Model Specification Score', ascending=False)

# Define the categories based on the scores
def categorize(score):
    """Return the specification label for a numeric score.

    Args:
        score: The overall model specification score.

    Returns:
        'Very well specified' for scores above 80, 'Well Specified' above 60,
        'Average' above 40, 'Misspecified' above 15, and
        'Badly misspecified' for everything else.
    """
    # (exclusive lower limit, label), checked from the highest band down so
    # the first limit the score exceeds decides the label.
    bands = (
        (80, 'Very well specified'),
        (60, 'Well Specified'),
        (40, 'Average'),
        (15, 'Misspecified'),
    )
    for lower_limit, label in bands:
        if score > lower_limit:
            return label
    return 'Badly misspecified'

# Label every dataset with the category that matches its overall score
df['Category'] = df['Overall Model Specification Score'].map(categorize)

# Per-dataset notes on what each metric says about model misspecification
explanations = {
    'MAGIC': [
        'Low Hosmer-Lemeshow P-value indicates poor fit, suggesting possible model misspecification.',
        'Highly significant Link Test P-value indicates that the model might be omitting important predictors or using an incorrect functional form.',
        'Pseudo R-squared value indicates moderate explanatory power but still under the acceptable threshold for a good fit.',
        'Relatively high BIC score indicates potential overfitting or excessive complexity.'
    ],
    'ADULT': [
        'Low Hosmer-Lemeshow P-value suggests a poor fit, indicative of model misspecification.',
        'Significant Link Test P-value implies potential errors in model form or omitted variables.',
        'Pseudo R-squared indicates moderate explanatory power, which is generally acceptable but not ideal.',
        'High BIC score suggests significant overfitting or unnecessary complexity, indicating poor model specification.'
    ],
    'HABERMAN': [
        'Higher Hosmer-Lemeshow P-value indicates a better fit compared to other datasets, suggesting lesser misspecification.',
        'Non-significant Link Test P-value implies that the model form may be appropriate for the data.',
        'Low Pseudo R-squared value suggests limited explanatory power, indicating potential underfitting.',
        'Low BIC score suggests a more efficient model in terms of simplicity and fit.'
    ],
    'TRANSFUSION': [
        'High Hosmer-Lemeshow P-value indicates a good fit, suggesting minimal model misspecification.',
        'Significant Link Test P-value indicates potential errors in model specification.',
        'Pseudo R-squared is relatively low, indicating that the model explains only a small portion of the variance in the outcome.',
        'Moderate BIC score indicates a reasonable balance between model complexity and fit.'
    ]
}

# Console overview: dataset, score and category
overview_columns = ['Dataset', 'Overall Model Specification Score', 'Category']
print(df[overview_columns])

# Summary table indexed by dataset: one score column plus one column per
# category; only the cell under a dataset's own category receives text.
category_labels = [
    'Badly misspecified',
    'Misspecified',
    'Average',
    'Well Specified',
    'Very well specified',
]
category_explanations = pd.DataFrame(
    index=df['Dataset'],
    columns=['Model Specification Score'] + category_labels,
)

# Copy the scores positionally (the index was built from df in the same row
# order), then render them as text with two decimal places.
category_explanations['Model Specification Score'] = (
    df['Overall Model Specification Score'].to_numpy()
)
category_explanations['Model Specification Score'] = (
    category_explanations['Model Specification Score'].map('{:.2f}'.format)
)

# Put each dataset's explanations, one per line, under its category column
for dataset, category in zip(df['Dataset'], df['Category']):
    explanations_list = explanations[dataset]
    category_explanations.at[dataset, category] = "\n".join(explanations_list)

# Write the summary table to CSV and report where it was saved
output_path = '/content/dataset_categorization.csv'
category_explanations.to_csv(output_path)
print(output_path)


       Dataset  Overall Model Specification Score             Category
2     HABERMAN                          84.000000  Very well specified
3  TRANSFUSION                          81.429672  Very well specified
1        ADULT                          42.000000              Average
0        MAGIC                          39.031935         Misspecified
/content/dataset_categorization.csv
