Average datasets

## Calibrated actual datasets

In [9]:
import pandas as pd
import numpy as np

# Map a metric onto a 0-1 scale using descending category bounds
def normalize_category(value, bounds):
    """Score ``value`` against a descending list of category thresholds.

    ``bounds`` runs from the best threshold (first element) to the worst
    (last element).  A value at or above the first bound scores 1.0 and a
    value at or below the last bound scores 0.0.  Any other value is ranked
    by the first bound it strictly exceeds: if that bound sits at index
    ``i``, the score is ``(len(bounds) - 1 - i) / (len(bounds) - 1)``.

    A value that compares false against every bound (e.g. NaN) scores 0.0.
    """
    if value >= bounds[0]:
        return 1.0
    if value <= bounds[-1]:
        return 0.0

    steps = len(bounds) - 1
    # Index of the first threshold the value strictly exceeds, if there is one.
    rank = next((idx for idx, edge in enumerate(bounds) if value > edge), None)
    return 0.0 if rank is None else (steps - rank) / steps

# Descending category thresholds used to bucket each metric:
# p-value cut-offs for the Hosmer-Lemeshow and Link tests, and
# pseudo R-squared cut-offs.
hosmer_lemeshow_bounds = [0.05, 0.01, 0.001, 0.0001, 0]
link_test_bounds = [0.05, 0.01, 0.001, 0.0001, 0]
pseudo_r_squared_bounds = [0.4, 0.3, 0.2, 0.1, 0]

# One record per dataset, fields in the same order as `_columns`
_columns = ['Dataset', 'Hosmer-Lemeshow P-value', 'BIC', 'Pseudo R-squared', 'Link Test P-value']
_records = [
    ('Credit Card Fraud 2', 0.0, 12969.707112923728, 0.7159, 1.000),
    ('Vehicle Insurance Fraud', 0.7971637749060193, 4893.057, 0.1565, 1.219e-24),
    ('Credit Card Fraud 1', 0.0, 21340.899, 0.5552, 0.0),
    ('E-commerce Customer Churn Prediction', 2.9949334678258666e-05, 2358.939, 0.6282, 5.485e-160),
    ('Bank Customer Churn Prediction', 0.0023718870447108076, 7009.424, 0.1508, 7.398e-64),
    ('Telecom Customer Churn Prediction', 0.026343609988660255, 4157.111, 0.4102, 1.502e-113),
]

# Transpose the records into the column-oriented dictionary of lists
data = {name: list(values) for name, values in zip(_columns, zip(*_records))}

# Tabulate the metrics, one row per dataset
df = pd.DataFrame(data)

# Put every diagnostic on a common 0-1 scale where 1 is best
df['Normalized Hosmer-Lemeshow'] = df['Hosmer-Lemeshow P-value'].apply(
    lambda p_value: normalize_category(p_value, hosmer_lemeshow_bounds)
)
df['Normalized Link Test'] = df['Link Test P-value'].apply(
    lambda p_value: normalize_category(p_value, link_test_bounds)
)
# Pseudo R-squared values that are not strictly positive are pinned to 0
df['Normalized Pseudo R-squared'] = df['Pseudo R-squared'].apply(
    lambda r2: normalize_category(r2, pseudo_r_squared_bounds) if r2 > 0 else 0
)
# Min-max scale BIC and invert it so the smallest BIC maps to 1
df['Normalized BIC'] = 1 - df['BIC'].sub(df['BIC'].min()).div(df['BIC'].max() - df['BIC'].min())

# Weights, positionally matched to the column list used in the dot product
# below: Hosmer-Lemeshow, BIC, Pseudo R-squared, Link Test
weights = np.array([1, 0.9, 0.4, 0.2])
total_weight = weights.sum()

# Weighted average of the normalized metrics, expressed on a 0-100 scale
df['Overall Model Specification Score'] = (
    df[['Normalized Hosmer-Lemeshow', 'Normalized BIC', 'Normalized Pseudo R-squared', 'Normalized Link Test']]
    .dot(weights)
    .div(total_weight)
    .mul(100)
)

# Rank the datasets from best to worst score (the original row labels are kept)
df.sort_values('Overall Model Specification Score', ascending=False, inplace=True)

# Translate a score into its specification label
def categorize(score):
    """Return the specification label for ``score``.

    The bands are open at the bottom and closed at the top: above 80 is
    'Very well specified', (60, 80] is 'Well Specified', (40, 60] is
    'Average', (20, 40] is 'Misspecified'.  Every other score, including
    one that compares false against all thresholds such as NaN, is
    'Badly misspecified'.
    """
    # Thresholds in descending order; the first one the score exceeds wins.
    bands = (
        (80, 'Very well specified'),
        (60, 'Well Specified'),
        (40, 'Average'),
        (20, 'Misspecified'),
    )
    for floor, label in bands:
        if score > floor:
            return label
    return 'Badly misspecified'

# Label every dataset with the category that matches its score
df['Category'] = df['Overall Model Specification Score'].map(categorize)
print(df[['Dataset', 'Overall Model Specification Score', 'Category']])

# Show the complete table, then persist it (row labels included) as CSV
print(df)
output_path_0 = '/content/short_overview_datasets_ranking.csv'
df.to_csv(output_path_0)

# Report where the CSV was written
print(output_path_0)

# Free-text commentary for each dataset, keyed by dataset name.  Every entry
# holds four sentences, in this order: Hosmer-Lemeshow test, BIC,
# Pseudo R-squared, Link Test.  The sentences are written verbatim to the
# categorization CSV, so the possessive is spelled with an apostrophe
# ("model's") rather than a stray double quote.
explanations = {
    'Credit Card Fraud 2': [
        'The Hosmer-Lemeshow test value of 0.0 indicates extremely poor model calibration.',
        'A BIC value of 12,969.71 suggests significant inefficiency and likely model overfitting.',
        'A relatively high value of 0.7159 suggests that the model has strong explanatory power despite other indicators of poor specification.',
        'The Link Test p-value of 1.000 implies a correctly specified model under this metric alone, contradicting other indicators.'
    ],
    'Vehicle Insurance Fraud': [
        'A value of 0.797 suggests good model fit and calibration across different deciles.',
        'The BIC value of 4,893.057 indicates a balance between model fit and complexity.',
        'A low value of 0.1565 suggests the model explains only a small variance of the dependent variable.',
        'The extremely low Link Test p-value (1.219e-24) indicates potential misspecification issues.'
    ],
    'Credit Card Fraud 1': [
        'A p-value of 0.0 signals a poor fit, showing significant discrepancies between expected and observed values.',
        'A very high BIC of 21,340.899 suggests considerable model inefficiency and probable overfitting.',
        "The model's Pseudo R-squared of 0.5552 indicates moderate explanatory power.",
        "A Link Test p-value of 0.0 indicates the model's functional form may be misspecified."
    ],
    'E-commerce Customer Churn Prediction': [
        'A very low value (2.994e-05) indicates a poor fit.',
        'A relatively low BIC of 2,358.939 suggests a reasonable balance between simplicity and fit.',
        'The Pseudo R-squared value of 0.6282 suggests a relatively good explanation of variance by the model.',
        "The extremely low p-value (5.485e-160) indicates potential issues with the model's functional form."
    ],
    'Bank Customer Churn Prediction': [
        'A p-value of 0.0024 suggests discrepancies in model calibration.',
        'The BIC value of 7,009.424 indicates potential for improvement in model fit or complexity.',
        'A Pseudo R-squared of 0.1508 shows limited model effectiveness in explaining the variance.',
        'The p-value of 7.398e-64 points to potential misspecification.'
    ],
    'Telecom Customer Churn Prediction': [
        'A value of 0.0263 suggests poor fit with significant calibration issues.',
        'A BIC value of 4,157.111 points to a reasonable fit with some room for improvement.',
        'A Pseudo R-squared of 0.4102 indicates adequate explanatory power.',
        "The low Link Test p-value (1.502e-113) suggests potential issues with the model's functional form."
    ]
}



# Table indexed by dataset name: one score column plus one (initially empty)
# column per category
category_explanations = pd.DataFrame(
    index=df['Dataset'],
    columns=['Model Specification Score', 'Badly misspecified', 'Misspecified', 'Average',
             'Well Specified', 'Very well specified'],
)

# Copy the scores positionally (.values sidesteps index alignment), then
# render them as strings with two decimal places
category_explanations['Model Specification Score'] = df['Overall Model Specification Score'].values
category_explanations['Model Specification Score'] = (
    category_explanations['Model Specification Score'].map('{:.2f}'.format)
)

# Place each dataset's explanation text in the column of its assigned category
for dataset, category in zip(df['Dataset'], df['Category']):
    category_explanations.at[dataset, category] = "\n".join(explanations[dataset])

# Persist the table as CSV and report where it was written
output_path = '/content/dataset_categorization.csv'
category_explanations.to_csv(output_path)
print(output_path)





                                Dataset  Overall Model Specification Score  \
5     Telecom Customer Churn Prediction                          78.589699   
1               Vehicle Insurance Fraud                          75.193950   
3  E-commerce Customer Churn Prediction                          52.000000   
4        Bank Customer Churn Prediction                          51.180181   
0                   Credit Card Fraud 2                          39.876280   
2                   Credit Card Fraud 1                          16.000000   

             Category  
5      Well Specified  
1      Well Specified  
3             Average  
4             Average  
0        Misspecified  
2  Badly misspecified  
                                Dataset  Hosmer-Lemeshow P-value  \
5     Telecom Customer Churn Prediction                 0.026344   
1               Vehicle Insurance Fraud                 0.797164   
3  E-commerce Customer Churn Prediction                 0.000030   
4        Bank

## Actual datasets