In [88]:
import os
import numpy as np
import pandas as pd
import joblib
from functools import reduce
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import csv

from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
#from skmultilearn.model_selection import MultiLabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [89]:
# Load every pre-computed feature table and choose which ones to evaluate.
#
# Every table is bound to a module-level name starting with `feat_`; the
# evaluation cells recover a table's label by searching the namespace for that
# prefix, so helper names introduced here must NOT start with `feat_`.

# Directory that holds the feature files (relative to this notebook).
FEATURES_DIR = '../features'


def _read_feature(filename):
    """Load one feature table from FEATURES_DIR.

    Parameters
    ----------
    filename : str
        File name inside FEATURES_DIR. The extension selects the reader:
        '.pkl' -> pd.read_pickle, '.csv' -> pd.read_csv.

    Returns
    -------
    The object produced by the selected pandas reader.

    Raises
    ------
    ValueError
        If the extension is neither '.pkl' nor '.csv'.
    """
    path = f'{FEATURES_DIR}/{filename}'
    if filename.endswith('.pkl'):
        # CAUTION: unpickling can execute arbitrary code; only load files
        # produced by this project's own feature-building step.
        return pd.read_pickle(path)
    if filename.endswith('.csv'):
        return pd.read_csv(path)
    raise ValueError(f'Unsupported feature file type: {filename}')


# breed
feat_breed = _read_feature('breed.pkl')
feat_breed_v2 = _read_feature('breed_v2.pkl')

feat_breed_top50 = _read_feature('breed_top50.pkl')
feat_breed_top50_v2 = _read_feature('breed_top50_v2.pkl')

feat_breed_group = _read_feature('breed_group.pkl')
feat_sub_breed = _read_feature('sub_breed.pkl')
feat_breed_type = _read_feature('breed_type.pkl')

feat_breed_pure_or_mix = _read_feature('breed_pure_or_mix.pkl')

# age
feat_age = _read_feature('age_with_id.csv')

# sex
feat_sex = _read_feature('one_hot_encoded_sex_with_id.csv')

# climate
# Legacy one-hot climate tables, superseded by the hot/cold month tables below:
# feat_HotWheater = pd.read_csv('../features/one_hot_encoded_HotWheater_with_id.csv')
# feat_ModerateWheather = pd.read_csv('../features/one_hot_encoded_ModerateWheather_with_id.csv')
# feat_ColdWheater_with_id = pd.read_csv('../features/one_hot_encoded_ColdWheater_with_id.csv')
feat_cold_month = _read_feature('ColdMonths.pkl')
feat_hot_month = _read_feature('HotMonths.pkl')

# diet
feat_diet = _read_feature('diet.pkl')

# physical activity
feat_pa_total_hours = _read_feature('PhysicalActivity_total_hours.csv')
feat_pa_surface = _read_feature('PhysicalActivity_surface.csv')
feat_pa_wheather = _read_feature('PhysicalActivity_wheather.csv')

# owner demographics
feat_od_income = _read_feature('od_income.pkl')

# residential
feat_prim_census_division = _read_feature('primary_residence_census_division.pkl')

# disease
#feat_disease_input = pd.read_csv('../features/one_hot_encoded_disease_input.csv')
feat_disease_output_binary = _read_feature('disease_output_binary.csv')
feat_disease_output = _read_feature('disease_output.csv')

# age_condition
feat_age_condition = _read_feature('age_condition.pkl')
feat_age_condition_type = _read_feature('age_condition_type.pkl')


# Feature tables to evaluate. Commented-out entries are loaded above but
# deliberately excluded from this run; uncomment to include them.
features_list = [

    # breed
    #feat_breed,
    #feat_breed_v2,
    #feat_breed_top50,
    feat_breed_top50_v2,
    feat_breed_group,
    feat_sub_breed,
    feat_breed_type,
    feat_breed_pure_or_mix,

    # age
    feat_age,   # 24881 dog_ids

    # sex
    feat_sex,

    # diet
    feat_diet,  # 33141 dog_ids for df_diet_consistency

    # old climate
    #feat_HotWheater,
    #feat_ModerateWheather,
    #feat_ColdWheater_with_id,

    # climate
    feat_hot_month,
    feat_cold_month,

    # physical activity
    feat_pa_surface,
    feat_pa_total_hours,
    feat_pa_wheather,   # 26406 dog_ids

    # owner demographics
    feat_od_income,    # 29096 dog_ids

    # residential
    feat_prim_census_division,

    # disease
    #feat_disease_input,
    #feat_disease_output_binary,
    #feat_disease_output
]

In [94]:
# Sanity check: how many feature tables are currently selected in features_list.
len(features_list)

15

In [98]:
# Evaluate every feature table in `features_list` on its own.
#
# For each table: merge it with the binary disease labels on `dog_id`, min-max
# scale the known continuous columns, fit a one-vs-rest Multinomial Naive Bayes
# model under multilabel-stratified K-fold cross-validation, and record an
# average AUC. The per-table results are written to a CSV file at the end.

# --- Loop-invariant configuration ------------------------------------------

# Label columns: one binary column per health-condition category.
y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]

# Columns to rescale with MinMaxScaler when present in the merged frame.
# Each name appears once: a repeated name would make pandas select and
# re-assign the same column twice.
features_to_normalize = [
    'age_diagnosis_years',
    'df_diet_consistency',
    'df_feedings_per_day',
    'total_active_hours_y',
    'pa_hot_weather_months_per_year',
    'pa_cold_weather_months_per_year',
    'od_annual_income_range_usd',
]

n_splits = 5  # You can adjust the number of splits as needed

# One {'Features', 'Overall Average AUC'} record per evaluated feature table.
overall_auc_results = []

for features in features_list:
    # List of DataFrames to be merged (add more DataFrames as needed).
    list_input_features = [features]

    # Recover the variable name(s) bound to this frame. This relies on each
    # frame being referenced by module-level name(s) starting with 'feat_'.
    feature_names = [var_name for var_name, var_value in locals().items() if var_value is features and var_name.startswith('feat_')]
    print(feature_names)

    # Merge DataFrames iteratively using reduce (a single frame passes through unchanged).
    input_features = reduce(lambda left, right: pd.merge(left, right, on='dog_id'), list_input_features)

    # Attach the disease labels; the default inner join keeps only dogs present in both.
    data = pd.merge(input_features, feat_disease_output_binary, on='dog_id')

    # Rescale whichever of the configured columns this feature table provides.
    # NOTE(review): the scaler is fit on all rows before the CV split, so the
    # validation rows influence the scaling range. Fitting it per fold would
    # avoid that, but it would change the reported numbers -- confirm before changing.
    features_to_normalize_in_data = [feature for feature in features_to_normalize if feature in data.columns]
    scaler = MinMaxScaler()
    if features_to_normalize_in_data:
        data[features_to_normalize_in_data] = scaler.fit_transform(data[features_to_normalize_in_data])

    # Separate features and labels.
    X = data.drop(['dog_id'] + y_columns, axis=1)
    y = data[y_columns]

    # Convert y to a boolean indicator matrix.
    y_binary = (y == 1)

    # One independent Multinomial Naive Bayes classifier per condition.
    nb_model = MultinomialNB()
    ovr_classifier = OneVsRestClassifier(nb_model)

    ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-condition AUCs and positive-sample counts, one entry per fold.
    auc_scores_per_condition = {condition: [] for condition in y_columns}
    sample_counts_per_condition = {condition: [] for condition in y_columns}

    # AUC over all (sample, condition) pairs pooled together, one entry per fold.
    auc_scores_per_fold = {f'Fold_{fold_number}': [] for fold_number in range(1, n_splits + 1)}

    for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]

        # Train the model and score the validation fold.
        ovr_classifier.fit(X_train, y_train)
        y_pred_proba = ovr_classifier.predict_proba(X_val)

        # Pooled AUC for this fold (labels and probabilities flattened together).
        fold_auc = roc_auc_score(y_val.values.ravel(), y_pred_proba.ravel())
        auc_scores_per_fold[f'Fold_{fold+1}'].append(fold_auc)

        # Per-condition AUC and positive count. Probability column k matches
        # y_columns[k] because y was built as data[y_columns].
        for column_index, condition in enumerate(y_columns):
            condition_auc = roc_auc_score(y_val[condition], y_pred_proba[:, column_index])
            auc_scores_per_condition[condition].append(condition_auc)
            sample_counts_per_condition[condition].append(y_val[condition].sum())

    # Plain mean over folds for each condition.
    average_auc_per_condition = {
        condition: sum(condition_aucs) / len(condition_aucs)
        for condition, condition_aucs in auc_scores_per_condition.items()
    }

    # Overall score: for each condition, average the fold AUCs weighted by the
    # fold's positive-sample count; then take the unweighted mean over conditions.
    overall_average_auc_sum = 0
    for condition in y_columns:
        condition_aucs = auc_scores_per_condition[condition]
        condition_counts = sample_counts_per_condition[condition]
        weighted_sum = sum(auc * sample_count for auc, sample_count in zip(condition_aucs, condition_counts))
        overall_average_auc_sum += weighted_sum / sum(condition_counts)

    overall_average_auc = overall_average_auc_sum / len(auc_scores_per_condition)  # divide by the number of conditions

    # Average AUC-score per condition
    print("\n Average AUC Score per condition:")
    for condition, avg_auc in average_auc_per_condition.items():
        print(f"{condition}: {avg_auc}")

    # Pooled AUC per fold. The dictionary key already carries the fold label,
    # and the stored list is reduced to a single number for display.
    print("\nAverage AUC Scores per Fold:")
    for fold_name, fold_scores in auc_scores_per_fold.items():
        print(f"{fold_name}: {sum(fold_scores) / len(fold_scores)}")

    # Spread of the per-condition averages. Kept in the namespace for
    # inspection; it is not printed or written to the CSV.
    average_auc_list = list(average_auc_per_condition.values())
    overall_auc_std = np.std(average_auc_list)

    # Print the weighted average AUC as a percentage.
    print("\nWeighted Average AUC Score:")
    print(f"{overall_average_auc * 100:.2f}%")
    print('===============================================\n')

    # Record the overall score together with the feature table's name(s).
    overall_auc_results.append({
        'Features': ', '.join(feature_names),
        'Overall Average AUC': overall_average_auc * 100
    })


# Save the overall average AUC scores and corresponding feature names to a CSV file
output_file_path = 'overall_average_auc_results.csv'
with open(output_file_path, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Features', 'Overall Average AUC'])
    writer.writeheader()
    writer.writerows(overall_auc_results)

print(f"\nOverall Average AUC Scores saved to {output_file_path}")

['feat_breed_top50_v2']

 Average AUC Score per condition:
hs_health_conditions_eye: 0.5814961589996709
hs_health_conditions_ear: 0.5853358463288293
hs_health_conditions_oral: 0.6418923034427354
hs_health_conditions_skin: 0.5537811674440208
hs_health_conditions_cardiac: 0.6664620336965638
hs_health_conditions_respiratory: 0.6418506351191955
hs_health_conditions_gastrointestinal: 0.5404492005672783
hs_health_conditions_liver: 0.6142479880182784
hs_health_conditions_kidney: 0.5491285602606133
hs_health_conditions_reproductive: 0.5820749299525023
hs_health_conditions_orthopedic: 0.5577331811022267
hs_health_conditions_neurological: 0.5238306634290589
hs_health_conditions_endocrine: 0.5473806512970738
hs_health_conditions_hematologic: 0.5366997249989633
hs_health_conditions_immune: 0.5526231780763766
hs_health_conditions_infectious_disease: 0.5599102968931806
hs_health_conditions_toxin_consumption: 0.5453007096162648
hs_health_conditions_trauma: 0.5825737840494739
hs_health_conditions_canc

In [87]:
# For every selected feature table, list the namespace variable(s) that start
# with 'feat_' and are bound to that exact object, then print them.
for features in features_list:
    feature_names = [
        candidate_name
        for candidate_name, candidate_value in locals().items()
        if candidate_name.startswith('feat_') and candidate_value is features
    ]

    print(feature_names)

['feat_breed_top50_v2']
['feat_breed_group']
['feat_sub_breed']
['feat_breed_type']
['feat_breed_pure_or_mix']
['feat_age']
['feat_sex']
['feat_diet']
['feat_hot_month']
['feat_cold_month']
['feat_pa_surface']
['feat_pa_total_hours']
['feat_pa_wheather']
['feat_od_income']
['feat_prim_census_division']
