In [165]:
import pandas as pd
import os
import numpy as np
import pandas as pd
import joblib
from functools import reduce
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
#from skmultilearn.model_selection import MultiLabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [166]:
feat_breed = pd.read_pickle('../features/breed.pkl')
feat_breed_v2 = pd.read_pickle('../features/breed_v2.pkl')

feat_breed_top50 = pd.read_pickle('../features/breed_top50.pkl')
feat_breed_top50_v2 = pd.read_pickle('../features/breed_top50_v2.pkl')

feat_breed_group = pd.read_pickle('../features/breed_group.pkl')
feat_sub_breed = pd.read_pickle('../features/sub_breed.pkl')
feat_breed_type = pd.read_pickle('../features/breed_type.pkl')

feat_breed_pure_or_mix = pd.read_pickle('../features/breed_pure_or_mix.pkl')

feat_age = pd.read_csv('../features/one_hot_encoded_age_with_id.csv')
feat_sex = pd.read_csv('../features/one_hot_encoded_sex_with_id.csv')

# Climate
feat_HotWheater = pd.read_csv('../features/one_hot_encoded_HotWheater_with_id.csv')
feat_ModerateWheather = pd.read_csv('../features/one_hot_encoded_ModerateWheather_with_id.csv')
feat_ColdWheater_with_id = pd.read_csv('../features/one_hot_encoded_ColdWheater_with_id.csv')

# Physical Activity
feat_pa_total_hours = pd.read_csv('../features/PhysicalActivity_total_hours.csv')
feat_pa_surface = pd.read_csv('../features/PhysicalActivity_surface.csv')

# Owner Demographics
feat_od_income = pd.read_csv('../features/od_income.csv')

feat_disease_input = pd.read_csv('../features/one_hot_encoded_disease_input.csv')
feat_disease_output_binary = pd.read_csv('../features/disease_output_binary.csv')
feat_disease_output = pd.read_csv('../features/disease_output.csv')


features_list = [
    #feat_breed,
    #feat_breed_v2,
    #feat_breed_top50,
    feat_breed_top50_v2,
    feat_breed_group,
    feat_sub_breed,
    feat_breed_type,
    feat_breed_pure_or_mix,
    #feat_age,
    feat_sex,
    feat_HotWheater,
    feat_ModerateWheather,
    feat_ColdWheater_with_id,
    feat_pa_total_hours,
    feat_pa_surface,
    #feat_od_income,
    feat_disease_input,
    #feat_disease_output_binary,
    #feat_disease_output
]

In [167]:
exp1 = [feat_disease_input]
exp2 = [feat_breed_group]
exp3 = [feat_breed_top50_v2]
exp4 = [feat_disease_input, feat_breed_group]
exp5 = [feat_disease_input, feat_breed_group, feat_breed_top50_v2]



In [168]:
# Merge the two dataframes based on dog_id
#data = pd.merge(features_breed_group_v2, features_breed_top50_v2, on='dog_id')

# List of DataFrames to be merged
list_input_features = features_list  # Add more DataFrames as needed

# Merge DataFrames iteratively using reduce
input_features = reduce(lambda left, right: pd.merge(left, right, on='dog_id'), list_input_features)

# Merge the two dataframes based on dog_id
#data = pd.merge(feat_breed_top50_v2, feat_breed_group, on='dog_id')

# Merge with disease output feature
data = pd.merge(input_features, feat_disease_output_binary, on='dog_id')

In [169]:
# Assuming 'data' is your DataFrame
rows_with_nan = data[data.isna().any(axis=1)]
columns_with_nan = data.columns[data.isna().any()].tolist()
# Display the rows with NaN values
rows_with_nan[columns_with_nan]

In [170]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Convert y to a binary format
y_binary = (y == 1)

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Initialize MultilabelStratifiedKFold
n_splits = 5  # You can adjust the number of splits as needed
ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store AUC scores and sample counts
auc_scores_per_condition = {condition: [] for condition in y_columns}
sample_counts_per_condition = {condition: [] for condition in y_columns}

# Iterate through the splits
for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]

    # Train the model
    ovr_classifier.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_proba = ovr_classifier.predict_proba(X_val)

    # Calculate the AUC score for each disease
    auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

    # Append AUC scores and sample counts to lists
    #all_auc_scores.extend(auc_scores)
    #for i, condition in enumerate(y_columns):
    #    auc_scores_per_condition[condition].append(auc_scores)

    for i, condition in enumerate(y_columns):
        sample_count = y_val[condition].sum()  # Count of positive samples for the condition
        sample_counts_per_condition[condition].append(sample_count)

    print(f"\nFold {fold+1} AUC Scores for Diseases:")
    for i, auc_score in enumerate(auc_scores, start=1):
        print(f"{y_val.columns[i-1]}: {auc_score}")
        auc_scores_per_condition[y_val.columns[i-1]].append(auc_score)

# Calculate average AUC scores
#weighted_average_auc_overall = sum(score * count for score, count in zip(all_auc_scores, sample_counts_per_condition['eye'])) / sum(sample_counts_per_condition['eye'])
#weighted_average_auc_per_condition = {condition: sum(score * count for score, count in zip(auc_scores, sample_counts)) / sum(sample_counts) for condition, (auc_scores, sample_counts) in zip(y_columns, sample_counts_per_condition.items())}

# Print average AUC scores
#print("\nWeighted Average AUC Scores:")
#print(f"Overall: {weighted_average_auc_overall}")
#for condition, weighted_avg_auc in weighted_average_auc_per_condition.items():
#    print(f"{condition}: {weighted_avg_auc}")


Fold 1 AUC Scores for Diseases:
hs_health_conditions_eye: 0.7404535385850413
hs_health_conditions_ear: 0.8602224816554944
hs_health_conditions_oral: 0.9724062841725793
hs_health_conditions_skin: 0.8127957781440865
hs_health_conditions_cardiac: 0.9072509861150768
hs_health_conditions_respiratory: 0.7314638548922003
hs_health_conditions_gastrointestinal: 0.6454114343568014
hs_health_conditions_liver: 0.7001404953277136
hs_health_conditions_kidney: 0.6810318899663764
hs_health_conditions_reproductive: 0.6558619309346758
hs_health_conditions_orthopedic: 0.8308634563385132
hs_health_conditions_neurological: 0.7155048019420741
hs_health_conditions_endocrine: 0.7012883436941005
hs_health_conditions_hematologic: 0.6040785776190352
hs_health_conditions_immune: 0.6047640104263637
hs_health_conditions_infectious_disease: 0.7536497131021691
hs_health_conditions_toxin_consumption: 0.7588896623918888
hs_health_conditions_trauma: 0.9099180323451588
hs_health_conditions_cancer: 0.6874404276558462

Fo

In [171]:
# Calculate average AUC per condition
average_auc_per_condition = {
    condition: sum(auc_scores) / len(auc_scores) for condition, auc_scores in auc_scores_per_condition.items()
}
# Calculate overall average AUC weighted by the number of samples
overall_average_auc_sum = 0
for auc_scores, sample_counts in zip(auc_scores_per_condition.values(), sample_counts_per_condition.values()):
    weighted_sum = sum(auc * sample_count for auc, sample_count in zip(auc_scores, sample_counts))
    overall_average_auc_sum += weighted_sum / sum(sample_counts)

overall_average_auc = overall_average_auc_sum / len(auc_scores_per_condition)  # divide by the number of conditions

## AUC Scores

In [177]:
# Average AUC-score per condition
print("\n Average AUC Score per condition:")
for condition, avg_auc in average_auc_per_condition.items():
    print(f"{condition}: {avg_auc}")


 Average AUC Score per condition:
hs_health_conditions_eye: 0.7479442933194606
hs_health_conditions_ear: 0.8578817162379944
hs_health_conditions_oral: 0.9737283798478247
hs_health_conditions_skin: 0.8172674122386594
hs_health_conditions_cardiac: 0.9157908637986839
hs_health_conditions_respiratory: 0.7209108373985066
hs_health_conditions_gastrointestinal: 0.6460920878381924
hs_health_conditions_liver: 0.700859177114404
hs_health_conditions_kidney: 0.6817437528816324
hs_health_conditions_reproductive: 0.6443080535639826
hs_health_conditions_orthopedic: 0.8208160870820228
hs_health_conditions_neurological: 0.6804451720678911
hs_health_conditions_endocrine: 0.6703249155077425
hs_health_conditions_hematologic: 0.6402847300645184
hs_health_conditions_immune: 0.6182229265433373
hs_health_conditions_infectious_disease: 0.760101937824353
hs_health_conditions_toxin_consumption: 0.7500439744601305
hs_health_conditions_trauma: 0.9051799074374796
hs_health_conditions_cancer: 0.7001130427227402


In [182]:
# Get average AUC values as a list
average_auc_list = list(average_auc_per_condition.values())

# Calculate standard deviation of average AUC values
overall_auc_std = np.std(average_auc_list)

# Print average AUC scores
print("\nWeighted Average AUC Score:")
#print(f"Overall: {overall_average_auc}")
print(f"{overall_average_auc * 100:.2f}% ± {overall_auc_std * 100:.2f}%")


Weighted Average AUC Score:
75.01% ± 10.15%


# Archiv

In [70]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Train the model
ovr_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_proba = ovr_classifier.predict_proba(X_val)

# Calculate the AUC score for each disease
auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_columns)]

print("AUC Scores for Diseases:")
for i, auc_score in enumerate(auc_scores, start=1):
    print(f"{y_columns[i-1]}: {auc_score}")

AUC Scores for Diseases:
hs_health_conditions_eye: 0.608296178184405
hs_health_conditions_ear: 0.602082745434546
hs_health_conditions_oral: 0.673323984807287
hs_health_conditions_skin: 0.5737486832374212
hs_health_conditions_cardiac: 0.6757555014704391
hs_health_conditions_respiratory: 0.6597351834286939
hs_health_conditions_gastrointestinal: 0.5443517991312358
hs_health_conditions_liver: 0.6564524030823188
hs_health_conditions_kidney: 0.6286350350740452
hs_health_conditions_reproductive: 0.6277588549460594
hs_health_conditions_orthopedic: 0.5766742123729287
hs_health_conditions_neurological: 0.5726241353246092
hs_health_conditions_endocrine: 0.6105098605098606
hs_health_conditions_hematologic: 0.551430531692185
hs_health_conditions_immune: 0.6201361546965611
hs_health_conditions_infectious_disease: 0.5760720614902737
hs_health_conditions_toxin_consumption: 0.5670980079825317
hs_health_conditions_trauma: 0.6000221375634803
hs_health_conditions_cancer: 0.6025861321497534


In [71]:
model_filepath = '../models/naive_base_with_breedtop50.joblib'
joblib.dump(ovr_classifier, model_filepath)

['../models/naive_base_with_breedtop50.joblib']

In [72]:
loaded_model_filepath = '../models/naive_base_with_breedtop50.joblib'
loaded_ovr_classifier = joblib.load(loaded_model_filepath)

In [None]:
# Assuming X_val is your validation set DataFrame
# Select a random row (Datapoint) from X_val
random_index = np.random.choice(X_val.index)

In [73]:
random_data_point = X_val.loc[random_index]
random_data_point

breeds_American Pitbull Terrier              0
breeds_American Staffordshire Terrier        0
breeds_Australian Cattle Dog                 0
breeds_Australian Shepherd                   0
breeds_Basset Hound                          0
                                         ...  
other_hard                                True
grass_dirt                                True
gravel                                   False
sand                                      True
astroturf                                False
Name: 18995, Length: 126, dtype: object

In [74]:

# Reshape the data point to a 2D array
random_data_point_reshaped = random_data_point.values.reshape(1, -1)

# Make predictions on the reshaped data point using the loaded model
new_data_predictions_proba = loaded_ovr_classifier.predict_proba(random_data_point_reshaped)
new_data_predictions_proba



array([[6.71306308e-01, 4.46896673e-01, 9.48417250e-01, 3.22899883e-01,
        5.45953211e-01, 5.36113476e-01, 2.81684923e-01, 2.39306906e-01,
        2.41839342e-01, 4.15624281e-04, 3.53040030e-01, 1.71186483e-01,
        1.14720288e-01, 2.66393278e-03, 3.92847558e-03, 1.27038346e-01,
        3.37030138e-01, 1.28058841e-01, 1.19044821e-01]])

In [4]:
# Merge the two dataframes based on dog_id
data = pd.merge(feat_breed_top50_v2, feat_disease_output_binary, on='dog_id')

# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]
y

Unnamed: 0,hs_health_conditions_eye,hs_health_conditions_ear,hs_health_conditions_oral,hs_health_conditions_skin,hs_health_conditions_cardiac,hs_health_conditions_respiratory,hs_health_conditions_gastrointestinal,hs_health_conditions_liver,hs_health_conditions_kidney,hs_health_conditions_reproductive,hs_health_conditions_orthopedic,hs_health_conditions_neurological,hs_health_conditions_endocrine,hs_health_conditions_hematologic,hs_health_conditions_immune,hs_health_conditions_infectious_disease,hs_health_conditions_toxin_consumption,hs_health_conditions_trauma,hs_health_conditions_cancer
0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33167,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
33168,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33169,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
33170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [18]:
from sklearn.utils.validation import *
y_2 = check_array(y, input_name="y", ensure_2d=False, dtype=None)
if y_2.ndim == 2:
    # for multi-label y, map each distinct row to a string repr
    # using join because str(row) uses an ellipsis if len(row) > 1000
    y_2 = np.array([" ".join(row.astype("str")) for row in y_2])
print('line 144 y.shape: ', y.shape)

classes, y_indices = np.unique(y_2, return_inverse=True)
#print(classes)
n_classes = classes.shape[0]

class_counts = np.bincount(y_indices)
np.min(class_counts) < 2
np.bincount(y_indices)
y_indices

line 144 y.shape:  (33172, 19)


array([ 590,  546, 2513, ...,    8,    8, 2631])

In [20]:
# Check if the NumPy array and DataFrame are the same
if not np.array_equal(y_2, y.values):
    print("NumPy array and DataFrame are different")

    # Show the differences
    differences = y_2 - y.values
    print("Differences:")
    print(differences)
else:
    print("NumPy array and DataFrame are the same")

NumPy array and DataFrame are the same


In [7]:
list_sum = []
for column in y.columns:
    ones = y[column].sum()
    list_sum.append(ones)
list_sum

[4409,
 4306,
 9161,
 9512,
 1947,
 1170,
 4752,
 1154,
 2586,
 777,
 6329,
 1606,
 1098,
 182,
 270,
 8854,
 3645,
 9195,
 2101]

In [5]:

# Initialize StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Use the split method to get the train and test indices
for train_index, test_index in sss.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Oversample each disease separately
X_resampled_list = []
y_resampled_list = []

ros = RandomOverSampler(random_state=42)

for column in y_train.columns:
    X_resampled_disease, y_resampled_disease = ros.fit_resample(X_train, y_train[column])
    X_resampled_list.append(X_resampled_disease)
    y_resampled_list.append(y_resampled_disease)

X_resampled = pd.concat(X_resampled_list, axis=1)
y_resampled = pd.concat(y_resampled_list, axis=1)

# Train the model
ovr_classifier.fit(X_resampled, y_resampled)

# Make predictions on the validation set
y_pred_proba = ovr_classifier.predict_proba(X_val)

# Calculate the AUC score for each disease
auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

print("AUC Scores for Diseases:")
for i, auc_score in enumerate(auc_scores, start=1):
    print(f"{y_val.columns[i-1]}: {auc_score}")

y.shape: (33172, 19)
y.shape:  (33172, 19)
y.shape:  (33172, 19)
line 2132: y.shape:  (33172, 19)
line 144 y.shape:  (33172,)
y: 
 ['0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0'
 '0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0'
 '0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0' ...
 '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0'
 '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0'
 '0 1 1 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0']
y_indices: 
 [ 590  546 2513 ...    8    8 2631]


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
#for (i, auc_score), (j, auc_score_2) in zip(enumerate(auc_scores, start=1), enumerate(auc_scores_2, start=1)):
#    difference = auc_score - auc_score_2
#    print(f"{y_columns[i-1]}: auc_score={auc_score} | auc_score_2={auc_score_2} | Difference={difference}")

In [122]:
# Merge the two dataframes based on dog_id
#data = pd.merge(features_breed_group_v2, features_breed_top50_v2, on='dog_id')

# List of DataFrames to be merged
list_input_features = features_list  # Add more DataFrames as needed

# Merge DataFrames iteratively using reduce
input_features = reduce(lambda left, right: pd.merge(left, right, on='dog_id'), list_input_features)

# Merge with disease output feature
data = pd.merge(input_features, feat_disease_output_binary, on='dog_id')

In [118]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Iterate through the diseases
for disease_column in y_columns:
    print(f"\nDisease: {disease_column}")

    # Get the target variable for the current disease
    y_current_disease = data[disease_column]

    # Initialize StratifiedKFold
    n_splits = 5  # You can adjust the number of splits as needed
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Iterate through the splits
    for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X, y_current_disease)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y_current_disease.iloc[train_index], y_current_disease.iloc[val_index]

        # Train the model
        ovr_classifier.fit(X_train, y_train)

        # Make predictions on the validation set
        y_pred_proba = ovr_classifier.predict_proba(X_val)[:, 1]  # Assuming you want the probability of the positive class

        # Calculate the AUC score for the disease
        auc_score = roc_auc_score(y_val, y_pred_proba)

        print(f"Fold {fold+1} AUC Score: {auc_score}")



Disease: hs_health_conditions_eye
Fold 1 AUC Score: 0.5889365274163414
Fold 2 AUC Score: 0.5923737349239456
Fold 3 AUC Score: 0.6015350683219536
Fold 4 AUC Score: 0.5955443072647536
Fold 5 AUC Score: 0.5991581377167428

Disease: hs_health_conditions_ear
Fold 1 AUC Score: 0.585866429255166
Fold 2 AUC Score: 0.60439846604382
Fold 3 AUC Score: 0.6089949136260728
Fold 4 AUC Score: 0.5926083200626733
Fold 5 AUC Score: 0.6039063132234951

Disease: hs_health_conditions_oral
Fold 1 AUC Score: 0.6738195759463118
Fold 2 AUC Score: 0.6648677406015212
Fold 3 AUC Score: 0.664522331530314
Fold 4 AUC Score: 0.6603683788159281
Fold 5 AUC Score: 0.6505481913781567

Disease: hs_health_conditions_skin
Fold 1 AUC Score: 0.567172297039507
Fold 2 AUC Score: 0.5822176090054009
Fold 3 AUC Score: 0.5702938451207149
Fold 4 AUC Score: 0.5612502340687936
Fold 5 AUC Score: 0.5644397845921392

Disease: hs_health_conditions_cardiac
Fold 1 AUC Score: 0.6948982902828138
Fold 2 AUC Score: 0.6455586360658586
Fold 3 AUC

In [119]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Convert y to a binary format
y_binary = (y == 1)

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Initialize MultilabelStratifiedKFold
n_splits = 5  # You can adjust the number of splits as needed
ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Iterate through the splits
for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]

    # Train the model
    ovr_classifier.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_proba = ovr_classifier.predict_proba(X_val)

    # Calculate the AUC score for each disease
    auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

    print(f"\nFold {fold+1} AUC Scores for Diseases:")
    for i, auc_score in enumerate(auc_scores, start=1):
        print(f"{y_val.columns[i-1]}: {auc_score}")



Fold 1 AUC Scores for Diseases:
hs_health_conditions_eye: 0.5911374057345359
hs_health_conditions_ear: 0.5975891893278246
hs_health_conditions_oral: 0.6636215962728014
hs_health_conditions_skin: 0.5765633374012875
hs_health_conditions_cardiac: 0.659720136041126
hs_health_conditions_respiratory: 0.657287714699593
hs_health_conditions_gastrointestinal: 0.5506340041052006
hs_health_conditions_liver: 0.6392620489724077
hs_health_conditions_kidney: 0.6529590641141956
hs_health_conditions_reproductive: 0.658563047780605
hs_health_conditions_orthopedic: 0.5990176682539723
hs_health_conditions_neurological: 0.5318584877863988
hs_health_conditions_endocrine: 0.5954687021708894
hs_health_conditions_hematologic: 0.5497067448680352
hs_health_conditions_immune: 0.6451882134632272
hs_health_conditions_infectious_disease: 0.5702841001926019
hs_health_conditions_toxin_consumption: 0.5470477110827017
hs_health_conditions_trauma: 0.605545182331868
hs_health_conditions_cancer: 0.6086773006966928

Fold 2

In [120]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Convert y to a binary format
y_binary = (y == 1)

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Initialize MultilabelStratifiedKFold
n_splits = 5  # You can adjust the number of splits as needed
ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store AUC scores
all_auc_scores = []
health_condition_auc_scores = {condition: [] for condition in y_columns}

# Iterate through the splits
for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]

    # Train the model
    ovr_classifier.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_proba = ovr_classifier.predict_proba(X_val)

    # Calculate the AUC score for each disease
    auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

    # Append AUC scores to lists
    all_auc_scores.extend(auc_scores)
    for i, condition in enumerate(y_columns):
        health_condition_auc_scores[condition].append(auc_scores[i])

    print(f"\nFold {fold+1} AUC Scores for Diseases:")
    for i, auc_score in enumerate(auc_scores, start=1):
        print(f"{y_val.columns[i-1]}: {auc_score}")

# Calculate average AUC scores
average_auc_overall = sum(all_auc_scores) / len(all_auc_scores)
average_auc_per_condition = {condition: sum(scores) / len(scores) for condition, scores in health_condition_auc_scores.items()}

# Print average AUC scores
print("\nAverage AUC Scores:")
print(f"Overall: {average_auc_overall}")
for condition, avg_auc in average_auc_per_condition.items():
    print(f"{condition}: {avg_auc}")


Fold 1 AUC Scores for Diseases:
hs_health_conditions_eye: 0.5911374057345359
hs_health_conditions_ear: 0.5975891893278246
hs_health_conditions_oral: 0.6636215962728014
hs_health_conditions_skin: 0.5765633374012875
hs_health_conditions_cardiac: 0.659720136041126
hs_health_conditions_respiratory: 0.657287714699593
hs_health_conditions_gastrointestinal: 0.5506340041052006
hs_health_conditions_liver: 0.6392620489724077
hs_health_conditions_kidney: 0.6529590641141956
hs_health_conditions_reproductive: 0.658563047780605
hs_health_conditions_orthopedic: 0.5990176682539723
hs_health_conditions_neurological: 0.5318584877863988
hs_health_conditions_endocrine: 0.5954687021708894
hs_health_conditions_hematologic: 0.5497067448680352
hs_health_conditions_immune: 0.6451882134632272
hs_health_conditions_infectious_disease: 0.5702841001926019
hs_health_conditions_toxin_consumption: 0.5470477110827017
hs_health_conditions_trauma: 0.605545182331868
hs_health_conditions_cancer: 0.6086773006966928

Fold 2