In [1]:
import os
import numpy as np
import pandas as pd
import joblib
from functools import reduce
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
#from skmultilearn.model_selection import MultiLabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

In [2]:
# breed
feat_breed = pd.read_pickle('../features/breed.pkl')
feat_breed_v2 = pd.read_pickle('../features/breed_v2.pkl')

feat_breed_top50 = pd.read_pickle('../features/breed_top50.pkl')
feat_breed_top50_v2 = pd.read_pickle('../features/breed_top50_v2.pkl')

feat_breed_group = pd.read_pickle('../features/breed_group.pkl')
feat_sub_breed = pd.read_pickle('../features/sub_breed.pkl')
feat_breed_type = pd.read_pickle('../features/breed_type.pkl')

feat_breed_pure_or_mix = pd.read_pickle('../features/breed_pure_or_mix.pkl')

# age
feat_age = pd.read_csv('../features/age_with_id.csv')

# sex
feat_sex = pd.read_csv('../features/one_hot_encoded_sex_with_id.csv')

# weight
feat_weight = pd.read_pickle('../features/weight.pkl')

# Climate
# feat_HotWheater = pd.read_csv('../features/one_hot_encoded_HotWheater_with_id.csv')
# feat_ModerateWheather = pd.read_csv('../features/one_hot_encoded_ModerateWheather_with_id.csv')
# feat_ColdWheater_with_id = pd.read_csv('../features/one_hot_encoded_ColdWheater_with_id.csv')
feat_cold_month = pd.read_pickle('../features/ColdMonths.pkl')
feat_hot_month = pd.read_pickle('../features/HotMonths.pkl')

# Diet
feat_diet = pd.read_pickle('../features/diet.pkl') # 5 columns

# Physical Activity
feat_pa_total_hours = pd.read_csv('../features/PhysicalActivity_total_hours.csv')
feat_pa_surface = pd.read_csv('../features/PhysicalActivity_surface.csv')
feat_pa_wheather = pd.read_csv('../features/PhysicalActivity_wheather.csv')

# Owner Demographics
feat_od_income = pd.read_pickle('../features/od_income.pkl')
feat_od_education = pd.read_pickle('../features/od_education.pkl')

# Residentual
feat_prim_census_division = pd.read_pickle('../features/primary_residence_census_division.pkl')

# disease
#feat_disease_input = pd.read_csv('../features/one_hot_encoded_disease_input.csv')
feat_disease_output_binary = pd.read_csv('../features/disease_output_binary.csv')
feat_disease_output = pd.read_csv('../features/disease_output.csv')

# age_condition
feat_age_condition = pd.read_pickle('../features/age_condition.pkl')
feat_age_condition_type = pd.read_pickle('../features/age_condition_type.pkl')


features_list = [

    # breed
    #feat_breed,
    #feat_breed_v2,
    #feat_breed_top50,
    #feat_breed_top50_v2,
    #feat_breed_group,
    #feat_sub_breed,
    #feat_breed_type,
    #feat_breed_pure_or_mix,

    # age
    #feat_age,   # 24881 dog_ids

    # sex
    #feat_sex,

    # weight
    #feat_weight

    # diet
    #feat_diet,  # 33141 dog_ids for df_diet_consistency

    #feat_HotWheater,
    #feat_ModerateWheather,
    #feat_ColdWheater_with_id,

    # Climate
    #feat_pa_surface,
    #feat_pa_total_hours,
    #feat_pa_wheather,   # 26406 dog_ids
    #feat_hot_month,
    #feat_cold_month,

    # Physical Activity
    #feat_pa_total_hours,
    #feat_pa_surface,

    # Owner demographics
    #feat_od_income,    # 29096 dog_ids
    #feat_od_education

    # Residentual
    feat_prim_census_division,

    # disease
    #feat_disease_input,
    #feat_disease_output_binary,
    #feat_disease_output
]

feat_age_condition_type

Unnamed: 0,dog_id,age_diagnosis_years,condition_type_Bone/Orthopedic,condition_type_Brain/Neurologic,condition_type_Cardiac,condition_type_Ear/Nose/Throat,condition_type_Endocrine,condition_type_Eye,condition_type_Gastrointestinal,condition_type_Hematopoietic,...,condition_type_Kidney/Urinary,condition_type_Liver/Pancreas,condition_type_Mouth/Dental/Oral,condition_type_Other Congenital Disorder,condition_type_Reproductive,condition_type_Respiratory,condition_type_Skin,condition_type_Toxin Consumption,condition_type_Trauma,condition_type_cancer
0,100001,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,100001,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,100001,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,100001,3,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,100001,4,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910915,3373,10,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
910929,14856,10,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
910936,58327,6,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
910945,3417,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [3]:
feat_breed_top50_v2[feat_breed_top50_v2['dog_id'] == 5074]

Unnamed: 0,dog_id,breeds_American Pitbull Terrier,breeds_American Staffordshire Terrier,breeds_Australian Cattle Dog,breeds_Australian Shepherd,breeds_Basset Hound,breeds_Beagle,breeds_Bernese Mountain Dog,breeds_Bichon Frise,breeds_Border Collie,...,breeds_Poodle (Toy),breeds_Pug,breeds_Rat Terrier,breeds_Rhodesian Ridgeback,breeds_Rottweiler,breeds_Shetland Sheepdog,breeds_Shih Tzu,breeds_Siberian Husky,breeds_West Highland White Terrier,breeds_Yorkshire Terrier
18252,5074,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
feat_od_income['od_annual_income_range_usd'].value_counts().sum()

29096

In [5]:
feat_hot_month

Unnamed: 0,dog_id,pa_hot_weather_months_per_year
0,10509,4
1,74227,2
2,32487,3
3,63150,0
4,33369,2
...,...,...
33167,91849,0
33168,33783,1
33169,98862,2
33170,99462,0


In [6]:
#exp1 = [feat_disease_input]
#exp2 = [feat_breed_group]
#exp3 = [feat_breed_top50_v2]
#exp4 = [feat_disease_input, feat_breed_group]
#exp5 = [feat_disease_input, feat_breed_group, feat_breed_top50_v2]



In [4]:
# List of DataFrames to be merged
list_input_features = features_list  # Add more DataFrames as needed

# Merge DataFrames iteratively using reduce
input_features = reduce(lambda left, right: pd.merge(left, right, on='dog_id'), list_input_features)

# Merge the two dataframes based on dog_id
#data = pd.merge(feat_breed_top50_v2, feat_breed_group, on='dog_id')

# Merge with disease output feature
data = pd.merge(feat_age_condition_type, input_features, on='dog_id')
data

Unnamed: 0,dog_id,age_diagnosis_years,condition_type_Bone/Orthopedic,condition_type_Brain/Neurologic,condition_type_Cardiac,condition_type_Ear/Nose/Throat,condition_type_Endocrine,condition_type_Eye,condition_type_Gastrointestinal,condition_type_Hematopoietic,...,condition_type_cancer,oc_primary_census_division__1.0,oc_primary_census_division__2.0,oc_primary_census_division__3.0,oc_primary_census_division__4.0,oc_primary_census_division__5.0,oc_primary_census_division__6.0,oc_primary_census_division__7.0,oc_primary_census_division__8.0,oc_primary_census_division__9.0
0,100001,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,100001,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,100001,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,100001,3,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,100001,4,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306085,77040,3,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
306086,77040,4,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
306087,77040,5,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
306088,77040,6,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [8]:
data['dog_id'].value_counts()

dog_id
83756     44
64803     43
36023     43
9104      40
5074      39
          ..
103095     1
84096      1
79628      1
99055      1
95167      1
Name: count, Length: 26666, dtype: int64

In [9]:
# Check for NaN values
rows_with_nan = data[data.isna().any(axis=1)]
columns_with_nan = data.columns[data.isna().any()].tolist()
# Display the rows with NaN values
rows_with_nan[columns_with_nan]

In [10]:
# Normalize the specified features in one line and save in the same columns
features_to_normalize = ['age_diagnosis_years', 'df_diet_consistency', 'df_feedings_per_day', 'total_active_hours_y', 'pa_hot_weather_months_per_year', 'pa_cold_weather_months_per_year', 'total_active_hours_y', 'od_annual_income_range_usd']

# select features that are in data
features_to_normalize_in_data = [feature for feature in features_to_normalize if feature in data.columns]

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Normalize
if features_to_normalize_in_data:
    data[features_to_normalize_in_data] = scaler.fit_transform(data[features_to_normalize_in_data])
    print(data[features_to_normalize_in_data])

        age_diagnosis_years
0                      0.00
1                      0.04
2                      0.08
3                      0.12
4                      0.16
...                     ...
306085                 0.12
306086                 0.16
306087                 0.20
306088                 0.24
306089                 0.28

[306090 rows x 1 columns]


In [11]:
data

Unnamed: 0,dog_id,age_diagnosis_years,condition_type_Bone/Orthopedic,condition_type_Brain/Neurologic,condition_type_Cardiac,condition_type_Ear/Nose/Throat,condition_type_Endocrine,condition_type_Eye,condition_type_Gastrointestinal,condition_type_Hematopoietic,...,condition_type_cancer,oc_primary_census_division__1.0,oc_primary_census_division__2.0,oc_primary_census_division__3.0,oc_primary_census_division__4.0,oc_primary_census_division__5.0,oc_primary_census_division__6.0,oc_primary_census_division__7.0,oc_primary_census_division__8.0,oc_primary_census_division__9.0
0,100001,0.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,100001,0.04,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,100001,0.08,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,100001,0.12,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,100001,0.16,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306085,77040,0.12,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
306086,77040,0.16,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
306087,77040,0.20,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
306088,77040,0.24,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [12]:
# Separate features and labels
X = data.drop(['dog_id'] + ['condition_type_' + condition_type for condition_type in [
    'Eye', 'Ear/Nose/Throat', 'Mouth/Dental/Oral', 'Skin', 'Cardiac', 'Respiratory',
    'Gastrointestinal', 'Liver/Pancreas', 'Kidney/Urinary', 'Reproductive', 'Bone/Orthopedic',
    'Brain/Neurologic', 'Endocrine', 'Hematopoietic', 'Other Congenital Disorder',
    'Infection/Parasites', 'Toxin Consumption', 'Trauma', 'Immune-mediated', 'cancer'
]], axis=1)

y_columns = ['condition_type_' + condition_type for condition_type in [
    'Eye', 'Ear/Nose/Throat', 'Mouth/Dental/Oral', 'Skin', 'Cardiac', 'Respiratory',
    'Gastrointestinal', 'Liver/Pancreas', 'Kidney/Urinary', 'Reproductive', 'Bone/Orthopedic',
    'Brain/Neurologic', 'Endocrine', 'Hematopoietic', 'Other Congenital Disorder',
    'Infection/Parasites', 'Toxin Consumption', 'Trauma', 'Immune-mediated', 'cancer'
]]
y = data[y_columns]

# Convert y to a binary format
y_binary = (y == 1)

Modify from here

In [13]:
# Choose Model

# Initialize the Naive Bayes model
#nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
#ovr_classifier = OneVsRestClassifier(nb_model)

# Initialize the Logistic Regression model
# lr_model = LogisticRegression()

# Wrap the model with OneVsRestClassifier
# ovr_classifier = OneVsRestClassifier(lr_model)

# Initialize the SVM model
clf1 = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, reg_lambda=1, reg_alpha=0.1, subsample=0.8, colsample_bytree=0.8)
clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
clf3 = LogisticRegression()

ensemble_model = VotingClassifier(
    estimators=[('xgb', clf1), ('gbc', clf2), ('lr', clf3)],
    voting='soft'
)
# Wrap the SVM model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(ensemble_model)

In [None]:

# Initialize MultilabelStratifiedKFold
n_splits = 5  # You can adjust the number of splits as needed
ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=43)

# Lists to store AUC scores and sample counts
auc_scores_train_per_condition = {condition: [] for condition in y_columns}
auc_scores_val_per_condition = {condition: [] for condition in y_columns}
sample_counts_train_per_condition = {condition: [] for condition in y_columns}
sample_counts_val_per_condition = {condition: [] for condition in y_columns}
auc_scores_val_per_fold = []
auc_scores_train_per_fold = []

# Iterate through the splits
for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]


    # Train the model
    ovr_classifier.fit(X_train, y_train)

    y_train_pred_proba = ovr_classifier.predict_proba(X_train)

    # Make predictions on the validation set
    y_val_pred_proba = ovr_classifier.predict_proba(X_val)

    # Make predictions on the training set

    # Calculate the AUC score for each disease
    auc_scores_train = [roc_auc_score(y_train[column], y_train_pred_proba[:, i]) for i, column in enumerate(y_train.columns)]
    auc_scores_val = [roc_auc_score(y_val[column], y_val_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

    # Calculate AUC score for each disease
    fold_auc_val = roc_auc_score(y_val.values.ravel(), y_val_pred_proba.ravel())
    auc_scores_val_per_fold.append(fold_auc_val)

    fold_auc_train = roc_auc_score(y_train.values.ravel(), y_train_pred_proba.ravel())
    auc_scores_train_per_fold.append(fold_auc_train)

    # Append AUC scores and sample counts to lists
    #all_auc_scores.extend(auc_scores)
    #for i, condition in enumerate(y_columns):
    #    auc_scores_per_condition[condition].append(auc_scores)

    # get sample counts
    for i, condition in enumerate(y_columns):
        sample_count = y_train[condition].sum()  # Count of positive samples for the condition
        sample_counts_train_per_condition[condition].append(sample_count)

    for i, condition in enumerate(y_columns):
        sample_count = y_val[condition].sum()  # Count of positive samples for the condition
        sample_counts_val_per_condition[condition].append(sample_count)


    print(f"\nFold {fold+1} ========================================")
    print(f"\nFold {fold+1} AUC Scores for Diseases TRAINING-set:")
    for i, auc_score in enumerate(auc_scores_train, start=1):
        print(f"{y_train.columns[i-1]}: {auc_score}")
        auc_scores_train_per_condition[y_train.columns[i-1]].append(auc_score)

    print(f"\nFold {fold+1} AUC Scores for Diseases VALIDATION-set:")
    for i, auc_score in enumerate(auc_scores_val, start=1):
        print(f"{y_val.columns[i-1]}: {auc_score}")
        auc_scores_val_per_condition[y_val.columns[i-1]].append(auc_score)


# Calculate average AUC scores
#weighted_average_auc_overall = sum(score * count for score, count in zip(all_auc_scores, sample_counts_per_condition['eye'])) / sum(sample_counts_per_condition['eye'])
#weighted_average_auc_per_condition = {condition: sum(score * count for score, count in zip(auc_scores, sample_counts)) / sum(sample_counts) for condition, (auc_scores, sample_counts) in zip(y_columns, sample_counts_per_condition.items())}

# Print average AUC scores
#print("\nWeighted Average AUC Scores:")
#print(f"Overall: {weighted_average_auc_overall}")
#for condition, weighted_avg_auc in weighted_average_auc_per_condition.items():
#    print(f"{condition}: {weighted_avg_auc}")



Fold 1 AUC Scores for Diseases TRAINING-set:
condition_type_Eye: 0.5034677086790396
condition_type_Ear/Nose/Throat: 0.5185095526223118
condition_type_Mouth/Dental/Oral: 0.4637026347571659
condition_type_Skin: 0.5445528052965992
condition_type_Cardiac: 0.45123981506601474
condition_type_Respiratory: 0.5013183695422356
condition_type_Gastrointestinal: 0.48955342445189576
condition_type_Liver/Pancreas: 0.39083130327522
condition_type_Kidney/Urinary: 0.5100970819283767
condition_type_Reproductive: 0.41251126938944715
condition_type_Bone/Orthopedic: 0.4840373330362152
condition_type_Brain/Neurologic: 0.5449616701092553
condition_type_Endocrine: 0.5105432650263511
condition_type_Hematopoietic: 0.4316069390301376
condition_type_Other Congenital Disorder: 0.5871147417717588
condition_type_Infection/Parasites: 0.4723348567488816
condition_type_Toxin Consumption: 0.46680867811190363
condition_type_Trauma: 0.49366765684019526
condition_type_Immune-mediated: 0.5058828543647769
condition_type_can

In [67]:
# Calculate average AUC per condition
average_auc_val_per_condition = {
    condition: sum(auc_scores) / len(auc_scores) for condition, auc_scores in auc_scores_val_per_condition.items()
}
# Calculate overall average AUC weighted by the number of samples
overall_average_auc_val_sum = 0
for auc_scores, sample_counts in zip(auc_scores_val_per_condition.values(), sample_counts_val_per_condition.values()):
    weighted_sum = sum(auc * sample_count for auc, sample_count in zip(auc_scores, sample_counts))
    overall_average_auc_val_sum += weighted_sum / sum(sample_counts)

overall_average_auc_val = overall_average_auc_val_sum / len(auc_scores_val_per_condition)  # divide by the number of conditions

In [68]:
# Calculate average AUC per condition
average_auc_train_per_condition = {
    condition: sum(auc_scores) / len(auc_scores) for condition, auc_scores in auc_scores_train_per_condition.items()
}
# Calculate overall average AUC weighted by the number of samples
overall_average_auc_train_sum = 0
for auc_scores, sample_counts in zip(auc_scores_train_per_condition.values(), sample_counts_train_per_condition.values()):
    weighted_sum = sum(auc * sample_count for auc, sample_count in zip(auc_scores, sample_counts))
    overall_average_auc_train_sum += weighted_sum / sum(sample_counts)

overall_average_auc_train = overall_average_auc_train_sum / len(auc_scores_train_per_condition)  # divide by the number of conditions

## AUC Scores

In [69]:
# Average AUC-score per condition
print("\n Average AUC val-score per condition:")
for condition, avg_auc in average_auc_val_per_condition.items():
    print(f"{condition}: {avg_auc}")


 Average AUC val-score per condition:
condition_type_Eye: 0.6203980569691803
condition_type_Ear/Nose/Throat: 0.5951002302904156
condition_type_Mouth/Dental/Oral: 0.6413008734792947
condition_type_Skin: 0.5487366734254697
condition_type_Cardiac: 0.6693175203649747
condition_type_Respiratory: 0.6142737549182051
condition_type_Gastrointestinal: 0.5475239553577447
condition_type_Liver/Pancreas: 0.678968912263145
condition_type_Kidney/Urinary: 0.5958088715543826
condition_type_Reproductive: 0.6937450718669015
condition_type_Bone/Orthopedic: 0.6014549352908356
condition_type_Brain/Neurologic: 0.6578105720135298
condition_type_Endocrine: 0.6768526557371178
condition_type_Hematopoietic: 0.6122547397703986
condition_type_Other Congenital Disorder: 0.7764011658378419
condition_type_Infection/Parasites: 0.7244747141064518
condition_type_Toxin Consumption: 0.590059519029208
condition_type_Trauma: 0.5707637785165709
condition_type_Immune-mediated: 0.47690714183320876
condition_type_cancer: 0.71570

In [71]:
# Get average AUC values as a list
average_auc_val_list = list(average_auc_val_per_condition.values())
average_auc_train_list = list(average_auc_train_per_condition.values())

# Calculate standard deviation of average AUC values
overall_auc_val_std = np.std(average_auc_val_list)
overall_auc_train_std = np.std(average_auc_train_list)

# Print average AUC scores
print("\nWeighted Average AUC Score:")
#print(f"Overall: {overall_average_auc}")
print(f"Train: {overall_average_auc_train * 100:.2f}% ± {overall_auc_train_std * 100:.2f}%")
print(f"Val: {overall_average_auc_val * 100:.2f}% ± {overall_auc_val_std * 100:.2f}%")


Weighted Average AUC Score:
Train: 63.87% ± 6.39%
Val: 63.04% ± 6.88%


## Archiv

In [62]:
from sklearn.linear_model import LogisticRegression

# Separate features and labels
X = data.drop(['dog_id'] + ['condition_type_' + condition_type for condition_type in [
    'Eye', 'Ear/Nose/Throat', 'Mouth/Dental/Oral', 'Skin', 'Cardiac', 'Respiratory',
    'Gastrointestinal', 'Liver/Pancreas', 'Kidney/Urinary', 'Reproductive', 'Bone/Orthopedic',
    'Brain/Neurologic', 'Endocrine', 'Hematopoietic', 'Other Congenital Disorder',
    'Infection/Parasites', 'Toxin Consumption', 'Trauma', 'Immune-mediated', 'cancer'
]], axis=1)

y_columns = ['condition_type_' + condition_type for condition_type in [
    'Eye', 'Ear/Nose/Throat', 'Mouth/Dental/Oral', 'Skin', 'Cardiac', 'Respiratory',
    'Gastrointestinal', 'Liver/Pancreas', 'Kidney/Urinary', 'Reproductive', 'Bone/Orthopedic',
    'Brain/Neurologic', 'Endocrine', 'Hematopoietic', 'Other Congenital Disorder',
    'Infection/Parasites', 'Toxin Consumption', 'Trauma', 'Immune-mediated', 'cancer'
]]
y = data[y_columns]

# Convert y to a binary format
y_binary = (y == 1)

# Initialize the Logistic Regression model
lr_model = LogisticRegression()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(lr_model)

# Initialize MultilabelStratifiedKFold
n_splits = 5  # You can adjust the number of splits as needed
ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=43)

# Lists to store AUC scores and sample counts
auc_scores_per_condition = {condition: [] for condition in y_columns}
sample_counts_per_condition = {condition: [] for condition in y_columns}

# Iterate through the splits
for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]

    # Train the model
    ovr_classifier.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_proba = ovr_classifier.predict_proba(X_val)

    # Calculate the AUC score for each disease
    auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

    # Append AUC scores and sample counts to lists
    for i, condition in enumerate(y_columns):
        sample_count = y_val[condition].sum()  # Count of positive samples for the condition
        sample_counts_per_condition[condition].append(sample_count)

    print(f"\nFold {fold+1} AUC Scores for Diseases:")
    for i, auc_score in enumerate(auc_scores, start=1):
        print(f"{y_val.columns[i-1]}: {auc_score}")
        auc_scores_per_condition[y_val.columns[i-1]].append(auc_score)

# Print the average AUC per condition and overall average AUC
average_auc_per_condition = {
    condition: sum(auc_scores) / len(auc_scores) for condition, auc_scores in auc_scores_per_condition.items()
}

# Calculate overall average AUC weighted by the number of samples
overall_average_auc_sum = 0
for auc_scores, sample_counts in zip(auc_scores_per_condition.values(), sample_counts_per_condition.values()):
    weighted_sum = sum(auc * sample_count for auc, sample_count in zip(auc_scores, sample_counts))
    overall_average_auc_sum += weighted_sum / sum(sample_counts)

overall_average_auc = overall_average_auc_sum / len(auc_scores_per_condition)

print("\nAverage AUC Scores per Condition:")
for condition, average_auc in average_auc_per_condition.items():
    print(f"{condition}: {average_auc * 100:.2f}%")

print(f"\nOverall Average AUC: {overall_average_auc * 100:.2f}%")



Fold 1 AUC Scores for Diseases:
condition_type_Eye: 0.6156800302781757
condition_type_Ear/Nose/Throat: 0.5764634065359958
condition_type_Mouth/Dental/Oral: 0.6439636235794226
condition_type_Skin: 0.5421263539272966
condition_type_Cardiac: 0.6658092727045754
condition_type_Respiratory: 0.6308743068624443
condition_type_Gastrointestinal: 0.5543951416985478
condition_type_Liver/Pancreas: 0.6746563138210352
condition_type_Kidney/Urinary: 0.6134421577031687
condition_type_Reproductive: 0.658360747532722
condition_type_Bone/Orthopedic: 0.6121594595881432
condition_type_Brain/Neurologic: 0.6712106520849578
condition_type_Endocrine: 0.6939884356874864
condition_type_Hematopoietic: 0.6533272783415801
condition_type_Other Congenital Disorder: 0.7793439070410432
condition_type_Infection/Parasites: 0.7259493978671484
condition_type_Toxin Consumption: 0.5871456864069124
condition_type_Trauma: 0.5777791517529742
condition_type_Immune-mediated: 0.46052218789571664
condition_type_cancer: 0.7211292154

In [63]:
# Average AUC-score per condition
print("\n Average AUC Score per condition:")
for condition, avg_auc in average_auc_per_condition.items():
    print(f"{condition}: {avg_auc}")


 Average AUC Score per condition:
condition_type_Eye: 0.6203980569691803
condition_type_Ear/Nose/Throat: 0.5951002302904156
condition_type_Mouth/Dental/Oral: 0.6413008734792947
condition_type_Skin: 0.5487366734254697
condition_type_Cardiac: 0.6693175203649747
condition_type_Respiratory: 0.6142737549182051
condition_type_Gastrointestinal: 0.5475239553577447
condition_type_Liver/Pancreas: 0.678968912263145
condition_type_Kidney/Urinary: 0.5958088715543826
condition_type_Reproductive: 0.6937450718669015
condition_type_Bone/Orthopedic: 0.6014549352908356
condition_type_Brain/Neurologic: 0.6578105720135298
condition_type_Endocrine: 0.6768526557371178
condition_type_Hematopoietic: 0.6122547397703986
condition_type_Other Congenital Disorder: 0.7764011658378419
condition_type_Infection/Parasites: 0.7244747141064518
condition_type_Toxin Consumption: 0.590059519029208
condition_type_Trauma: 0.5707637785165709
condition_type_Immune-mediated: 0.47690714183320876
condition_type_cancer: 0.715706957

In [64]:
# Get average AUC values as a list
average_auc_list = list(average_auc_per_condition.values())

# Calculate standard deviation of average AUC values
overall_auc_std = np.std(average_auc_list)

# Print average AUC scores
print("\nWeighted Average AUC Score:")
#print(f"Overall: {overall_average_auc}")
print(f"{overall_average_auc * 100:.2f}% ± {overall_auc_std * 100:.2f}%")


Weighted Average AUC Score:
63.04% ± 6.88%


# Archiv

In [None]:
from sklearn.svm import SVC
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import roc_auc_score

# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Convert y to a binary format
y_binary = (y == 1)

# Initialize the SVM model
svm_model = SVC(probability=True)

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(svm_model)

# Initialize MultilabelStratifiedKFold
n_splits = 5  # You can adjust the number of splits as needed
ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=43)

# Lists to store AUC scores and sample counts
auc_scores_per_condition = {condition: [] for condition in y_columns}
sample_counts_per_condition = {condition: [] for condition in y_columns}

for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]

    # Train the model
    with tqdm(total=100, desc=f"Fold {fold+1}", position=0, leave=True) as pbar:
        ovr_classifier.fit(X_train, y_train)

        # Make predictions on the validation set
        y_pred_proba = ovr_classifier.predict_proba(X_val)

        # Calculate the AUC score for each disease
        auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

        # Append AUC scores and sample counts to lists
        for i, condition in enumerate(y_columns):
            sample_count = y_val[condition].sum()  # Count of positive samples for the condition
            sample_counts_per_condition[condition].append(sample_count)

        # Update tqdm progress bar
        pbar.update(100)

        print(f"\nFold {fold+1} AUC Scores for Diseases:")
        for i, auc_score in enumerate(auc_scores, start=1):
            print(f"{y_val.columns[i-1]}: {auc_score}")
            auc_scores_per_condition[y_val.columns[i-1]].append(auc_score)

# Calculate average AUC per condition
average_auc_per_condition = {
    condition: sum(auc_scores) / len(auc_scores) for condition, auc_scores in auc_scores_per_condition.items()
}

# Calculate overall average AUC weighted by the number of samples
overall_average_auc_sum = 0
for auc_scores, sample_counts in zip(auc_scores_per_condition.values(), sample_counts_per_condition.values()):
    weighted_sum = sum(auc * sample_count for auc, sample_count in zip(auc_scores, sample_counts))
    overall_average_auc_sum += weighted_sum / sum(sample_counts)

overall_average_auc = overall_average_auc_sum / len(auc_scores_per_condition)

print("\nAverage AUC Scores for Diseases:")
for condition, average_auc in average_auc_per_condition.items():
    print(f"{condition}: {average_auc * 100:.2f}%")

print(f"\nOverall Average AUC: {overall_average_auc * 100:.2f}%")

Fold 1:   0%|          | 0/100 [00:00<?, ?it/s]

In [70]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Train the model
ovr_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_proba = ovr_classifier.predict_proba(X_val)

# Calculate the AUC score for each disease
auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_columns)]

print("AUC Scores for Diseases:")
for i, auc_score in enumerate(auc_scores, start=1):
    print(f"{y_columns[i-1]}: {auc_score}")

AUC Scores for Diseases:
hs_health_conditions_eye: 0.608296178184405
hs_health_conditions_ear: 0.602082745434546
hs_health_conditions_oral: 0.673323984807287
hs_health_conditions_skin: 0.5737486832374212
hs_health_conditions_cardiac: 0.6757555014704391
hs_health_conditions_respiratory: 0.6597351834286939
hs_health_conditions_gastrointestinal: 0.5443517991312358
hs_health_conditions_liver: 0.6564524030823188
hs_health_conditions_kidney: 0.6286350350740452
hs_health_conditions_reproductive: 0.6277588549460594
hs_health_conditions_orthopedic: 0.5766742123729287
hs_health_conditions_neurological: 0.5726241353246092
hs_health_conditions_endocrine: 0.6105098605098606
hs_health_conditions_hematologic: 0.551430531692185
hs_health_conditions_immune: 0.6201361546965611
hs_health_conditions_infectious_disease: 0.5760720614902737
hs_health_conditions_toxin_consumption: 0.5670980079825317
hs_health_conditions_trauma: 0.6000221375634803
hs_health_conditions_cancer: 0.6025861321497534


In [71]:
model_filepath = '../models/naive_base_with_breedtop50.joblib'
joblib.dump(ovr_classifier, model_filepath)

['../models/naive_base_with_breedtop50.joblib']

In [72]:
loaded_model_filepath = '../models/naive_base_with_breedtop50.joblib'
loaded_ovr_classifier = joblib.load(loaded_model_filepath)

In [None]:
# Assuming X_val is your validation set DataFrame
# Select a random row (Datapoint) from X_val
random_index = np.random.choice(X_val.index)

In [73]:
random_data_point = X_val.loc[random_index]
random_data_point

breeds_American Pitbull Terrier              0
breeds_American Staffordshire Terrier        0
breeds_Australian Cattle Dog                 0
breeds_Australian Shepherd                   0
breeds_Basset Hound                          0
                                         ...  
other_hard                                True
grass_dirt                                True
gravel                                   False
sand                                      True
astroturf                                False
Name: 18995, Length: 126, dtype: object

In [74]:

# Reshape the data point to a 2D array
random_data_point_reshaped = random_data_point.values.reshape(1, -1)

# Make predictions on the reshaped data point using the loaded model
new_data_predictions_proba = loaded_ovr_classifier.predict_proba(random_data_point_reshaped)
new_data_predictions_proba



array([[6.71306308e-01, 4.46896673e-01, 9.48417250e-01, 3.22899883e-01,
        5.45953211e-01, 5.36113476e-01, 2.81684923e-01, 2.39306906e-01,
        2.41839342e-01, 4.15624281e-04, 3.53040030e-01, 1.71186483e-01,
        1.14720288e-01, 2.66393278e-03, 3.92847558e-03, 1.27038346e-01,
        3.37030138e-01, 1.28058841e-01, 1.19044821e-01]])

In [4]:
# Merge the two dataframes based on dog_id
data = pd.merge(feat_breed_top50_v2, feat_disease_output_binary, on='dog_id')

# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]
y

Unnamed: 0,hs_health_conditions_eye,hs_health_conditions_ear,hs_health_conditions_oral,hs_health_conditions_skin,hs_health_conditions_cardiac,hs_health_conditions_respiratory,hs_health_conditions_gastrointestinal,hs_health_conditions_liver,hs_health_conditions_kidney,hs_health_conditions_reproductive,hs_health_conditions_orthopedic,hs_health_conditions_neurological,hs_health_conditions_endocrine,hs_health_conditions_hematologic,hs_health_conditions_immune,hs_health_conditions_infectious_disease,hs_health_conditions_toxin_consumption,hs_health_conditions_trauma,hs_health_conditions_cancer
0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33167,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
33168,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33169,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
33170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [18]:
from sklearn.utils.validation import *
y_2 = check_array(y, input_name="y", ensure_2d=False, dtype=None)
if y_2.ndim == 2:
    # for multi-label y, map each distinct row to a string repr
    # using join because str(row) uses an ellipsis if len(row) > 1000
    y_2 = np.array([" ".join(row.astype("str")) for row in y_2])
print('line 144 y.shape: ', y.shape)

classes, y_indices = np.unique(y_2, return_inverse=True)
#print(classes)
n_classes = classes.shape[0]

class_counts = np.bincount(y_indices)
np.min(class_counts) < 2
np.bincount(y_indices)
y_indices

line 144 y.shape:  (33172, 19)


array([ 590,  546, 2513, ...,    8,    8, 2631])

In [20]:
# Check if the NumPy array and DataFrame are the same
if not np.array_equal(y_2, y.values):
    print("NumPy array and DataFrame are different")

    # Show the differences
    differences = y_2 - y.values
    print("Differences:")
    print(differences)
else:
    print("NumPy array and DataFrame are the same")

NumPy array and DataFrame are the same


In [7]:
list_sum = []
for column in y.columns:
    ones = y[column].sum()
    list_sum.append(ones)
list_sum

[4409,
 4306,
 9161,
 9512,
 1947,
 1170,
 4752,
 1154,
 2586,
 777,
 6329,
 1606,
 1098,
 182,
 270,
 8854,
 3645,
 9195,
 2101]

In [5]:

# Initialize StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Use the split method to get the train and test indices
for train_index, test_index in sss.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Oversample each disease separately
X_resampled_list = []
y_resampled_list = []

ros = RandomOverSampler(random_state=42)

for column in y_train.columns:
    X_resampled_disease, y_resampled_disease = ros.fit_resample(X_train, y_train[column])
    X_resampled_list.append(X_resampled_disease)
    y_resampled_list.append(y_resampled_disease)

X_resampled = pd.concat(X_resampled_list, axis=1)
y_resampled = pd.concat(y_resampled_list, axis=1)

# Train the model
ovr_classifier.fit(X_resampled, y_resampled)

# Make predictions on the validation set
y_pred_proba = ovr_classifier.predict_proba(X_val)

# Calculate the AUC score for each disease
auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

print("AUC Scores for Diseases:")
for i, auc_score in enumerate(auc_scores, start=1):
    print(f"{y_val.columns[i-1]}: {auc_score}")

y.shape: (33172, 19)
y.shape:  (33172, 19)
y.shape:  (33172, 19)
line 2132: y.shape:  (33172, 19)
line 144 y.shape:  (33172,)
y: 
 ['0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0'
 '0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0'
 '0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0' ...
 '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0'
 '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0'
 '0 1 1 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0']
y_indices: 
 [ 590  546 2513 ...    8    8 2631]


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
#for (i, auc_score), (j, auc_score_2) in zip(enumerate(auc_scores, start=1), enumerate(auc_scores_2, start=1)):
#    difference = auc_score - auc_score_2
#    print(f"{y_columns[i-1]}: auc_score={auc_score} | auc_score_2={auc_score_2} | Difference={difference}")

In [122]:
# Merge the two dataframes based on dog_id
#data = pd.merge(features_breed_group_v2, features_breed_top50_v2, on='dog_id')

# List of DataFrames to be merged
list_input_features = features_list  # Add more DataFrames as needed

# Merge DataFrames iteratively using reduce
input_features = reduce(lambda left, right: pd.merge(left, right, on='dog_id'), list_input_features)

# Merge with disease output feature
data = pd.merge(input_features, feat_disease_output_binary, on='dog_id')

In [118]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Iterate through the diseases
for disease_column in y_columns:
    print(f"\nDisease: {disease_column}")

    # Get the target variable for the current disease
    y_current_disease = data[disease_column]

    # Initialize StratifiedKFold
    n_splits = 5  # You can adjust the number of splits as needed
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Iterate through the splits
    for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X, y_current_disease)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y_current_disease.iloc[train_index], y_current_disease.iloc[val_index]

        # Train the model
        ovr_classifier.fit(X_train, y_train)

        # Make predictions on the validation set
        y_pred_proba = ovr_classifier.predict_proba(X_val)[:, 1]  # Assuming you want the probability of the positive class

        # Calculate the AUC score for the disease
        auc_score = roc_auc_score(y_val, y_pred_proba)

        print(f"Fold {fold+1} AUC Score: {auc_score}")



Disease: hs_health_conditions_eye
Fold 1 AUC Score: 0.5889365274163414
Fold 2 AUC Score: 0.5923737349239456
Fold 3 AUC Score: 0.6015350683219536
Fold 4 AUC Score: 0.5955443072647536
Fold 5 AUC Score: 0.5991581377167428

Disease: hs_health_conditions_ear
Fold 1 AUC Score: 0.585866429255166
Fold 2 AUC Score: 0.60439846604382
Fold 3 AUC Score: 0.6089949136260728
Fold 4 AUC Score: 0.5926083200626733
Fold 5 AUC Score: 0.6039063132234951

Disease: hs_health_conditions_oral
Fold 1 AUC Score: 0.6738195759463118
Fold 2 AUC Score: 0.6648677406015212
Fold 3 AUC Score: 0.664522331530314
Fold 4 AUC Score: 0.6603683788159281
Fold 5 AUC Score: 0.6505481913781567

Disease: hs_health_conditions_skin
Fold 1 AUC Score: 0.567172297039507
Fold 2 AUC Score: 0.5822176090054009
Fold 3 AUC Score: 0.5702938451207149
Fold 4 AUC Score: 0.5612502340687936
Fold 5 AUC Score: 0.5644397845921392

Disease: hs_health_conditions_cardiac
Fold 1 AUC Score: 0.6948982902828138
Fold 2 AUC Score: 0.6455586360658586
Fold 3 AUC

In [119]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Convert y to a binary format
y_binary = (y == 1)

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Initialize MultilabelStratifiedKFold
n_splits = 5  # You can adjust the number of splits as needed
ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Iterate through the splits
for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]

    # Train the model
    ovr_classifier.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_proba = ovr_classifier.predict_proba(X_val)

    # Calculate the AUC score for each disease
    auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

    print(f"\nFold {fold+1} AUC Scores for Diseases:")
    for i, auc_score in enumerate(auc_scores, start=1):
        print(f"{y_val.columns[i-1]}: {auc_score}")



Fold 1 AUC Scores for Diseases:
hs_health_conditions_eye: 0.5911374057345359
hs_health_conditions_ear: 0.5975891893278246
hs_health_conditions_oral: 0.6636215962728014
hs_health_conditions_skin: 0.5765633374012875
hs_health_conditions_cardiac: 0.659720136041126
hs_health_conditions_respiratory: 0.657287714699593
hs_health_conditions_gastrointestinal: 0.5506340041052006
hs_health_conditions_liver: 0.6392620489724077
hs_health_conditions_kidney: 0.6529590641141956
hs_health_conditions_reproductive: 0.658563047780605
hs_health_conditions_orthopedic: 0.5990176682539723
hs_health_conditions_neurological: 0.5318584877863988
hs_health_conditions_endocrine: 0.5954687021708894
hs_health_conditions_hematologic: 0.5497067448680352
hs_health_conditions_immune: 0.6451882134632272
hs_health_conditions_infectious_disease: 0.5702841001926019
hs_health_conditions_toxin_consumption: 0.5470477110827017
hs_health_conditions_trauma: 0.605545182331868
hs_health_conditions_cancer: 0.6086773006966928

Fold 2

In [120]:
# Separate features and labels
X = data.drop(['dog_id'] + ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]], axis=1)

y_columns = ['hs_health_conditions_' + condition for condition in [
    'eye', 'ear', 'oral', 'skin', 'cardiac', 'respiratory', 'gastrointestinal',
    'liver', 'kidney', 'reproductive', 'orthopedic', 'neurological', 'endocrine',
    'hematologic', 'immune', 'infectious_disease', 'toxin_consumption', 'trauma', 'cancer'
]]
y = data[y_columns]

# Convert y to a binary format
y_binary = (y == 1)

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Wrap the model with OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(nb_model)

# Initialize MultilabelStratifiedKFold
n_splits = 5  # You can adjust the number of splits as needed
ml_stratified_kfold = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store AUC scores
all_auc_scores = []
health_condition_auc_scores = {condition: [] for condition in y_columns}

# Iterate through the splits
for fold, (train_index, val_index) in enumerate(ml_stratified_kfold.split(X, y_binary)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_binary.iloc[train_index], y_binary.iloc[val_index]

    # Train the model
    ovr_classifier.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_proba = ovr_classifier.predict_proba(X_val)

    # Calculate the AUC score for each disease
    auc_scores = [roc_auc_score(y_val[column], y_pred_proba[:, i]) for i, column in enumerate(y_val.columns)]

    # Append AUC scores to lists
    all_auc_scores.extend(auc_scores)
    for i, condition in enumerate(y_columns):
        health_condition_auc_scores[condition].append(auc_scores[i])

    print(f"\nFold {fold+1} AUC Scores for Diseases:")
    for i, auc_score in enumerate(auc_scores, start=1):
        print(f"{y_val.columns[i-1]}: {auc_score}")

# Calculate average AUC scores
average_auc_overall = sum(all_auc_scores) / len(all_auc_scores)
average_auc_per_condition = {condition: sum(scores) / len(scores) for condition, scores in health_condition_auc_scores.items()}

# Print average AUC scores
print("\nAverage AUC Scores:")
print(f"Overall: {average_auc_overall}")
for condition, avg_auc in average_auc_per_condition.items():
    print(f"{condition}: {avg_auc}")


Fold 1 AUC Scores for Diseases:
hs_health_conditions_eye: 0.5911374057345359
hs_health_conditions_ear: 0.5975891893278246
hs_health_conditions_oral: 0.6636215962728014
hs_health_conditions_skin: 0.5765633374012875
hs_health_conditions_cardiac: 0.659720136041126
hs_health_conditions_respiratory: 0.657287714699593
hs_health_conditions_gastrointestinal: 0.5506340041052006
hs_health_conditions_liver: 0.6392620489724077
hs_health_conditions_kidney: 0.6529590641141956
hs_health_conditions_reproductive: 0.658563047780605
hs_health_conditions_orthopedic: 0.5990176682539723
hs_health_conditions_neurological: 0.5318584877863988
hs_health_conditions_endocrine: 0.5954687021708894
hs_health_conditions_hematologic: 0.5497067448680352
hs_health_conditions_immune: 0.6451882134632272
hs_health_conditions_infectious_disease: 0.5702841001926019
hs_health_conditions_toxin_consumption: 0.5470477110827017
hs_health_conditions_trauma: 0.605545182331868
hs_health_conditions_cancer: 0.6086773006966928

Fold 2