In [6]:
from src.utilities_rejection import RejectionReasonLabeling, label_reasons, label_reasons_column
from cashe import get_train_test_cashe, get_training_inputs, drop_nomodel_columns
from sklearn.metrics import classification_report
from src.data_local import MergedDataPreprocessing
from src.model_train import ModelsSearchEngine
from src.model_app import ModelApplication
from src.lstm_encoder import LSTMEmbedding
from src.model_train import XGBoostTuning

df_train, df_test = get_train_test_cashe(date_day='03-09',drop_duplicates=False) ## No need to re-drop duplicates

In [7]:
reasons = ['MN-1-1','SE-1-6','CV-1-1','BE-1-6','BE-1-7']
reasons = ['MN-1-1','CV-1-1']
reasons = ['MN-1-1']

train_rej = RejectionReasonLabeling(df_train)
train_rej.recoginze_label()
train_rej = RejectionReasonLabeling(df_test)
train_rej.recoginze_label()

df_train, df_test = label_reasons(df_train,df_test,reasons)

print('data is labeled with medical reason, and the billing reason')
X_train, y_train, X_test, y_test = get_training_inputs(df_train, df_test)
y_train.loc[:, 'NPHIES_LABEL'] = df_train['NPHIES_LABEL'].tolist()
y_test.loc[:, 'NPHIES_LABEL'] = df_test['NPHIES_LABEL'].tolist()

X_train.drop(columns=['NPHIES_CODE'],inplace=True); X_test.drop(columns=['NPHIES_CODE'],inplace=True)

print('data is loaded successfully')
preprocessing_train = MergedDataPreprocessing(X_train)
X_train_prep = preprocessing_train.columns_prep()

preprocessing_test = MergedDataPreprocessing(X_test)
X_test_prep = preprocessing_test.columns_prep()

print('data is preprocessed numerically')

data is labeled with medical reason, and the billing reason
data is loaded successfully
data is preprocessed numerically


In [8]:
lstm_embedding = LSTMEmbedding()
dropping_check = True

X_train_encoded = preprocessing_train.column_embedding(X_train_prep,drop_after_processing=dropping_check)
X_test_encoded  = preprocessing_train.column_embedding(X_test_prep, drop_after_processing=dropping_check)

print('data is encoded')

data is encoded


In [9]:
dropping_check = False

X_train = drop_nomodel_columns(X_train_encoded, eliminate_repeated=dropping_check)
X_test  = drop_nomodel_columns(X_test_encoded,  eliminate_repeated=dropping_check)

In [10]:
outcomes_train = label_reasons_column(y_train);  outcomes_test  = label_reasons_column(y_test)

y_train = y_train.copy();  y_test = y_test.copy()
y_train.loc[:, 'OUTCOME_MERGED'] = outcomes_train;  y_test.loc[:, 'OUTCOME_MERGED'] = outcomes_test

In [18]:
path_to_save = 'drafts/Hady_Folder/'

#X_train.to_csv(f'{path_to_save}train.csv',index=False); X_test.to_csv(f'{path_to_save}test.csv',index=False); y_train.to_csv(f'{path_to_save}y_train.csv',index=False); y_test.to_csv(f'{path_to_save}y_test.csv',index=False)

## Choose a model

In [7]:
X_train.fillna(0,inplace=True)
X_test.fillna(0,inplace=True)

model_trial = ModelsSearchEngine(X_train=X_train,y_train=y_train['OUTCOME_MERGED'],
                            X_test=X_test,y_test=y_test['OUTCOME_MERGED'])

model_trial.train_models()

[LightGBM] [Info] Total Bins 6996
[LightGBM] [Info] Number of data points in the train set: 209680, number of used features: 88
[LightGBM] [Info] Start training from score -1.076823
[LightGBM] [Info] Start training from score -0.592579
[LightGBM] [Info] Start training from score -2.240323


LightGBM, Decision Tree, SGD and Neural Network are trained on dataset.


In [8]:
result_analysis = model_trial.evaluate_models()
result_analysis

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'Decision Tree': {'Accuracy': 0.63,
  'Precision': 0.63,
  'Recall': 0.63,
  'F1 Score': 0.63},
 'LightGBM': {'Accuracy': 0.69,
  'Precision': 0.71,
  'Recall': 0.69,
  'F1 Score': 0.7},
 'SGD Classifier': {'Accuracy': 0.5,
  'Precision': 0.53,
  'Recall': 0.5,
  'F1 Score': 0.51},
 'XGBoost': {'Accuracy': 0.71,
  'Precision': 0.72,
  'Recall': 0.71,
  'F1 Score': 0.71},
 'Neural Network': {'Accuracy': 0.62,
  'Precision': 0.55,
  'Recall': 0.62,
  'F1 Score': 0.58}}

## Tune XGB

In [None]:
param_grid = {
    'n_estimators': [100,150,200],
    'max_depth': [9,11],
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
    'subsample': [ 0.9, 1.0]     }

param_grid = {
    'n_estimators': [150,200],
    'max_depth': [11,13,15,18],
    'learning_rate': [0.1, 0.2, 0.3],
    'subsample': [ 0.9]   }

trainer = XGBoostTuning(X_train, y_train['OUTCOME_MERGED'], X_test, y_test['OUTCOME_MERGED'])
best_model, best_params, best_score = trainer.train_and_evaluate(param_grid)

trainer.save_results(file_path='drafts/Reports/2024_09_11/xgboost_parameters.xlsx')

## Train XGB

In [11]:
y_train['NPHIES_LABEL'].value_counts()

NPHIES_LABEL
0    210921
2     25524
Name: count, dtype: int64

In [12]:
model_runner = ModelApplication(X_train= X_train,y_train= y_train['OUTCOME'],
                                X_test= X_test,y_test= y_test['OUTCOME'],
                                enable_categorical=True)

model_runner.get_model(force_retrain=True,model_name='outcome')
model_runner.evaluate_model()



[0]	validation_0-aucpr:0.72218
[1]	validation_0-aucpr:0.72753
[2]	validation_0-aucpr:0.74353
[3]	validation_0-aucpr:0.74843
[4]	validation_0-aucpr:0.75158
[5]	validation_0-aucpr:0.75982
[6]	validation_0-aucpr:0.76378
[7]	validation_0-aucpr:0.76692
[8]	validation_0-aucpr:0.76900
[9]	validation_0-aucpr:0.77451
[10]	validation_0-aucpr:0.77553
[11]	validation_0-aucpr:0.77653
[12]	validation_0-aucpr:0.77423
[13]	validation_0-aucpr:0.77611
[14]	validation_0-aucpr:0.77847
[15]	validation_0-aucpr:0.78042
[16]	validation_0-aucpr:0.77902
[17]	validation_0-aucpr:0.77981
[18]	validation_0-aucpr:0.78396
[19]	validation_0-aucpr:0.78507
[20]	validation_0-aucpr:0.78580
[21]	validation_0-aucpr:0.78589
[22]	validation_0-aucpr:0.78632
[23]	validation_0-aucpr:0.78701
[24]	validation_0-aucpr:0.78762
[25]	validation_0-aucpr:0.78796
[26]	validation_0-aucpr:0.78848
[27]	validation_0-aucpr:0.78936
[28]	validation_0-aucpr:0.78954
[29]	validation_0-aucpr:0.78976
[30]	validation_0-aucpr:0.79033
[31]	validation_0-

{'XGBoost': {'Accuracy': 0.73,
  'Precision': 0.71,
  'Recall': 0.77,
  'F1 Score': 0.74}}

In [13]:
PREDS = list(model_runner.model_predict(X_test))
print(classification_report(y_test['OUTCOME'].tolist(), PREDS))

              precision    recall  f1-score   support

           0       0.74      0.68      0.71     29110
           1       0.71      0.77      0.74     30002

    accuracy                           0.73     59112
   macro avg       0.73      0.72      0.72     59112
weighted avg       0.73      0.73      0.72     59112



In [18]:
model_runner = ModelApplication(X_train= X_train,y_train= y_train['NPHIES_LABEL'].replace(2,1),
                                X_test= X_test,y_test= y_test['NPHIES_LABEL'].replace(2,1),
                                enable_categorical=True)

model_runner.get_model(force_retrain=True,model_name='medical')
model_runner.evaluate_model()



[0]	validation_0-aucpr:0.23179
[1]	validation_0-aucpr:0.23435
[2]	validation_0-aucpr:0.23989
[3]	validation_0-aucpr:0.24577
[4]	validation_0-aucpr:0.26184
[5]	validation_0-aucpr:0.26521
[6]	validation_0-aucpr:0.26058
[7]	validation_0-aucpr:0.26164
[8]	validation_0-aucpr:0.27706
[9]	validation_0-aucpr:0.28128
[10]	validation_0-aucpr:0.28384
[11]	validation_0-aucpr:0.28304
[12]	validation_0-aucpr:0.28241
[13]	validation_0-aucpr:0.28444
[14]	validation_0-aucpr:0.29337
[15]	validation_0-aucpr:0.29325
[16]	validation_0-aucpr:0.29902
[17]	validation_0-aucpr:0.30027
[18]	validation_0-aucpr:0.30201
[19]	validation_0-aucpr:0.30847
[20]	validation_0-aucpr:0.30872
[21]	validation_0-aucpr:0.31085
[22]	validation_0-aucpr:0.31489
[23]	validation_0-aucpr:0.31584
[24]	validation_0-aucpr:0.31713
[25]	validation_0-aucpr:0.31794
[26]	validation_0-aucpr:0.32022
[27]	validation_0-aucpr:0.32024
[28]	validation_0-aucpr:0.31847
[29]	validation_0-aucpr:0.31853
[30]	validation_0-aucpr:0.31883
[31]	validation_0-

{'XGBoost': {'Accuracy': 0.91,
  'Precision': 0.51,
  'Recall': 0.25,
  'F1 Score': 0.33}}

In [16]:
PREDS = list(model_runner.model_predict(X_test))
print(classification_report(y_test['NPHIES_LABEL'].replace(2,1), PREDS))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     53975
           1       0.51      0.25      0.33      5137

    accuracy                           0.91     59112
   macro avg       0.72      0.61      0.64     59112
weighted avg       0.90      0.91      0.90     59112



In [9]:
def get_output_test_set(df_test):
    apply_dict = {0:'Rejected-Generally',1:"Approved",2:"Rejected-Medically"}
    PREDS = list(model_runner.model_predict(X_test))
    PREDS_DECODED = [apply_dict.get(item, item) for item in PREDS]

    last_three_columns = ['OUTCOME', 'NPHIES_CODE', 'NPHIES_LABEL']
    other_columns = [col for col in df_test.columns if col not in last_three_columns]
    df_test_preds = df_test[other_columns + last_three_columns]
    df_test_preds['PREDICTIONS'] = PREDS_DECODED
    return df_test_preds

df_test_preds = get_output_test_set(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_preds['PREDICTIONS'] = PREDS_DECODED


In [40]:
df_test_preds

Unnamed: 0,REQUEST_DATE,VISIT_NO,VISIT_DATE,EMERGENCY_INDICATOR,PROVIDER_DEPARTMENT_CODE,PROVIDER_DEPARTMENT,DOCTOR_SPECIALTY_CODE,DOCTOR_CODE,PATIENT_NO,EPISODE_NO,...,DISCOUNT_PERCENTAGE,NOTES,APPROVED_QUNATITY,ICD10,Diagnosis_Key,Chief_Complaint,OUTCOME,NPHIES_CODE,NPHIES_LABEL,PREDICTIONS
365589,2024-06-02 21:59:23,1149844-2,2024-05-29,N,21,ORTHOPAEDIC SERVICE - العظام,22.05,4946,997446,2,...,0.00,-,1.0,M75.0,110209631,left shoulder pain no truma 1w over use during...,APPROVED,,0,Approved
305745,2024-06-02 21:59:29,1302668-2,2024-05-18,N,5,DERMATOLOGY & VENEREOLOGY SERV - جلدية,3.00,9501,1142676,2,...,6.99,- BE-1-7,0.0,B07,10160324,,REJECTED,BE-1-7,0,Approved
305744,2024-06-02 21:59:29,1302668-2,2024-05-18,N,5,DERMATOLOGY & VENEREOLOGY SERV - جلدية,3.00,9501,1142676,2,...,30.00,-,1.0,B07,10160324,,APPROVED,,0,Approved
307086,2024-06-02 21:59:29,1302668-2,2024-05-18,N,5,DERMATOLOGY & VENEREOLOGY SERV - جلدية,3.00,9501,1142676,2,...,0.00,- CV-4-7,0.0,B07,10160324,,REJECTED,,0,Approved
307088,2024-06-02 21:59:29,1302668-2,2024-05-18,N,5,DERMATOLOGY & VENEREOLOGY SERV - جلدية,3.00,9501,1142676,2,...,0.00,- CV-4-7,0.0,B07,10160324,,REJECTED,,0,Approved
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352722,2024-07-08 17:09:02,1300863-11,2024-06-13,Y,39,EMERGENCY ROOM SERVICES - طوارى,4.00,7060,1140965,11,...,4.99,Partial Approved SE-1-6,0.0,J06.9,10164354,,REJECTED,SE-1-6,0,Approved
352721,2024-07-08 17:09:02,1300863-11,2024-06-13,Y,39,EMERGENCY ROOM SERVICES - طوارى,4.00,7060,1140965,11,...,5.00,Partial Approved SE-1-6,0.0,J06.9,10164354,,REJECTED,SE-1-6,0,Approved
352720,2024-07-08 17:09:02,1300863-11,2024-06-13,Y,39,EMERGENCY ROOM SERVICES - طوارى,4.00,7060,1140965,11,...,5.04,Partial Approved SE-1-6,0.0,J06.9,10164354,,REJECTED,SE-1-6,0,Approved
352719,2024-07-08 17:09:02,1300863-11,2024-06-13,Y,39,EMERGENCY ROOM SERVICES - طوارى,4.00,7060,1140965,11,...,20.00,Partial Approved SE-1-6,0.0,J06.9,10164354,,REJECTED,SE-1-6,0,Rejected-Generally


In [16]:
df_test.to_csv('predictions.csv',index=False)