In [1]:
from src.utilities_rejection import RejectionReasonLabeling, label_reasons, label_reasons_column
from cashe import get_train_test_cashe, get_training_inputs, drop_nomodel_columns
from src.data_local import MergedDataPreprocessing
from src.model_train import ModelsSearchEngine
from src.model_app import ModelApplication
from src.lstm_encoder import LSTMEmbedding
from src.model_train import XGBoostTuning

In [2]:
df_train, df_test = get_train_test_cashe(date_day='31-08',drop_duplicates=False) ## No need to drop duplicates again
#reasons = ['MN-1-1','SE-1-6','CV-1-1','BE-1-6','BE-1-7']
reasons = ['MN-1-1']

train_rej = RejectionReasonLabeling(df_train)
train_rej.recoginze_label()
train_rej = RejectionReasonLabeling(df_test)
train_rej.recoginze_label()

df_train, df_test = label_reasons(df_train,df_test,reasons)

print('data is labeled with medical reason, and the billing reason')
X_train, y_train, X_test, y_test = get_training_inputs(df_train, df_test)
y_train.loc[:, 'NPHIES_LABEL'] = df_train['NPHIES_LABEL'].tolist()
y_test.loc[:, 'NPHIES_LABEL'] = df_test['NPHIES_LABEL'].tolist()

X_train.drop(columns=['NPHIES_CODE'],inplace=True); X_test.drop(columns=['NPHIES_CODE'],inplace=True)

print('data is loaded successfully')
preprocessing_train = MergedDataPreprocessing(X_train)
X_train_prep = preprocessing_train.columns_prep()

preprocessing_test = MergedDataPreprocessing(X_test)
X_test_prep = preprocessing_test.columns_prep()
print('data is preprocessed numerically')

data is labeled with medical reason, and the billing reason
data is loaded successfully
data is preprocessed numerically


In [3]:
lstm_embedding = LSTMEmbedding()

X_train_encoded = preprocessing_train.column_embedding(X_train_prep)
X_test_encoded = preprocessing_train.column_embedding(X_test_prep)

X_train = drop_nomodel_columns(X_train_encoded)
X_test  = drop_nomodel_columns(X_test_encoded)
print('data is encoded')

data is encoded


In [4]:
#X_train.to_parquet('train.parquet'); X_test.to_parquet('test.parquet'); y_train.to_parquet('y_train.parquet'); y_test.to_parquet('y_test.parquet')

In [5]:
outcomes_train = label_reasons_column(y_train);  outcomes_test  = label_reasons_column(y_test)

y_train = y_train.copy();  y_test = y_test.copy()
y_train.loc[:, 'OUTCOME_MERGED'] = outcomes_train;  y_test.loc[:, 'OUTCOME_MERGED'] = outcomes_test

## Choose a model

In [9]:
X_train.fillna(0,inplace=True)
X_test.fillna(0,inplace=True)

model_trial = ModelsSearchEngine(X_train=X_train,y_train=y_train['OUTCOME_MERGED'],
                            X_test=X_test,y_test=y_test['OUTCOME_MERGED'])

model_trial.train_models()

[LightGBM] [Info] Total Bins 8530
[LightGBM] [Info] Number of data points in the train set: 256043, number of used features: 74
[LightGBM] [Info] Start training from score -0.951246
[LightGBM] [Info] Start training from score -0.676750
[LightGBM] [Info] Start training from score -2.249286


LightGBM, Decision Tree, SGD and Neural Network are trained on dataset.


In [10]:
result_analysis = model_trial.evaluate_models()
result_analysis

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'Decision Tree': {'Accuracy': 0.63,
  'Precision': 0.64,
  'Recall': 0.63,
  'F1 Score': 0.63},
 'LightGBM': {'Accuracy': 0.68,
  'Precision': 0.68,
  'Recall': 0.68,
  'F1 Score': 0.68},
 'SGD Classifier': {'Accuracy': 0.61,
  'Precision': 0.57,
  'Recall': 0.61,
  'F1 Score': 0.59},
 'XGBoost': {'Accuracy': 0.69,
  'Precision': 0.69,
  'Recall': 0.69,
  'F1 Score': 0.69},
 'Neural Network': {'Accuracy': 0.61,
  'Precision': 0.56,
  'Recall': 0.61,
  'F1 Score': 0.59}}

## Tune XGB

In [31]:
param_grid = {
    'n_estimators': [100,150,200],
    'max_depth': [9,11],
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
    'subsample': [ 0.9, 1.0]     }

param_grid = {
    'n_estimators': [150,200],
    'max_depth': [11,13,15,18],
    'learning_rate': [0.1, 0.2, 0.3],
    'subsample': [ 0.9]   }

trainer = XGBoostTuning(X_train, y_train['OUTCOME_MERGED'], X_test, y_test['OUTCOME_MERGED'])
best_model, best_params, best_score = trainer.train_and_evaluate(param_grid)

trainer.save_results(file_path='drafts/Reports/2024_09_11/xgboost_parameters.xlsx')

Trying parameters: {'n_estimators': 150, 'max_depth': 11, 'learning_rate': 0.1, 'subsample': 0.9} | F1 Score: 0.7010
Trying parameters: {'n_estimators': 150, 'max_depth': 11, 'learning_rate': 0.2, 'subsample': 0.9} | F1 Score: 0.7025
Trying parameters: {'n_estimators': 150, 'max_depth': 11, 'learning_rate': 0.3, 'subsample': 0.9} | F1 Score: 0.7025
Trying parameters: {'n_estimators': 150, 'max_depth': 13, 'learning_rate': 0.1, 'subsample': 0.9} | F1 Score: 0.7051
Trying parameters: {'n_estimators': 150, 'max_depth': 13, 'learning_rate': 0.2, 'subsample': 0.9} | F1 Score: 0.7053
Trying parameters: {'n_estimators': 150, 'max_depth': 13, 'learning_rate': 0.3, 'subsample': 0.9} | F1 Score: 0.7035
Trying parameters: {'n_estimators': 150, 'max_depth': 15, 'learning_rate': 0.1, 'subsample': 0.9} | F1 Score: 0.7059
Trying parameters: {'n_estimators': 150, 'max_depth': 15, 'learning_rate': 0.2, 'subsample': 0.9} | F1 Score: 0.7062
Trying parameters: {'n_estimators': 150, 'max_depth': 15, 'learn

Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x0000020525569310>>
Traceback (most recent call last):
  File "E:\Projects\Claims_Deployment\venv\lib\site-packages\xgboost\core.py", line 641, in _next_wrapper
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "E:\Projects\Claims_Deployment\venv\lib\site-packages\xgboost\core.py", line 557, in _handle_exception
    return fn()
  File "E:\Projects\Claims_Deployment\venv\lib\site-packages\xgboost\core.py", line 641, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "E:\Projects\Claims_Deployment\venv\lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "E:\Projects\Claims_Deployment\venv\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "E:\Projects\Claims_Deployment\venv\lib\site-packages\xgboost\core.py",

## Train XGB

In [6]:
model_runner = ModelApplication(X_train= X_train,y_train= y_train['OUTCOME_MERGED'],
                                X_test= X_test,y_test= y_test['OUTCOME_MERGED'])

model_runner.get_model(force_retrain=True)
model_runner.evaluate_model()

Training is done
Multi-Class Classifier metrics calculation


{'XGBoost': {'Accuracy': 0.68,
  'Precision': 0.68,
  'Recall': 0.68,
  'F1 Score': 0.68}}

In [8]:
from sklearn.metrics import classification_report

PREDS = list(model_runner.model_predict(X_test))
REPORT = classification_report(PREDS, y_test['OUTCOME_MERGED'])
print(REPORT)

              precision    recall  f1-score   support

           0       0.63      0.74      0.68     24319
           1       0.79      0.67      0.73     35707
           2       0.31      0.40      0.35      3985

    accuracy                           0.68     64011
   macro avg       0.58      0.61      0.59     64011
weighted avg       0.70      0.68      0.69     64011



In [9]:
def get_output_test_set(df_test):
    apply_dict = {0:'Rejected-Generally',1:"Approved",2:"Rejected-Medically"}
    PREDS = list(model_runner.model_predict(X_test))
    PREDS_DECODED = [apply_dict.get(item, item) for item in PREDS]

    last_three_columns = ['OUTCOME', 'NPHIES_CODE', 'NPHIES_LABEL']
    other_columns = [col for col in df_test.columns if col not in last_three_columns]
    df_test_preds = df_test[other_columns + last_three_columns]
    df_test_preds['PREDICTIONS'] = PREDS_DECODED
    return df_test_preds

df_test_preds = get_output_test_set(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_preds['PREDICTIONS'] = PREDS_DECODED


In [40]:
df_test_preds

Unnamed: 0,REQUEST_DATE,VISIT_NO,VISIT_DATE,EMERGENCY_INDICATOR,PROVIDER_DEPARTMENT_CODE,PROVIDER_DEPARTMENT,DOCTOR_SPECIALTY_CODE,DOCTOR_CODE,PATIENT_NO,EPISODE_NO,...,DISCOUNT_PERCENTAGE,NOTES,APPROVED_QUNATITY,ICD10,Diagnosis_Key,Chief_Complaint,OUTCOME,NPHIES_CODE,NPHIES_LABEL,PREDICTIONS
365589,2024-06-02 21:59:23,1149844-2,2024-05-29,N,21,ORTHOPAEDIC SERVICE - العظام,22.05,4946,997446,2,...,0.00,-,1.0,M75.0,110209631,left shoulder pain no truma 1w over use during...,APPROVED,,0,Approved
305745,2024-06-02 21:59:29,1302668-2,2024-05-18,N,5,DERMATOLOGY & VENEREOLOGY SERV - جلدية,3.00,9501,1142676,2,...,6.99,- BE-1-7,0.0,B07,10160324,,REJECTED,BE-1-7,0,Approved
305744,2024-06-02 21:59:29,1302668-2,2024-05-18,N,5,DERMATOLOGY & VENEREOLOGY SERV - جلدية,3.00,9501,1142676,2,...,30.00,-,1.0,B07,10160324,,APPROVED,,0,Approved
307086,2024-06-02 21:59:29,1302668-2,2024-05-18,N,5,DERMATOLOGY & VENEREOLOGY SERV - جلدية,3.00,9501,1142676,2,...,0.00,- CV-4-7,0.0,B07,10160324,,REJECTED,,0,Approved
307088,2024-06-02 21:59:29,1302668-2,2024-05-18,N,5,DERMATOLOGY & VENEREOLOGY SERV - جلدية,3.00,9501,1142676,2,...,0.00,- CV-4-7,0.0,B07,10160324,,REJECTED,,0,Approved
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352722,2024-07-08 17:09:02,1300863-11,2024-06-13,Y,39,EMERGENCY ROOM SERVICES - طوارى,4.00,7060,1140965,11,...,4.99,Partial Approved SE-1-6,0.0,J06.9,10164354,,REJECTED,SE-1-6,0,Approved
352721,2024-07-08 17:09:02,1300863-11,2024-06-13,Y,39,EMERGENCY ROOM SERVICES - طوارى,4.00,7060,1140965,11,...,5.00,Partial Approved SE-1-6,0.0,J06.9,10164354,,REJECTED,SE-1-6,0,Approved
352720,2024-07-08 17:09:02,1300863-11,2024-06-13,Y,39,EMERGENCY ROOM SERVICES - طوارى,4.00,7060,1140965,11,...,5.04,Partial Approved SE-1-6,0.0,J06.9,10164354,,REJECTED,SE-1-6,0,Approved
352719,2024-07-08 17:09:02,1300863-11,2024-06-13,Y,39,EMERGENCY ROOM SERVICES - طوارى,4.00,7060,1140965,11,...,20.00,Partial Approved SE-1-6,0.0,J06.9,10164354,,REJECTED,SE-1-6,0,Rejected-Generally


In [16]:
df_test.to_csv('predictions.csv',index=False)