In [1]:
import numpy as np
from src.lstm_encoder import LSTMEmbedding
from cashe import get_train_test_split, get_training_inputs, drop_nomodel_columns
from src.model_app import ModelApplication
from src.utilities_rejection import RejectionReasonLabeling
from src.data_local import MergedDataPreprocessing

df_train, df_test = get_train_test_split(path='data/HJH/13-06-2024')
train_rej = RejectionReasonLabeling(df_train)
train_rej.recoginze_label()

train_rej = RejectionReasonLabeling(df_test)
train_rej.recoginze_label()
df_train['NPHIES_LABEL'] = np.where(df_train['NPHIES_CODE'] == 'MN-1-1', 1, 0)
df_test['NPHIES_LABEL'] = np.where(df_test['NPHIES_CODE'] == 'MN-1-1', 1, 0)

print('data is labeled with medical reason')
X_train, y_train, X_test, y_test = get_training_inputs(df_train, df_test)
y_train['NPHIES_LABEL'] = df_train.NPHIES_LABEL; y_test['NPHIES_LABEL'] = df_test.NPHIES_LABEL
X_train.drop(columns=['NPHIES_CODE'],inplace=True); X_test.drop(columns=['NPHIES_CODE'],inplace=True)

print('data is loaded successfully')
preprocessing_train = MergedDataPreprocessing(X_train)
X_train_prep = preprocessing_train.columns_prep()

preprocessing_test = MergedDataPreprocessing(X_test)
X_test_prep = preprocessing_test.columns_prep()
print('data is preprocessed numerically')

data is labeled with medical reason


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['NPHIES_LABEL'] = df_train.NPHIES_LABEL; y_test['NPHIES_LABEL'] = df_test.NPHIES_LABEL


data is loaded successfully
data is preprocessed numerically


In [3]:
lstm_embedding = LSTMEmbedding()

X_train_encoded = preprocessing_train.column_embedding(X_train_prep, is_service=True)
X_test_encoded = preprocessing_train.column_embedding(X_test_prep, is_service=True)

X_train_encoded = preprocessing_train.column_embedding(X_train_encoded, is_service=False)
X_test_encoded = preprocessing_train.column_embedding(X_test_encoded, is_service=False)

X_train = drop_nomodel_columns(X_train_encoded)
X_test  = drop_nomodel_columns(X_test_encoded)
print('data is encoded')

data is encoded


In [4]:
## save data (optional)
# X_train_encoded.to_csv('train_encoded.csv',index=False); X_test_encoded.to_csv('test_encoded.csv',index=False)
# y_train[['OUTCOME','NPHIES_LABEL']].to_csv('y_train.csv',index=False); y_test[['OUTCOME','NPHIES_LABEL']].to_csv('y_test.csv',index=False)

outcomes_train = []; outcomes_test = []
for i in range(len(y_train)):
    if y_train['NPHIES_LABEL'].iloc[i] == 1:
        outcomes_train.append(2)
    else:
        outcomes_train.append(y_train['OUTCOME'].iloc[i])

for i in range(len(y_test)):
    if y_test['NPHIES_LABEL'].iloc[i] == 1:
        outcomes_test.append(2)
    else:
        outcomes_test.append(y_test['OUTCOME'].iloc[i])

y_train['OUTCOME_MERGED'] = outcomes_train
y_test['OUTCOME_MERGED']  = outcomes_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['OUTCOME_MERGED'] = outcomes_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['OUTCOME_MERGED']  = outcomes_test


In [5]:
model_runner = ModelApplication(X_train= X_train,y_train= y_train['OUTCOME_MERGED'],X_test= X_test,y_test= y_test['OUTCOME_MERGED'])
model_runner.get_model(force_retrain=True,model_type='outcome')
model_runner.evaluate_model()
## results are calculated with weighted pre, rec

Training is done
Multi-Class Classifier metrics calculation


{'XGBoost': {'Accuracy': 0.73,
  'Precision': 0.73,
  'Recall': 0.73,
  'F1 Score': 0.73}}

In [6]:
## try other models
from src.model_train import ModelTrainer

X_train.fillna(0,inplace=True)
X_test.fillna(0,inplace=True)

model_trial = ModelTrainer(X_train=X_train,y_train=y_train['OUTCOME_MERGED'],
                            X_test=X_test,y_test=y_test['OUTCOME_MERGED'])

model_trial.train_models()

[LightGBM] [Info] Total Bins 6921
[LightGBM] [Info] Number of data points in the train set: 165580, number of used features: 63
[LightGBM] [Info] Start training from score -1.022034
[LightGBM] [Info] Start training from score -0.625413
[LightGBM] [Info] Start training from score -2.252869






LightGBM, Decision Tree, SGD and Neural Network are trained on dataset.


In [7]:
result_analysis = model_trial.evaluate_models()
result_analysis

{'Decision Tree': {'Accuracy': 0.65,
  'Precision': 0.65,
  'Recall': 0.65,
  'F1 Score': 0.65},
 'LightGBM': {'Accuracy': 0.72,
  'Precision': 0.73,
  'Recall': 0.72,
  'F1 Score': 0.72},
 'SGD Classifier': {'Accuracy': 0.46,
  'Precision': 0.44,
  'Recall': 0.46,
  'F1 Score': 0.45},
 'XGBoost': {'Accuracy': 0.74,
  'Precision': 0.74,
  'Recall': 0.74,
  'F1 Score': 0.74},
 'Neural Network': {'Accuracy': 0.54,
  'Precision': 0.51,
  'Recall': 0.54,
  'F1 Score': 0.52}}

In [13]:
apply_dict = {0:'Rejected-Generally',1:"Approved",2:"Rejected-Medically"}
PREDS = list(model_runner.model_predict(X_test))
PREDS_DECODED = [apply_dict.get(item, item) for item in PREDS]
PREDS_DECODED[:3]

['Rejected-Generally', 'Approved', 'Approved']

In [14]:
last_three_columns = ['OUTCOME', 'NPHIES_CODE', 'NPHIES_LABEL']
other_columns = [col for col in df_test.columns if col not in last_three_columns]
df_test = df_test[other_columns + last_three_columns]
df_test['PREDICTIONS'] = PREDS_DECODED

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['PREDICTIONS'] = PREDS_DECODED


In [16]:
df_test.to_csv('predictions.csv',index=False)