In [1]:
import pandas as pd

# Set up directory paths for using Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# base_dir = "/content/drive/MyDrive/ed/"

# # Load .csv files
# edstays = pd.read_csv('/content/drive/MyDrive/ed/edstays.csv')
# diagnosis = pd.read_csv('/content/drive/MyDrive/ed/diagnosis.csv')
# triage = pd.read_csv('/content/drive/MyDrive/ed/triage.csv')
# vitalsign = pd.read_csv('/content/drive/MyDrive/ed/vitalsign.csv')
# medrecon = pd.read_csv('/content/drive/MyDrive/ed/medrecon.csv')
# # pyxis = pd.read_csv('/content/drive/MyDrive/ed/pyxis.csv')


edstays = pd.read_csv('edstays.csv')
diagnosis = pd.read_csv('diagnosis.csv')
triage = pd.read_csv('triage.csv')
vitalsign = pd.read_csv('vitalsign.csv')
medrecon = pd.read_csv('medrecon.csv')
# pyxis = pd.read_csv('/content/drive/MyDrive/ed/pyxis.csv')

In [2]:
# Handling missing values

import numpy as np
from sklearn.impute import SimpleImputer

data = pd.merge(edstays, diagnosis, on=['subject_id', 'stay_id'])
data = pd.merge(data, triage, on=['subject_id', 'stay_id'])
data = pd.merge(data, vitalsign, on=['subject_id', 'stay_id'])
data = pd.merge(data, medrecon, on=['subject_id', 'stay_id'])

# Handling missing values
# Using mean for numerical features and the most frequent value for categorical features
num_imputer = SimpleImputer(strategy='mean') # Define the imputer for numerical columns

# Define the numerical columns with missing values
numerical_cols = ['temperature_x', 'heartrate_x', 'resprate_x', 'o2sat_x', 'sbp_x', 'dbp_x', 'pain_x',
                  'temperature_y', 'heartrate_y', 'resprate_y', 'o2sat_y', 'sbp_y', 'dbp_y', 'pain_y', 'acuity']

# Replace 'UA', 'Critical', 'does not scale', 'denies', and 'uncooperative' in 'pain_x' and 'pain_y' with -1
data['pain_x'] = data['pain_x'].replace(['UA', 'Critical', 'does not scale', 'denies', 'uncooperative'], -1)
data['pain_y'] = data['pain_y'].replace(['UA', 'does not scale', 'denies', 'uncooperative'], -1)

# data['pain_x'] = pd.to_numeric(data['pain_x'], errors='coerce').fillna(pd.to_numeric(data['pain_x'], errors='coerce').mean())
# data['pain_y'] = pd.to_numeric(data['pain_y'], errors='coerce').fillna(pd.to_numeric(data['pain_y'], errors='coerce').mean())


# Fill missing values in numerical columns with their mean
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])

# # Fill missing values in 'hadm_id' with -1
# data['hadm_id'] = data['hadm_id'].fillna(-1)

# Fill missing values in 'hadm_id' with 0
data['hadm_id'] = data['hadm_id'].fillna(0)

data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])
# data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

# Fill missing values in 'acuity' with its mode
data['acuity'] = data['acuity'].fillna(data['acuity'].mode()[0])

# Fill missing values in 'chiefcomplaint' with 'missing'
data['chiefcomplaint'] = data['chiefcomplaint'].fillna('missing')

# Fill missing values in 'rhythm' with 'missing'
data['rhythm'] = data['rhythm'].fillna('missing')

# # Fill missing values in 'rhythm' with its mode
# data['rhythm'] = data['rhythm'].fillna(data['rhythm'].mode()[0])

In [3]:
import numpy as np

import scipy

# print(scipy.__version__)
# from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2

In [4]:
# Preprocess text data
# text_cols = ['chiefcomplaint', 'icd_title','rhythm', 'name', 'race', 'arrival_transport', 'disposition']
text_cols = ['chiefcomplaint','rhythm', 'name', 'race', 'arrival_transport', 'disposition']
tfidf_vectorizer = TfidfVectorizer()
text_data = data[text_cols].fillna('').apply(lambda x: ' '.join(x), axis=1)
text_features = tfidf_vectorizer.fit_transform(text_data)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

text_cols = ['chiefcomplaint','rhythm', 'name', 'race', 'arrival_transport', 'disposition']
tfidf_vectorizer = TfidfVectorizer()
text_data = data[text_cols].fillna('').apply(lambda x: ' '.join(x), axis=1)

count_vectorizer = CountVectorizer(ngram_range=(1, 2))  # Consider n-grams
text_counts = count_vectorizer.fit_transform(text_data)
tfidf_transformer = TfidfTransformer()
text_features = tfidf_transformer.fit_transform(text_counts)

In [6]:
# Encode categorical data
categorical_cols = ['icd_code']
label_encoder = LabelEncoder()
data[categorical_cols] = data[categorical_cols].apply(lambda x: label_encoder.fit_transform(x))

# One-hot encode remaining categorical columns
categorical_cols = ['gender', 'race', 'arrival_transport', 'disposition']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_data = one_hot_encoder.fit_transform(data[categorical_cols])

In [7]:
# Extract date-time features
data['intime_hour'] = pd.to_datetime(data['intime']).dt.hour
data['intime_day'] = pd.to_datetime(data['intime']).dt.day
data['intime_month'] = pd.to_datetime(data['intime']).dt.month
data['intime_year'] = pd.to_datetime(data['intime']).dt.year

In [8]:
from scipy.sparse import hstack

# Convert DataFrames to sparse matrices
numerical_data = scipy.sparse.csr_matrix(data[numerical_cols].values)
datetime_data = scipy.sparse.csr_matrix(data[['intime_hour', 'intime_day', 'intime_month', 'intime_year']].values)

# Stack all features
# X = hstack([numerical_data, text_features, categorical_data, datetime_data])
X = hstack([numerical_data, text_features, categorical_data])

In [9]:
# Prepare target variable
data = data.explode('icd_code')
data['diagnosis'] = data.apply(lambda x: str(x['icd_code']) + ' ' + str(x['icd_title']), axis=1)
y = data['diagnosis']

In [10]:
# # Perform feature selection using chi-squared test
# selector = SelectKBest(chi2, k=10)
# X_selected = selector.fit_transform(X, y)

In [11]:
# Split data into train and test sets
# Split the data into training and a temporary set using an 80/20 split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temporary set into validation and test sets using a 50/50 split
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


y_train = y_train.values.reshape(-1, 1)

In [16]:
# Class Imbalance Handling
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Ensemble Method
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-Validation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for train_idx, val_idx in skf.split(X_resampled, y_resampled):
    X_train_cv, X_val_cv = X_resampled[train_idx], X_resampled[val_idx]
    y_train_cv, y_val_cv = y_resampled[train_idx], y_resampled[val_idx]

    model.fit(X_train_cv, y_train_cv)
    y_pred = model.predict(X_val_cv)

    f1 = f1_score(y_val_cv, y_pred, average='macro')
    acc = accuracy_score(y_val_cv, y_pred)
    precision = precision_score(y_val_cv, y_pred, average='macro')
    recall = recall_score(y_val_cv, y_pred, average='macro')
    
    scores.append((f1, acc, precision, recall))

print(f"Average F1-score: {np.mean([score[0] for score in scores])}")
print(f"Average Accuracy: {np.mean([score[1] for score in scores])}")
print(f"Average Precision: {np.mean([score[2] for score in scores])}")
print(f"Average Recall: {np.mean([score[3] for score in scores])}")


Average F1-score: 0.7482326286599884
Average Accuracy: 0.7614920670777658
Average Precision: 0.7682211381138798
Average Recall: 0.7614920301656758


IndexError: tuple index out of range

In [None]:
# Save the trained model
import pickle
pickle.dump(model, open('diagnosis_prediction_model.pkl', 'wb'))