In [2]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_excel('Dataset/DSU-Dataset.xlsx')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253686 entries, 0 to 253685
Data columns (total 31 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   Claim ID            253686 non-null  object        
 1   Service Date        253686 non-null  datetime64[ns]
 2   Recieved Date       253686 non-null  datetime64[ns]
 3   Paid Date           253686 non-null  datetime64[ns]
 4   Patient ID          253686 non-null  object        
 5   Member Age          253686 non-null  object        
 6   Gender              253686 non-null  object        
 7   Marital Status      241000 non-null  object        
 8   Ethnicity           226800 non-null  object        
 9   LOB                 253686 non-null  object        
 10  Network Status      253686 non-null  object        
 11  Claim Category      253685 non-null  object        
 12  Claim Subcategory   253685 non-null  object        
 13  Claim Line          253686 no

In [16]:
# removes instances of null high cost claims
data = data[(data['High Cost Claim'] == 0) | (data['High Cost Claim'] == 1)]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188552 entries, 2 to 253685
Data columns (total 31 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   Claim ID            188552 non-null  object        
 1   Service Date        188552 non-null  datetime64[ns]
 2   Recieved Date       188552 non-null  datetime64[ns]
 3   Paid Date           188552 non-null  datetime64[ns]
 4   Patient ID          188552 non-null  object        
 5   Member Age          188552 non-null  object        
 6   Gender              188552 non-null  object        
 7   Marital Status      179185 non-null  object        
 8   Ethnicity           168949 non-null  object        
 9   LOB                 188552 non-null  object        
 10  Network Status      188552 non-null  object        
 11  Claim Category      188551 non-null  object        
 12  Claim Subcategory   188551 non-null  object        
 13  Claim Line          188552 no

In [38]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [34]:
# Feature Selection
selected_features = [
    "Service Date", "Recieved Date", "Paid Date", "Member Age", "Gender", "LOB", 
    "Network Status", "Claim Category", "Place of Service", "Provider Type", 
    "Provider Specialty", "ICD10 Code 1", "ICD10 Code 2", "ICD10 Code 3", 
    "Service Type", "Service Code", "High Cost Claim"
]
df = data[selected_features].copy()

In [35]:
# Convert dates to datetime format
df["Service Date"] = pd.to_datetime(df["Service Date"])
df["Recieved Date"] = pd.to_datetime(df["Recieved Date"])
df["Paid Date"] = pd.to_datetime(df["Paid Date"])

# Create new features
df["Processing Delay"] = (df["Paid Date"] - df["Recieved Date"]).dt.days
df["Claim Lag"] = (df["Recieved Date"] - df["Service Date"]).dt.days

# Drop original date columns
df.drop(["Service Date", "Recieved Date", "Paid Date"], axis=1, inplace=True)

# Handle categorical variables
categorical_cols = ["Gender", "LOB", "Network Status", "Claim Category", "Place of Service", "Provider Type", "Provider Specialty", "Service Type", "Service Code", "Member Age"]
df[categorical_cols] = df[categorical_cols].astype(str).apply(LabelEncoder().fit_transform)

# Handle missing values
df.fillna(df.median(), inplace=True)

In [36]:
# Define target and features
X = df.drop("High Cost Claim", axis=1).values
y = df["High Cost Claim"].values  # Binary classification (0 or 1)

# Reshape input for LSTM: (samples, time steps, features)
X = X.reshape((X.shape[0], 1, X.shape[1]))  # 1 time step for now

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Build LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(**kwargs)


In [40]:
# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Predictions
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Epoch 1/20
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0234 - val_accuracy: 1.0000 - val_loss: 1.2765e-06
Epoch 2/20
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 1.0000 - loss: 1.1421e-06 - val_accuracy: 1.0000 - val_loss: 2.0318e-07
Epoch 3/20
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 1.0000 - loss: 1.1093e-07 - val_accuracy: 1.0000 - val_loss: 2.5063e-08
Epoch 4/20
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 1.0000 - loss: 9.2144e-09 - val_accuracy: 1.0000 - val_loss: 3.0067e-09
Epoch 5/20
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 1.0000 - loss: 1.3179e-09 - val_accuracy: 1.0000 - val_loss: 7.6395e-10
Epoch 6/20
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 1.0000 - loss: 2.9904e-10 - val_accuracy: