In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack

In [None]:
# Load the dataset
file_path = 'sepsis_diab_pt_all_v2.xlsx'  # Update if needed

# Open the workbook once and parse every sheet from the same handle.
# The `with` block closes the underlying file as soon as the sheets are read,
# so the workbook is not left open (and locked on some platforms) for the
# lifetime of the kernel. After this cell `sheets` refers to a closed handle;
# add any further `sheets.parse(...)` calls inside the block.
with pd.ExcelFile(file_path) as sheets:
    sheet_names = sheets.sheet_names
    print("Available Sheets:", sheet_names)

    # Load individual sheets
    admission_data = sheets.parse('sepsis_pt_all_admission details')
    lab_events = sheets.parse('sepsis_lab_events')
    microbiology_events = sheets.parse('microbiology events')
    # NOTE(review): 'prescriptoin' looks like a typo but presumably matches the
    # sheet name as spelled in the workbook - confirm before changing it.
    prescription_data = sheets.parse('prescriptoin')
    #poe_data = sheets.parse('poe')
    #poe_details = sheets.parse('poe_detail')

In [None]:
# Columns required from each sheet; a row missing any of them is dropped.
admission_columns = ['subject_id', 'hadm_id', 'admission_type', 'drg_code', 'dx_1_code',
                     'edhours', 'heartdisease_flag', 'kidneydisease_flag']
prescription_columns = ['subject_id', 'hadm_id', 'drug']

# Select relevant columns, remove exact duplicates, then drop incomplete rows.
# Built as new frames (no inplace mutation) so the cell can be re-run safely.
admission_data = (
    admission_data[admission_columns]
    .drop_duplicates()
    .dropna(subset=admission_columns)
)
prescription_data = (
    prescription_data[prescription_columns]
    .drop_duplicates()
    .dropna(subset=prescription_columns)
    # drop=True: the old row labels carry no meaning and must not become an
    # extra 'index' column that then travels into the merged frame.
    .reset_index(drop=True)
)

# Merge admissions and prescriptions on the admission id. Both sides carry
# 'subject_id', so pandas suffixes them; the prescription-side copy is renamed
# back to 'subject_id' and the admission-side copy remains as 'subject_id_y'.
# NOTE(review): if an hadm_id has several admission rows (e.g. more than one
# drg_code), every prescription row is repeated once per admission row, which
# would inflate downstream drug counts - confirm admission rows are unique
# per hadm_id.
admission_drug_data = (
    pd.merge(prescription_data, admission_data, on='hadm_id', how='inner')
    .rename(columns={'subject_id_x': 'subject_id'})
)

# Encode categorical variables
# NOTE(review): get_dummies only expands object/string/category columns; any of
# these columns stored with a numeric dtype is passed through unchanged.
categorical_features = pd.get_dummies(
    admission_drug_data[['admission_type', 'drg_code', 'dx_1_code', 'heartdisease_flag', 'kidneydisease_flag']],
    drop_first=True
)

# Scale numerical features (EDHOURS) to zero mean / unit variance
scaler = StandardScaler()
numerical_features = pd.DataFrame(
    scaler.fit_transform(admission_drug_data[['edhours']]),
    columns=['scaled_edhours'],
)

# TF-IDF for drug names (kept as a separate sparse matrix; it is not part of
# `combined_features` below)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
drug_tfidf_matrix = tfidf_vectorizer.fit_transform(admission_drug_data['drug'])

# Combine the encoded categorical and scaled numerical features column-wise.
# Both indexes are reset first so the rows align by position.
combined_features = pd.concat(
    [categorical_features.reset_index(drop=True), numerical_features.reset_index(drop=True)],
    axis=1,
).reset_index(drop=True)

print("Combined Features Shape:", combined_features.shape)

In [None]:
# Preview the first rows of the merged prescription/admission frame
admission_drug_data.head()

In [None]:
### **Step 1: Data Preparation**
def prepare_data(admission_drug_data, min_drug_freq=5, min_patient_drugs=3):
    """
    Build a patient × drug count matrix from prescription rows.

    Exact duplicate rows and rows without a 'subject_id' or 'drug' are removed,
    infrequent drugs and sparsely-prescribed patients are filtered out, and the
    remaining (patient, drug) row counts are pivoted into a wide matrix.

    Args:
        admission_drug_data (pd.DataFrame): Rows with at least 'subject_id' and 'drug'.
        min_drug_freq (int): A drug is kept only if it appears on at least this many rows.
        min_patient_drugs (int): A patient is kept only if they have at least this many
            distinct (kept) drugs.

    Returns:
        pd.DataFrame: Matrix indexed by 'subject_id' with one column per drug; each
        cell is the number of rows for that patient/drug pair, 0 where absent.
    """
    # Clean-up: exact duplicates first, then rows lacking a patient or a drug
    records = admission_drug_data.drop_duplicates()
    records = records.dropna(subset=['subject_id', 'drug'])

    # Keep only drugs that occur often enough (row-level frequency)
    rows_per_drug = records['drug'].value_counts()
    frequent_drugs = rows_per_drug[rows_per_drug >= min_drug_freq].index
    records = records.loc[records['drug'].isin(frequent_drugs)]

    # One row per (patient, drug) pair with the number of occurrences,
    # pooling all admissions of the same patient
    pair_counts = (
        records
        .groupby(['subject_id', 'drug'])
        .size()
        .reset_index(name='count')
    )

    # Keep only patients with enough distinct drugs
    drugs_per_patient = pair_counts['subject_id'].value_counts()
    eligible_patients = drugs_per_patient[drugs_per_patient >= min_patient_drugs].index
    pair_counts = pair_counts.loc[pair_counts['subject_id'].isin(eligible_patients)]

    # Wide interaction matrix (subject_id × drug); missing pairs become 0
    return pair_counts.pivot_table(index="subject_id", columns="drug", values="count", fill_value=0)


In [None]:
### **Step 2: Train SVD Model**
def train_svd(interaction_matrix, n_components=50):
    """
    Applies truncated SVD to the interaction matrix for dimensionality reduction.

    Args:
        interaction_matrix (pd.DataFrame): Patient-drug interaction matrix
            (rows are patients, columns are items).
        n_components (int): Requested number of latent features. If the matrix has
            fewer columns than this, the number of columns is used instead and a
            warning is emitted, because TruncatedSVD rejects n_components larger
            than the number of features.

    Returns:
        tuple: (fitted TruncatedSVD model, latent feature matrix with one row per
        row of `interaction_matrix`).

    Raises:
        ValueError: If the interaction matrix has no rows or no columns.
    """
    import warnings

    sparse_matrix = csr_matrix(interaction_matrix)  # Convert to sparse for efficiency
    n_rows, n_features = sparse_matrix.shape
    if n_rows == 0 or n_features == 0:
        raise ValueError(
            f"Cannot fit SVD on an empty interaction matrix (shape {sparse_matrix.shape}); "
            "relax the filtering thresholds used to build it."
        )

    # Never ask for more components than there are columns to decompose
    effective_components = min(n_components, n_features)
    if effective_components < n_components:
        warnings.warn(
            f"n_components={n_components} exceeds the {n_features} available features; "
            f"using {effective_components} instead.",
            stacklevel=2,
        )

    # Fixed random_state keeps the randomized solver reproducible across runs
    svd = TruncatedSVD(n_components=effective_components, random_state=42)
    latent_matrix = svd.fit_transform(sparse_matrix)

    return svd, latent_matrix


In [None]:
def recommend_drugs_old(patient_id, interaction_matrix, latent_matrix, top_n=5):
    """
    Recommend drugs for a patient from the drug usage of their most similar patients.

    Similarity is the raw (un-normalised) dot product between latent vectors, so a
    neighbour with a larger-magnitude vector can score higher than the patient
    scores against themselves. The patient is therefore excluded by position
    rather than by assuming they are ranked first.

    Args:
        patient_id: Label of the patient in `interaction_matrix.index`.
        interaction_matrix (pd.DataFrame): Patient-drug interaction matrix.
        latent_matrix (np.ndarray): Latent features, row-aligned with `interaction_matrix`.
        top_n (int): Used both as the number of neighbours and the number of drugs returned.

    Returns:
        pd.Series of mean usage per drug (descending, at most `top_n` entries);
        the string "Patient ID not found in dataset." if the patient is unknown;
        None if there are no neighbours or no drugs to recommend.
    """
    if patient_id not in interaction_matrix.index:
        return "Patient ID not found in dataset."

    patient_index = interaction_matrix.index.get_loc(patient_id)
    similarity_scores = np.dot(latent_matrix, latent_matrix[patient_index])

    print(f"Patient Index: {patient_index}")
    print(f"Similarity Scores: {similarity_scores[:10]}")  # Print first 10 similarity scores

    # Rank everyone by similarity (best first), then drop the patient explicitly:
    # with dot-product scores the patient is not guaranteed to be at rank 0.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    similar_patient_indices = ranked_indices[ranked_indices != patient_index][:top_n]

    # Check if similar patients exist
    if len(similar_patient_indices) == 0:
        print("No similar patients found!")
        return None

    # Aggregate drug usage from similar patients
    similar_patients = interaction_matrix.iloc[similar_patient_indices]
    recommended_drugs = similar_patients.mean(axis=0).sort_values(ascending=False).head(top_n)

    if recommended_drugs.empty:
        print("No recommended drugs found!")
        return None

    return recommended_drugs

def recommend_drugs(patient_id, interaction_matrix, latent_matrix, top_n=5):
    """
    Recommend drugs for a patient and report which neighbours drove the result.

    Similarity is the raw (un-normalised) dot product between latent vectors, so a
    neighbour with a larger-magnitude vector can score higher than the patient
    scores against themselves. The patient is therefore excluded by position
    rather than by assuming they are ranked first.

    Args:
        patient_id: Label of the patient in `interaction_matrix.index`.
        interaction_matrix (pd.DataFrame): Patient-drug interaction matrix.
        latent_matrix (np.ndarray): Latent features, row-aligned with `interaction_matrix`.
        top_n (int): Used both as the number of neighbours and the number of drugs returned.

    Returns:
        dict with
            "recommended_drugs": pd.Series of mean usage per drug among the
                neighbours (descending, at most `top_n` entries), and
            "similar_patients": {"indices": positional row indices of the
                neighbours, "scores": their similarity scores};
        the string "Patient ID not found in dataset." if the patient is unknown;
        None if there are no neighbours or no drugs to recommend.
    """
    if patient_id not in interaction_matrix.index:
        return "Patient ID not found in dataset."

    # Get the patient index
    patient_index = interaction_matrix.index.get_loc(patient_id)

    # Compute similarity scores (dot product of latent vectors)
    similarity_scores = np.dot(latent_matrix, latent_matrix[patient_index])

    print(f"Patient Index: {patient_index}")
    print(f"Similarity Scores (first 10): {similarity_scores[:10]}")  # Debugging output

    # Rank everyone by similarity (best first), then drop the patient explicitly:
    # with dot-product scores the patient is not guaranteed to be at rank 0.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    similar_patient_indices = ranked_indices[ranked_indices != patient_index][:top_n]

    # Get corresponding similarity scores
    similar_patient_scores = similarity_scores[similar_patient_indices]

    if len(similar_patient_indices) == 0:
        print("No similar patients found!")
        return None

    # Aggregate drug usage from similar patients
    similar_patients = interaction_matrix.iloc[similar_patient_indices]
    recommended_drugs = similar_patients.mean(axis=0).sort_values(ascending=False).head(top_n)

    if recommended_drugs.empty:
        print("No recommended drugs found!")
        return None

    # Create a dictionary to store recommendations + similarity scores
    result = {
        "recommended_drugs": recommended_drugs,
        "similar_patients": {
            "indices": similar_patient_indices.tolist(),
            "scores": similar_patient_scores.tolist()
        }
    }

    return result

In [None]:
### **Step 4: Running the Pipeline**
# Input is the merged prescription/admission frame built earlier in the notebook.
# (To run on another dataset, load it into `admission_drug_data` first, e.g.
#  admission_drug_data = pd.read_csv("your_data.csv"))

# 1) Patient × drug interaction matrix
interaction_matrix = prepare_data(admission_drug_data)

# 2) Latent patient representation (50 components)
svd_model, latent_matrix = train_svd(interaction_matrix, n_components=50)

# 3) Recommendations for one sample patient
patient_id = 10577647  # Replace with actual `subject_id`
recommendations = recommend_drugs(
    patient_id,
    interaction_matrix,
    latent_matrix,
    top_n=10,
)
print(recommendations)

In [None]:
#Events Recommendations
### **Step 1: Data Preparation**
def prepare_event_data(admission_event_data, min_event_freq=5, min_patient_events=3,
                       event_col='spec_type_desc'):
    """
    Prepares data by filtering rare events, handling missing values, and creating an interaction matrix.

    Args:
        admission_event_data (pd.DataFrame): Data containing 'subject_id' and the
            column named by `event_col`.
        min_event_freq (int): Minimum number of rows an event must appear on to be included.
        min_patient_events (int): Minimum number of distinct (kept) events a patient
            must have for inclusion.
        event_col (str): Name of the column that identifies the event. Defaults to
            'spec_type_desc', the column this function has always used.

    Returns:
        pd.DataFrame: Patient-event interaction matrix indexed by 'subject_id' with
        one column per event value; each cell is the number of rows for that
        patient/event pair, 0 where absent.
    """
    # Remove duplicates & handle missing values
    admission_event_data = admission_event_data.drop_duplicates().dropna(subset=['subject_id', event_col])

    # Remove rare events
    event_counts = admission_event_data[event_col].value_counts()
    rare_events = event_counts[event_counts < min_event_freq].index
    admission_event_data = admission_event_data[~admission_event_data[event_col].isin(rare_events)]

    # Aggregate multiple admissions per patient: one row per (patient, event) pair
    patient_event_data = admission_event_data.groupby(['subject_id', event_col]).size().reset_index(name='count')

    # Remove patients with very few recorded events (counted as distinct event values)
    patient_counts = patient_event_data['subject_id'].value_counts()
    patient_event_data = patient_event_data[patient_event_data['subject_id'].isin(patient_counts[patient_counts >= min_patient_events].index)]

    # Create interaction matrix (subject_id × event)
    interaction_matrix = patient_event_data.pivot_table(index="subject_id", columns=event_col, values="count", fill_value=0)

    return interaction_matrix


### **Step 2: Train SVD Model**
# NOTE: this re-defines `train_svd` from the drug pipeline above with a different
# default (30 components instead of 50); from here on this version is the one in effect.
def train_svd(interaction_matrix, n_components=30):
    """
    Applies truncated SVD to the interaction matrix for dimensionality reduction.

    Args:
        interaction_matrix (pd.DataFrame): Patient-event interaction matrix
            (rows are patients, columns are items).
        n_components (int): Requested number of latent features. If the matrix has
            fewer columns than this, the number of columns is used instead and a
            warning is emitted, because TruncatedSVD rejects n_components larger
            than the number of features.

    Returns:
        tuple: (fitted TruncatedSVD model, latent feature matrix with one row per
        row of `interaction_matrix`).

    Raises:
        ValueError: If the interaction matrix has no rows or no columns.
    """
    import warnings

    sparse_matrix = csr_matrix(interaction_matrix)  # Convert to sparse for efficiency
    n_rows, n_features = sparse_matrix.shape
    if n_rows == 0 or n_features == 0:
        raise ValueError(
            f"Cannot fit SVD on an empty interaction matrix (shape {sparse_matrix.shape}); "
            "relax the filtering thresholds used to build it."
        )

    # Never ask for more components than there are columns to decompose
    effective_components = min(n_components, n_features)
    if effective_components < n_components:
        warnings.warn(
            f"n_components={n_components} exceeds the {n_features} available features; "
            f"using {effective_components} instead.",
            stacklevel=2,
        )

    # Fixed random_state keeps the randomized solver reproducible across runs
    svd = TruncatedSVD(n_components=effective_components, random_state=42)
    latent_matrix = svd.fit_transform(sparse_matrix)

    return svd, latent_matrix


### **Step 3: Event Recommendation**
def recommend_events(patient_id, interaction_matrix, latent_matrix, top_n=5):
    """
    Recommends events for a given patient based on similar patients.

    Similarity is the raw (un-normalised) dot product between latent vectors, so a
    neighbour with a larger-magnitude vector can score higher than the patient
    scores against themselves. The patient is therefore excluded by position
    rather than by assuming they are ranked first.

    Args:
        patient_id (int): Patient ID for whom to recommend events.
        interaction_matrix (pd.DataFrame): Patient-event interaction matrix.
        latent_matrix (np.ndarray): Latent feature matrix from SVD, row-aligned
            with `interaction_matrix`.
        top_n (int): Used both as the number of neighbours and the number of
            events returned.

    Returns:
        dict with
            "recommended_events": pd.Series of mean occurrence per event among
                the neighbours (descending, at most `top_n` entries), and
            "similar_patients": {"indices": positional row indices of the
                neighbours, "scores": their similarity scores};
        the string "Patient ID not found in dataset." if the patient is unknown;
        None if there are no neighbours or no events to recommend.
    """
    if patient_id not in interaction_matrix.index:
        return "Patient ID not found in dataset."

    # Get the patient index
    patient_index = interaction_matrix.index.get_loc(patient_id)

    # Compute similarity scores (dot product of latent vectors)
    similarity_scores = np.dot(latent_matrix, latent_matrix[patient_index])

    print(f"Patient Index: {patient_index}")
    print(f"Similarity Scores (first 10): {similarity_scores[:10]}")  # Debugging output

    # Rank everyone by similarity (best first), then drop the patient explicitly:
    # with dot-product scores the patient is not guaranteed to be at rank 0.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    similar_patient_indices = ranked_indices[ranked_indices != patient_index][:top_n]

    # Get corresponding similarity scores
    similar_patient_scores = similarity_scores[similar_patient_indices]

    if len(similar_patient_indices) == 0:
        print("No similar patients found!")
        return None

    # Aggregate event occurrences from similar patients
    similar_patients = interaction_matrix.iloc[similar_patient_indices]
    recommended_events = similar_patients.mean(axis=0).sort_values(ascending=False).head(top_n)

    if recommended_events.empty:
        print("No recommended events found!")
        return None

    # Create a dictionary to store recommendations + similarity scores
    result = {
        "recommended_events": recommended_events,
        "similar_patients": {
            "indices": similar_patient_indices.tolist(),
            "scores": similar_patient_scores.tolist()
        }
    }

    return result

In [None]:
# Step 1: Build the patient × specimen-type matrix from the microbiology sheet
# (this rebinds `interaction_matrix`, replacing the drug matrix from above)
interaction_matrix = prepare_event_data(admission_event_data=microbiology_events)

In [None]:
# Preview the first rows of the patient-event matrix built by prepare_event_data
interaction_matrix.head()

In [None]:

# Step 2: Fit the SVD on the event matrix, using train_svd's default n_components
svd_model, latent_matrix = train_svd(interaction_matrix=interaction_matrix)


In [None]:
# Step 3: Event recommendations for one sample patient
patient_id = 10577647  # Replace with actual `subject_id`
recommendations = recommend_events(
    patient_id,
    interaction_matrix,
    latent_matrix,
    top_n=10,
)
print(recommendations)