In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import pandas as pd

In [None]:
# Load the dataset
# Path of the Excel workbook that holds every table used in this notebook.
file_path = 'sepsis_diab_pt_all_v2.xlsx'  # Update if needed
# Open the workbook once; the individual sheets are parsed from this handle
# in the next cell, so `sheets` must stay alive in the kernel.
sheets = pd.ExcelFile(file_path)
sheet_names = sheets.sheet_names
# Print the sheet names so they can be checked against the names parsed below.
print("Available Sheets:", sheet_names)

In [None]:
# Load individual sheets
# Each sheet is read into its own DataFrame from the already-open workbook.
# The sheet names are passed verbatim to the parser.
# NOTE(review): 'prescriptoin' presumably matches the (misspelt) sheet name in
# the workbook - confirm against the "Available Sheets" output before changing it.
admission_data = sheets.parse('sepsis_pt_all_admission details')
lab_events = sheets.parse('sepsis_lab_events')
microbiology_events = sheets.parse('microbiology events')
prescription_data = sheets.parse('prescriptoin')
# The two order-entry sheets below are currently not loaded.
#poe_data = sheets.parse('poe')
#poe_details = sheets.parse('poe_detail')

In [None]:
# Print the full column summary of the admissions sheet (verbose=True lists every column).
admission_data.info(verbose=True)

In [None]:
# Select relevant columns
admission_columns = ['hadm_id', 'admission_type', 'drg_code', 'dx_1_code', 'edhours']
code_columns = ['admission_type', 'drg_code', 'dx_1_code']
prescription_columns = ['hadm_id', 'drug']

# Keep only the needed columns, remove duplicate rows and drop rows with
# missing values in critical columns. Chained assignment is used instead of
# inplace=True so the cell stays idempotent when it is re-run.
admission_data = (
    admission_data[admission_columns]
    .drop_duplicates()
    .dropna(subset=admission_columns)
)
# reset_index(drop=True): without drop=True the old row labels would be kept
# as a stray 'index' column and carried into the merged frame.
prescription_data = (
    prescription_data[prescription_columns]
    .drop_duplicates()
    .dropna(subset=prescription_columns)
    .reset_index(drop=True)
)

# Merge admissions and prescriptions (one row per admission record x drug)
admission_drug_data = pd.merge(prescription_data, admission_data, on='hadm_id', how='inner')

# Encode categorical variables.
# get_dummies only encodes object/string/category columns; the code columns
# are cast to str first so numerically stored codes are one-hot encoded as
# well instead of passing through as raw numbers.
categorical_features = pd.get_dummies(
    admission_drug_data[code_columns].astype(str),
    drop_first=True
)

# Scale numerical features (EDHOURS)
scaler = StandardScaler()
numerical_features = pd.DataFrame(
    scaler.fit_transform(admission_drug_data[['edhours']]),
    columns=['scaled_edhours'],
    index=admission_drug_data.index,  # keep rows aligned with the merged frame
)

# TF-IDF for drug names
#tfidf_vectorizer = TfidfVectorizer(stop_words='english')
#drug_tfidf_matrix = tfidf_vectorizer.fit_transform(admission_drug_data['drug'])

# Combine All Features into a Single DataFrame
combined_features = pd.concat(
    [categorical_features, numerical_features], axis=1
).reset_index(drop=True)

# Every merged row must have exactly one feature row.
assert len(combined_features) == len(admission_drug_data)

print("Combined Features Shape:", combined_features.shape)

In [None]:
# Step 1: Create Interaction Matrix
def create_interaction_matrix(admission_data, col, value_col='admission_type', binary=False):
    """
    Creates an admission-by-item interaction matrix.

    Args:
        admission_data (pd.DataFrame): Long-format data containing 'hadm_id',
            the item column named by `col`, and the column named by `value_col`.
        col (str): Column whose distinct values become the matrix columns
            (for example 'drug' or 'spec_type_desc').
        value_col (str): Column whose non-null entries are counted for each
            (hadm_id, item) pair. Defaults to 'admission_type'.
        binary (bool): If True, counts are collapsed to 0/1 presence flags.
            Defaults to False, which keeps the raw counts.

    Returns:
        pd.DataFrame: Interaction matrix with 'hadm_id' as rows and the values
        of `col` as columns; pairs that never occur are filled with 0.

    Raises:
        KeyError: If 'hadm_id', `col` or `value_col` is not a column of
            `admission_data`.
    """
    required_columns = ['hadm_id', col, value_col]
    missing_columns = [name for name in required_columns if name not in admission_data.columns]
    if missing_columns:
        raise KeyError(f"Columns missing from input data: {missing_columns}")

    interaction_matrix = pd.pivot_table(
        admission_data,
        values=value_col,
        index='hadm_id',
        columns=col,
        aggfunc='count',  # Number of non-null value_col rows per (hadm_id, item) pair.
        fill_value=0
    )

    if binary:
        # Repeated rows for the same pair yield counts above 1; reduce to presence/absence.
        interaction_matrix = (interaction_matrix > 0).astype(int)

    return interaction_matrix

# Step 2: Calculate Item-Item Similarity
def calculate_item_similarity(interaction_matrix):
    """
    Builds the item-item cosine similarity table.

    Args:
        interaction_matrix (pd.DataFrame): Interaction matrix whose columns are the items.

    Returns:
        pd.DataFrame: Square similarity matrix labelled by item on both axes.
    """
    item_labels = interaction_matrix.columns
    # Transpose so that each item becomes one row vector for the pairwise comparison.
    item_vectors = interaction_matrix.T
    return pd.DataFrame(
        cosine_similarity(item_vectors),
        index=item_labels,
        columns=item_labels,
    )

def recommend_items(query_items, similarity_df, top_n=5):
    """
    Recommends items based on similarity scores to query items.

    Args:
        query_items (list or str): Items to find similar items for. A single
            item given as a plain string is treated as a one-element list.
        similarity_df (pd.DataFrame): Item-item similarity matrix with the
            items on both the index and the columns.
        top_n (int): Number of recommendations to return.

    Returns:
        pd.Series: Top recommended items with their mean similarity to the
        query items, sorted in descending order. Empty if `query_items` is empty.

    Raises:
        KeyError: If any query item is not a column of `similarity_df`.
    """
    # A bare string would select a single column as a Series, on which
    # mean(axis=1) fails; normalise to a list first.
    if isinstance(query_items, str):
        query_items = [query_items]
    query_items = list(query_items)

    # Without query items there is nothing to compare against; averaging over
    # zero columns would only produce NaN scores.
    if not query_items:
        return pd.Series(dtype=float)

    unknown_items = [item for item in query_items if item not in similarity_df.columns]
    if unknown_items:
        raise KeyError(f"Query items not found in similarity matrix: {unknown_items}")

    # Aggregate similarity scores for the query items
    similar_items = similarity_df[query_items].mean(axis=1).sort_values(ascending=False)

    # Exclude the query items themselves from recommendations
    similar_items = similar_items[~similar_items.index.isin(query_items)]

    return similar_items.head(top_n)


In [None]:
#prescription_data['drug'].value_counts()

In [None]:
# Preview the first two rows of the merged prescription/admission frame.
admission_drug_data.head(2)

In [None]:
# Example Execution
# Builds the hadm_id x drug interaction matrix, derives the drug-to-drug
# cosine similarity table from it and prints the drugs most similar to the query.
# NOTE(review): in a notebook kernel this __main__ guard is normally always true.
if __name__ == "__main__":
    interaction_matrix = create_interaction_matrix(admission_drug_data,"drug")
    print("Interaction Matrix Shape:", interaction_matrix.shape)
    
    similarity_df = calculate_item_similarity(interaction_matrix)
    print("Item-Item Similarity Matrix Shape:", similarity_df.shape)
    
    # Example query: find the drugs most similar to the listed drug(s).
    # Each entry must be a column of similarity_df (a value of the 'drug'
    # column), otherwise the column lookup in recommend_items raises KeyError.
    query_items = ['Glucagon']  # Replace with actual items
    recommendations = recommend_items(query_items, similarity_df, top_n=5)
    print("Recommendations:\n", recommendations)


In [None]:
# Recommendation of microbiology events based on item-based filtering

In [None]:
# Select relevant columns
admission_columns_events = ['hadm_id', 'admission_type', 'drg_code', 'dx_1_code', 'edhours']
code_columns_events = ['admission_type', 'drg_code', 'dx_1_code']
event_columns = ['hadm_id', 'spec_type_desc']

# Keep only the needed columns, remove duplicate rows and drop rows with
# missing values in critical columns. Chained assignment is used instead of
# inplace=True so the cell stays idempotent when it is re-run.
admission_data_forevents = (
    admission_data[admission_columns_events]
    .drop_duplicates()
    .dropna(subset=admission_columns_events)
)
# reset_index(drop=True): without drop=True the old row labels would be kept
# as a stray 'index' column and carried into the merged frame.
microbiology_events = (
    microbiology_events[event_columns]
    .drop_duplicates()
    .dropna(subset=event_columns)
    .reset_index(drop=True)
)

# Merge admissions and microbiology events (one row per admission record x specimen type)
admission_events_data = pd.merge(microbiology_events, admission_data_forevents, on='hadm_id', how='inner')

# Encode categorical variables.
# Features are built from the merged frame (as in the drug section) so that
# every feature row corresponds to a row of admission_events_data.
# get_dummies only encodes object/string/category columns; the code columns
# are cast to str first so numerically stored codes are one-hot encoded too.
categorical_features_events = pd.get_dummies(
    admission_events_data[code_columns_events].astype(str),
    drop_first=True
)

# Scale numerical features (EDHOURS)
scaler_events = StandardScaler()
numerical_features_events = pd.DataFrame(
    scaler_events.fit_transform(admission_events_data[['edhours']]),
    columns=['scaled_edhours'],
    index=admission_events_data.index,  # keep rows aligned with the merged frame
)

# TF-IDF for specimen type descriptions
#tfidf_vectorizer_events = TfidfVectorizer(stop_words='english')
#events_tfidf_matrix = tfidf_vectorizer_events.fit_transform(admission_events_data['spec_type_desc'])

# Combine All Features into a Single DataFrame
combined_features_events = pd.concat(
    [categorical_features_events, numerical_features_events], axis=1
).reset_index(drop=True)

# Every merged row must have exactly one feature row.
assert len(combined_features_events) == len(admission_events_data)

print("Combined Features Shape:", combined_features_events.shape)

In [None]:
# Preview the first two rows of the merged microbiology/admission frame.
admission_events_data.head(2)

In [None]:
# Example Execution
# Same pipeline as the drug example, applied to specimen types.
# NOTE: interaction_matrix, similarity_df, query_items and recommendations are
# reassigned here, so the drug results held in those names are overwritten.
if __name__ == "__main__":
    # Uses `admission_events_data`, the merged microbiology/admission frame
    # produced by the preprocessing cell above.
    interaction_matrix = create_interaction_matrix(admission_events_data,"spec_type_desc")
    print("Interaction Matrix Shape:", interaction_matrix.shape)
    
    similarity_df = calculate_item_similarity(interaction_matrix)
    print("Item-Item Similarity Matrix Shape:", similarity_df.shape)
    
    # Example query: find the specimen types most similar to the listed 'spec_type_desc' value(s).
    # Each entry must be a column of similarity_df, otherwise the column
    # lookup in recommend_items raises KeyError.
    query_items = ['BLOOD CULTURE']  # Replace with actual items
    recommendations = recommend_items(query_items, similarity_df, top_n=5)
    print("Recommendations:\n", recommendations)