In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
import pickle
import os

# Read the raw mtsamples CSV into a DataFrame.
# The path is relative to the directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='../data/mtsamples.csv')

In [6]:
# Clean text and keywords
# A row is only usable when it has both a transcription and keywords,
# so discard every row where either of those two columns is missing.
required_columns = ['transcription', 'keywords']
df = df.dropna(subset=required_columns)

def process_keywords(text):
    """Split a comma-separated keyword string into a cleaned list of keywords.

    Each comma-separated piece is stripped of surrounding whitespace and
    lower-cased. A piece is discarded when it is empty, is 50 characters or
    longer, or contains the substring "transcribed" (junk entries).

    Parameters
    ----------
    text : str or missing value
        Raw keyword string. A missing value (as detected by ``pd.isna``)
        yields an empty list.

    Returns
    -------
    list of str
        Cleaned keywords in their original order; duplicates are kept.
    """
    if pd.isna(text):
        return []

    kept = []
    for piece in text.split(','):
        token = piece.strip().lower()
        # Skip junk: blank entries, over-long "sentences", transcription notes
        if token == '' or len(token) >= 50 or "transcribed" in token:
            continue
        kept.append(token)
    return kept

# Parse each row's raw keyword string into a cleaned list of keywords
df['keywords_list'] = df['keywords'].map(process_keywords)

# Report how many rows are left after dropping the incomplete ones
print(f"Total Rows: {len(df)}")

Total Rows: 3898


In [7]:
# Filter for Top 50 Classes
# Flatten the per-row keyword lists into one long list (duplicates included)
# so that every occurrence contributes to the frequency count.
all_keywords = []
for row_keywords in df['keywords_list']:
    all_keywords.extend(row_keywords)
keyword_counts = Counter(all_keywords)

# Keep only the names of the 50 most frequent keywords, most frequent first
top_50 = [keyword for keyword, _ in keyword_counts.most_common(50)]
print(f"Top keywords sample: {top_50[:5]}")

def filter_to_top_50(kw_list, allowed=None):
    """Keep only the keywords of ``kw_list`` that belong to the allowed labels.

    Parameters
    ----------
    kw_list : iterable of str
        Keywords of a single row.
    allowed : collection of str, optional
        Labels to keep. When omitted, the notebook-level ``top_50`` list is
        used, so existing one-argument calls (e.g. via ``Series.apply``)
        behave exactly as before. Passing it explicitly makes the function
        usable without that global.

    Returns
    -------
    list of str
        The retained keywords, in their original order; duplicates are kept.
    """
    if allowed is None:
        # Fall back to the global computed earlier in the notebook
        allowed = top_50
    return [k for k in kw_list if k in allowed]

# Restrict every row's keyword list to the top-50 vocabulary
df['filtered_keywords'] = df['keywords_list'].map(filter_to_top_50)

# Remove rows that lost all keywords after filtering
has_labels = df['filtered_keywords'].map(len) > 0
df = df[has_labels]

print(f"Rows after filtering: {len(df)}")

# Create Target Matrix
# Binarize the per-row label lists; the fitted binarizer keeps the label
# vocabulary in mlb.classes_.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['filtered_keywords'])

print(f"Number of Classes: {len(mlb.classes_)}")


Top keywords sample: ['surgery', 'orthopedic', 'cardiovascular / pulmonary', 'radiology', 'consult - history and phy.']
Rows after filtering: 3474
Number of Classes: 50


In [8]:
# Persist the filtered frame without the index column.
# NOTE: the list-valued columns (keywords_list, filtered_keywords) are written
# to CSV as text, so a later reader has to parse them back into lists.
df.to_csv('../data/mtsamples_cleaned.csv', index=False)

# Save the label vocabulary of the fitted binarizer.
# Create the target directory first: open() does not create missing parent
# directories and would otherwise fail with FileNotFoundError.
classes_path = '../models/mlb_classes.pkl'
os.makedirs(os.path.dirname(classes_path), exist_ok=True)
with open(classes_path, 'wb') as f:
    pickle.dump(mlb.classes_, f)

print("Data processing complete. Files saved.")

Data processing complete. Files saved.
