In [None]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

In [None]:
# Fetch the medical transcriptions dataset from Kaggle and load it into pandas.
# Prerequisite: the `kaggle` CLI must be installed with API credentials configured.
# NOTE(review): paths in this cell are relative to the working directory
# (temp_data/), whereas the export cell further down writes under ../temp_data/
# — confirm both resolve to the intended locations.
!kaggle datasets download -d tboyle10/medicaltranscriptions -p temp_data
# NOTE(review): unzip is invoked without -o/-n, so re-running over already
# extracted files may prompt or skip — confirm this is acceptable on re-run.
!unzip temp_data/medicaltranscriptions.zip -d temp_data

# Load the extracted CSV and report how many records it contains
medical_df = pd.read_csv('temp_data/mtsamples.csv')
print(f"Loaded medical transcriptions dataset with {len(medical_df)} records")


In [None]:
# Clean the medical transcriptions dataset.
# The spaCy English pipeline is loaded once at cell level so that clean_text
# (defined below) can reuse the same `nlp` object for every transcription.
nlp = spacy.load('en_core_web_sm')

def clean_text(text: str) -> str:
    """Normalise a transcription into a space-separated string of lemmas.

    Steps, in order:
      1. strip every run of digits;
      2. collapse whitespace runs to a single space and trim the ends;
      3. run the module-level spaCy pipeline ``nlp`` over the result;
      4. keep the lemma of each purely alphabetic token whose lower-cased
         text is not in ``STOP_WORDS``.

    Args:
        text: Raw transcription text.

    Returns:
        The retained lemmas joined by single spaces (empty string if none).
    """
    without_digits = re.sub(r'\d+', '', text)
    normalised = re.sub(r'\s+', ' ', without_digits).strip()

    kept_lemmas = []
    for token in nlp(normalised):
        # Skip punctuation, symbols and anything else non-alphabetic.
        if not token.is_alpha:
            continue
        # Stop-word check is done on the surface form, not on the lemma.
        if token.text.lower() in STOP_WORDS:
            continue
        kept_lemmas.append(token.lemma_)

    return ' '.join(kept_lemmas)

# Discard rows that have no transcription, then attach the cleaned text as a
# new column. Building the result in a single chain yields a fresh frame
# rather than assigning into a filtered one.
medical_df = (
    medical_df
    .dropna(subset=['transcription'])
    .assign(cleaned_transcription=lambda frame: frame['transcription'].apply(clean_text))
)

In [None]:
# Working frame for export: only rows whose transcription is present
df_out = medical_df[medical_df['transcription'].notna()]
print(df_out.shape)
df_out.head()

In [None]:
# Discard every row that has a missing value in at least one column
df_out = df_out.dropna(axis='index', how='any')
print(df_out.shape)
df_out.head()


In [None]:
import os

# Directory that receives the exported sample.
# NOTE(review): this is ../temp_data, while the download cell reads from
# temp_data relative to the working directory — confirm the difference is intended.
output_dir = '../temp_data/medical_transcriptions'

# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, replacing the separate exists() check and its check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Write the first 300 rows of the fully populated frame, without the index column.
df_out.iloc[:300].to_csv(f'{output_dir}/mt_samples_300.csv', index=False)

In [None]:
# Allow up to 20 rows to be rendered, then show the first 20 rows of the frame
pd.options.display.max_rows = 20
df_out.head(n=20)

## Analysis of Labelled Dataset

We now analyse the labelled medical transcriptions dataset. Potential confounding factors in this dataset:

- The label may depend upon the descriptiveness of the doctor writing the transcription, e.g. some doctors may write out the full risks of a standard procedure whilst others might keep their notes brief.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Read the labelled version of the 300-row sample
labelled_path = '../temp_data/medical_transcriptions/mt_samples_300_labeled.csv'
labelled_df = pd.read_csv(labelled_path)

# Preview the leading rows
labelled_df.head(n=5)

In [None]:
# Tally each label once and reuse the result for both bar positions and heights
label_counts = labelled_df['labels'].value_counts()
plt.bar(label_counts.index, label_counts.values)
plt.title('Distribution of High, Low and Ambiguous Stakes in Labelled Dataset')