# Medical Domain Identifier using Natural Language Processing

## 1. Data Collection

In [None]:
import kagglehub
import os
import pandas as pd

In [None]:
# Download the Kaggle dataset and load the transcription samples CSV from the
# returned directory.
dataset_path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")
csv_path = os.path.join(dataset_path, "mtsamples.csv")
df = pd.read_csv(csv_path)

In [None]:
# Quick look at the column names and the first rows of the raw data.
for preview in (df.columns, df.head()):
    print(preview)

## 2. Data Exploration & Preprocessing

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

# Fetch the NLTK resources requested by this notebook, in the original order.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
def get_sentence_word_count(text_list):
    """Count sentences and distinct word tokens across a collection of texts.

    Each item is converted with ``str()`` and lower-cased, split into
    sentences with ``sent_tokenize``, and every sentence is split into tokens
    with ``word_tokenize``.

    Args:
        text_list: Iterable of texts; non-string items are stringified.

    Returns:
        A ``(sent_count, word_count)`` tuple where ``sent_count`` is the total
        number of sentences and ``word_count`` is the number of *unique*
        tokens (the vocabulary size, not the total number of tokens).
    """
    sent_count = 0
    # Only distinct tokens matter for the result, so a set is sufficient;
    # per-token frequencies were never used.
    vocab = set()
    for text in text_list:
        sentences = sent_tokenize(str(text).lower())
        sent_count += len(sentences)
        for sentence in sentences:
            vocab.update(word_tokenize(sentence))
    return sent_count, len(vocab)

In [None]:
# Keep only rows that have a transcription, then compute corpus statistics.
has_transcription = df['transcription'].notna()
df = df[has_transcription]
sent_count, word_count = get_sentence_word_count(df['transcription'].tolist())

In [None]:
# Report the corpus statistics computed above.
print(f"Number of sentences in transcriptions column: {sent_count}")
print(f"Number of unique words in transcriptions column: {word_count}")

In [None]:
from tabulate import tabulate

In [None]:
# Group the transcriptions by specialty for the per-category summaries below.
df_cat = df.groupby(by='medical_specialty')

In [None]:
# One table row per specialty: 1-based position, name, number of transcriptions.
headers = ["#", "Category", "Frequency"]
data = [
    [position, specialty, len(rows)]
    for position, (specialty, rows) in enumerate(df_cat, start=1)
]

print('Original Categories')
print(tabulate(data, headers=headers, tablefmt="grid"))

In [None]:
# Keep only specialties with more than 50 transcriptions, then regroup the
# surviving rows by specialty.
df_cat_fil = df_cat.filter(lambda group: len(group) > 50)
df_cat_fin = df_cat_fil.groupby(by='medical_specialty')

In [None]:
# Same frequency table as before, now over the filtered specialties.
# Reuses the `headers` list defined in the earlier table cell.
data = [
    [position, specialty, len(rows)]
    for position, (specialty, rows) in enumerate(df_cat_fin, start=1)
]

print('Reduced Categories')
print(tabulate(data, headers=headers, tablefmt="grid"))

In [None]:
import random

In [None]:
# Show one randomly chosen transcription for each remaining specialty.
headers = ["#", "Category", "Sample Transcription"]
data = [
    [position, specialty, random.choice(rows["transcription"].tolist())]
    for position, (specialty, rows) in enumerate(df_cat_fin, start=1)
]

print("Category Transcription Examples")
print(tabulate(data, headers=headers, tablefmt="grid"))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Horizontal bar chart of how many transcriptions each kept specialty has.
plt.figure(figsize=(10, 10))
sns.countplot(data=df_cat_fil, y='medical_specialty')
plt.show()

In [None]:
# Keep only the text and its label, then drop rows whose transcription is
# missing (by index label, as before).
df_fil = df_cat_fil[['transcription', 'medical_specialty']]
missing_text = df_fil['transcription'].isna()
missing_text_index = df_fil.loc[missing_text].index
df_fil = df_fil.drop(missing_text_index)
df_fil.shape

In [None]:
# Print three raw transcriptions (row positions 5, 125 and 1000) so they can
# be compared with the cleaned versions printed later.
# The column key uses double quotes inside the single-quoted f-strings:
# reusing the enclosing quote character is a SyntaxError before Python 3.12.
print("Before cleaning text:\n")
print(f'Sample Transcription 1:\n{df_fil.iloc[5]["transcription"]}\n')
print(f'Sample Transcription 2:\n{df_fil.iloc[125]["transcription"]}\n')
print(f'Sample Transcription 3:\n{df_fil.iloc[1000]["transcription"]}')

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK corpora used by the lemmatizer and the stop-word filter.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_cleaning_resources():
    """Build the lemmatizer and English stop-word set once and reuse them."""
    return WordNetLemmatizer(), frozenset(stopwords.words('english'))


def clean_and_preprocess_text(text):
    """Normalize one transcription into a space-joined string of lemmas.

    Steps, in order: lower-case; replace every character that is neither a
    word character nor whitespace with a space; delete digit runs; collapse
    whitespace; tokenize with ``word_tokenize``; drop English stop words and
    tokens of two characters or fewer; lemmatize the remaining tokens.

    Args:
        text: The raw transcription. Missing values (``None``/NaN, as
            detected by ``pd.isna``) are accepted.

    Returns:
        The cleaned text, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # The lemmatizer and stop-word set do not depend on the input, so they are
    # built once and cached rather than re-created for every row.
    lemmatizer, stop_words = _text_cleaning_resources()

    # Stop-word and length filters are applied to the raw token, before
    # lemmatization.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words and len(token) > 2
    ]

    return ' '.join(tokens)

In [None]:
# Replace every raw transcription with its cleaned form.
cleaned_text = df_fil['transcription'].apply(clean_and_preprocess_text)
df_fil['transcription'] = cleaned_text

In [None]:
# Print the same three rows (positions 5, 125 and 1000) after cleaning.
# The column key uses double quotes inside the single-quoted f-strings:
# reusing the enclosing quote character is a SyntaxError before Python 3.12.
print("After cleaning text:\n")
print(f'Sample Transcription 1:\n{df_fil.iloc[5]["transcription"]}\n')
print(f'Sample Transcription 2:\n{df_fil.iloc[125]["transcription"]}\n')
print(f'Sample Transcription 3:\n{df_fil.iloc[1000]["transcription"]}')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF over word 1- to 3-grams, capped at 1000 features, with max_df=0.75.
tfidf_options = {
    'analyzer': 'word',
    'stop_words': 'english',
    'ngram_range': (1, 3),
    'max_df': 0.75,
    'use_idf': True,
    'smooth_idf': True,
    'max_features': 1000,
}
vectorizer = TfidfVectorizer(**tfidf_options)
corpus = df_fil['transcription'].tolist()
tf_idf_mat = vectorizer.fit_transform(corpus)
feature_names = sorted(vectorizer.get_feature_names_out())
print(feature_names)

In [None]:
import numpy as np
from sklearn.manifold import TSNE

In [None]:
# Project the TF-IDF vectors to 2-D with t-SNE and colour each point by its
# medical specialty.
tf_idf_matrix = tf_idf_mat.toarray()
labels = df_fil['medical_specialty'].tolist()

tsne_results = TSNE(
    n_components=2,
    init='random',
    random_state=0,
    perplexity=40
).fit_transform(tf_idf_matrix)
plt.figure(figsize=(20, 10))

# Size the palette from the specialties actually present rather than a
# hard-coded 12, so the colour count always matches the number of hue levels.
palette = sns.hls_palette(len(set(labels)), l=.3, s=.9)
sns.scatterplot(
    x=tsne_results[:, 0], y=tsne_results[:, 1],
    hue=labels,
    palette=palette,
    legend="full",
    alpha=0.3
)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode specialty names as integer ids; `classes` holds the fitted class names.
label_encoder = LabelEncoder()
label_encoder.fit(df_fil['medical_specialty'])
y = label_encoder.transform(df_fil['medical_specialty'])
classes = label_encoder.classes_

In [None]:
# List the classes and how many samples carry each encoded label.
print(f"Classes: {classes}")
print("Class distribution:")
unique, counts = np.unique(y, return_counts=True)
for class_name, n_samples in zip(classes, counts):
    print(f"  {class_name}: {n_samples}")

## 3. Data Splitting

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [None]:
# Dense TF-IDF features and their string labels.
tf_idf_dense = tf_idf_mat.toarray()
labels = df_fil['medical_specialty'].tolist()
category_list = df_fil['medical_specialty'].unique()

# Split BEFORE fitting PCA so the projection is learned from training rows
# only. Fitting it on the full matrix leaks test-set information into the
# features and makes the test score optimistic. The row partition is the same
# as before: same labels, stratification and random_state.
# NOTE(review): the TF-IDF vocabulary/IDF weights are still fitted on the
# whole corpus in an earlier cell; moving that after the split as well would
# remove the remaining leakage.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    tf_idf_dense,
    labels,
    stratify=labels,
    random_state=1
)

# Keep enough principal components to explain 95% of the training variance.
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Retained so any code that still refers to the full reduced matrix keeps
# working; it now uses the projection fitted on the training rows.
tf_idf_mat_reduced = pca.transform(tf_idf_dense)

In [None]:
# Report the (rows, features) shape of each split.
train_shape, test_shape = X_train.shape, X_test.shape
print(f'Train Set Size: {train_shape}')
print(f'Test Set Size:  {test_shape}')

## 4. Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest hyper-parameters collected in one place; class_weight is set
# to 'balanced' and n_jobs=-1 is passed through unchanged.
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'class_weight': 'balanced',
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)

print("Training Random Forest model...")
rf_model.fit(X_train, y_train)

## 5. Model Evaluation

In [None]:
# Predict on both splits (train first, then test) for the metrics below.
y_train_pred, y_test_pred = (
    rf_model.predict(features) for features in (X_train, X_test)
)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on each split; a large train/test gap would indicate overfitting.
train_accuracy, test_accuracy = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class metrics on the test split, labelled with the encoder's class names.
print("Classification Report:")
report = classification_report(y_test, y_test_pred, target_names=classes)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap.
# The class order is fixed explicitly so that the matrix rows/columns and the
# tick labels are guaranteed to agree; without tick labels the cells cannot be
# mapped back to specialties.
cm_labels = sorted(set(y_test) | set(y_test_pred))

fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(1, 1, 1)
sns.heatmap(
    confusion_matrix(y_test, y_test_pred, labels=cm_labels),
    annot=True,
    cmap="Greens",
    ax=ax,
    fmt='g',
    xticklabels=cm_labels,
    yticklabels=cm_labels,
)
# Rows are the true classes and columns the predicted ones.
ax.set_xlabel("Predicted specialty")
ax.set_ylabel("True specialty")
plt.show()