# Medical Domain Identifier using Natural Language Processing

## 1. Data Collection

In [None]:
import kagglehub
import os
import pandas as pd

In [None]:
# Download the dataset through kagglehub and load the transcription CSV
# from the returned directory.
dataset_path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")
csv_path = os.path.join(dataset_path, "mtsamples.csv")
df = pd.read_csv(csv_path)

In [None]:
# List the column names available in the raw data.
print(list(df.columns))

In [None]:
from tabulate import tabulate

In [None]:
# Render the first rows of the frame as a grid table, using the column
# names as headers.
preview_table = tabulate(df.head(), headers="keys", tablefmt="grid")
print(preview_table)

## 2. Data Exploration & Preprocessing

In [None]:
# Group the records by specialty; the groups are iterated in later cells.
df_cat = df.groupby(by='medical_specialty')

In [None]:
# One table row per specialty: 1-based row number, specialty name, and the
# number of records in that group.
headers = ["#", "Category", "Frequency"]
data = [
    [row_number, specialty, len(group)]
    for row_number, (specialty, group) in enumerate(df_cat, start=1)
]

print('Original Categories')
print(tabulate(data, headers=headers, tablefmt="grid"))

In [None]:
# Keep only the rows whose specialty has more than 50 records, then regroup
# the reduced frame by specialty.
df_cat_fil = df_cat.filter(lambda group: len(group) > 50)
df_cat_fin = df_cat_fil.groupby(by='medical_specialty')

In [None]:
# Same frequency table as before, built from the reduced groups.
data = [
    [row_number, specialty, len(group)]
    for row_number, (specialty, group) in enumerate(df_cat_fin, start=1)
]

print('Reduced Categories')
print(tabulate(data, headers=headers, tablefmt="grid"))

In [None]:
import random

In [None]:
# For every remaining specialty, show one randomly chosen transcription.
headers = ["#", "Category", "Sample Transcription"]
data = []
for row_number, (specialty, group) in enumerate(df_cat_fin, start=1):
    sample_text = random.choice(group["transcription"].tolist())
    data.append([row_number, specialty, sample_text])

print("Category Transcription Examples")
print(tabulate(data, headers=headers, tablefmt="grid"))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Horizontal bar chart of record counts per specialty in the reduced frame.
plt.figure(figsize=(10, 10))
sns.countplot(data=df_cat_fil, y='medical_specialty')
plt.show()

In [None]:
# Keep only the text and its label, then drop the rows whose transcription
# is missing.
df_fil = df_cat_fil[['transcription', 'medical_specialty']]
missing_text = df_fil['transcription'].isna()
df_fil = df_fil.drop(df_fil[missing_text].index)
df_fil.shape

In [None]:
# Show three raw transcriptions so the effect of cleaning can be compared
# with the "After cleaning" output.
# The column key is written with double quotes inside the single-quoted
# f-strings: reusing the enclosing quote character inside the braces is a
# SyntaxError on Python versions before 3.12.
print("Before cleaning text:\n")
print(f'Sample Transcription 1:\n{df_fil.iloc[5]["transcription"]}\n')
print(f'Sample Transcription 2:\n{df_fil.iloc[125]["transcription"]}\n')
print(f'Sample Transcription 3:\n{df_fil.iloc[1000]["transcription"]}')

In [None]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing function.
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases
# ship it as 'punkt', newer ones as 'punkt_tab'; both are requested so the
# notebook works with either.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

In [None]:
from functools import lru_cache

# Compiled once at cell execution instead of being looked up on every row.
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


@lru_cache(maxsize=1)
def _get_lemmatizer():
    """Return a shared WordNetLemmatizer, created on first use."""
    return WordNetLemmatizer()


@lru_cache(maxsize=1)
def _get_stop_words():
    """Return the English stop words as a frozenset, loaded on first use."""
    return frozenset(stopwords.words('english'))


def clean_and_preprocess_text(text):
    """Normalize one transcription into a space-separated string of lemmas.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, digit runs are removed,
    and whitespace is collapsed. The result is tokenized; stop words and
    tokens of two characters or fewer are dropped, and the remaining tokens
    are lemmatized.

    Args:
        text: The raw transcription. Any value is accepted; it is converted
            with ``str()`` unless it is missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is missing according to ``pd.isna``.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = _NON_WORD_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # The lemmatizer and stop-word set are the same for every row, so they
    # are built once and reused rather than recreated on each call.
    lemmatizer = _get_lemmatizer()
    stop_words = _get_stop_words()

    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words and len(token) > 2
    ]

    return ' '.join(tokens)

In [None]:
# Replace each raw transcription with its cleaned, lemmatized form.
cleaned_transcriptions = df_fil['transcription'].apply(clean_and_preprocess_text)
df_fil['transcription'] = cleaned_transcriptions

In [None]:
# Show the same three rows again, now holding the cleaned text.
# The column key is written with double quotes inside the single-quoted
# f-strings: reusing the enclosing quote character inside the braces is a
# SyntaxError on Python versions before 3.12.
print("After cleaning text:\n")
print(f'Sample Transcription 1:\n{df_fil.iloc[5]["transcription"]}\n')
print(f'Sample Transcription 2:\n{df_fil.iloc[125]["transcription"]}\n')
print(f'Sample Transcription 3:\n{df_fil.iloc[1000]["transcription"]}')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF settings: word-level uni- to tri-grams, English stop words removed,
# terms present in more than 75% of documents ignored, vocabulary capped at
# 1000 features.
tfidf_params = {
    'analyzer': 'word',
    'stop_words': 'english',
    'ngram_range': (1, 3),
    'max_df': 0.75,
    'use_idf': True,
    'smooth_idf': True,
    'max_features': 1000,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary and build the document-term matrix in one pass.
corpus = df_fil['transcription'].tolist()
tf_idf_mat = vectorizer.fit_transform(corpus)

In [None]:
# Print the learned vocabulary terms in sorted order.
feature_names = list(vectorizer.get_feature_names_out())
feature_names.sort()
print(feature_names)

In [None]:
import numpy as np
from sklearn.manifold import TSNE

In [None]:
# Convert the TF-IDF matrix to a dense array for t-SNE and collect the
# specialty of every row for colouring.
tf_idf_den = tf_idf_mat.toarray()
labels = df_fil['medical_specialty'].tolist()

# Project the TF-IDF vectors to two dimensions with a fixed seed.
tsne = TSNE(
    n_components=2,
    init='random',
    random_state=0,
    perplexity=40
)
tsne_results = tsne.fit_transform(tf_idf_den)

plt.figure(figsize=(20, 10))

# One colour per distinct specialty.
n_specialties = len(set(labels))
palette = sns.hls_palette(n_specialties, l=.3, s=.9)
sns.scatterplot(
    x=tsne_results[:, 0],
    y=tsne_results[:, 1],
    hue=labels,
    palette=palette,
    legend="full",
    alpha=0.3
)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit the encoder on the specialty column, encode that column, and keep the
# class names the encoder learned.
specialty_column = df_fil['medical_specialty']
label_encoder = LabelEncoder()
label_encoder.fit(specialty_column)
y = label_encoder.transform(specialty_column)
classes = label_encoder.classes_

In [None]:
# Print one class name per line.
print(*classes, sep="\n")

## 3. Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are the TF-IDF matrix; targets are the specialty names as strings.
X = tf_idf_mat
y = df_fil['medical_specialty'].tolist()

# Stratify on the targets themselves so each specialty keeps its share in
# both splits, without depending on a variable created in the plotting cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1
)

In [None]:
# Report the shape of each feature matrix after the split.
train_shape, test_shape = X_train.shape, X_test.shape
print(f'Train Set Size: {train_shape}')
print(f'Test Set Size:  {test_shape}')

## 4. Model Training

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

In [None]:
# Search over the regularization strength C of a class-balanced linear SVM
# using 5-fold cross-validation.
svm_estimator = LinearSVC(class_weight='balanced', max_iter=2000)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(
    estimator=svm_estimator,
    param_grid=param_grid,
    cv=5
)
grid.fit(X_train, y_train)

## 5. Model Evaluation

In [None]:
# `clf` was never assigned anywhere in the notebook. GridSearchCV refits the
# best parameter setting on the full training set by default (refit=True),
# so that fitted model is used for the predictions.
clf = grid.best_estimator_

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on the training split and on the held-out test split, printed
# with four decimals.
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision/recall/F1 on the test split, with the encoder's class
# names as row labels; undefined metrics are reported as NaN.
report_text = classification_report(
    y_test,
    y_test_pred,
    target_names=classes,
    zero_division=np.nan
)
print("Classification Report:")
print(report_text)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Fix the row/column order to `classes` explicitly so the matrix always
# lines up with the tick labels passed to the heatmap.
conf_mat = confusion_matrix(y_test, y_test_pred, labels=classes)

fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(1, 1, 1)
sns.heatmap(
    conf_mat,
    annot=True,
    cmap="Greens",
    ax=ax,
    fmt='g',
    xticklabels=classes,
    yticklabels=classes
)
# confusion_matrix puts the true labels on rows and the predictions on
# columns; label the axes so the plot can be read without that knowledge.
ax.set_xlabel("Predicted specialty")
ax.set_ylabel("Actual specialty")
# End with plt.show() like the other plotting cells.
plt.show()