In [13]:
import pandas as pd

# Read mtsamples.csv into a DataFrame, then keep a handle on just the
# "transcription" column (a Series) for the text-processing steps below.
df = pd.read_csv('mtsamples.csv')
trans_col = df.loc[:, 'transcription']

In [14]:
# Display the first 10 rows
print(trans_col.head(10))

0    SUBJECTIVE:,  This 23-year-old white female pr...
1    PAST MEDICAL HISTORY:, He has difficulty climb...
2    HISTORY OF PRESENT ILLNESS: , I have seen ABC ...
3    2-D M-MODE: , ,1.  Left atrial enlargement wit...
4    1.  The left ventricular cavity size and wall ...
5    PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...
6    PREOPERATIVE DIAGNOSES:,1.  Deformity, right b...
7    2-D ECHOCARDIOGRAM,Multiple views of the heart...
8    PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...
9    DESCRIPTION:,1.  Normal cardiac chambers size....
Name: transcription, dtype: object


In [15]:
# Show the full text of the transcription stored under index label 0
trans_col.loc[0]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [16]:
# Peek at the final five transcriptions (tail() defaults to n=5)
print(trans_col.tail())

4994    HISTORY:,  I had the pleasure of meeting and e...
4995    ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...
4996    SUBJECTIVE: , This is a 42-year-old white fema...
4997    CHIEF COMPLAINT: , This 5-year-old male presen...
4998    HISTORY: , A 34-year-old male presents today s...
Name: transcription, dtype: object


In [17]:
# Summary of the text column: number of non-null entries, number of distinct
# values, the most frequent value and how many times it occurs.
trans_col.describe()

count                                                  4966
unique                                                 2357
top       PREOPERATIVE DIAGNOSIS: , Low back pain.,POSTO...
freq                                                      5
Name: transcription, dtype: object

In [18]:
import nltk

# Download the NLTK tokenizer data (skipped automatically when up to date).
# Older NLTK releases read the pickled 'punkt' models, while newer releases
# load the tokenizer from 'punkt_tab' and raise LookupError in word_tokenize
# if only 'punkt' is present -- fetch both so the cell runs on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Replace NaN values with empty strings so every row tokenizes to a list
# (an empty string simply yields an empty token list).
trans_col = trans_col.fillna('')

# Tokenize each transcription into a list of word / punctuation tokens.
# str(x) is a cheap guard in case a cell holds a non-string object.
trans_col_tokenize = trans_col.apply(lambda x: nltk.word_tokenize(str(x)))


[nltk_data] Downloading package punkt to C:\Users\Atharva
[nltk_data]     Pawar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Inspect the token list produced for the transcription at index label 0
trans_col_tokenize.loc[0]

['SUBJECTIVE',
 ':',
 ',',
 'This',
 '23-year-old',
 'white',
 'female',
 'presents',
 'with',
 'complaint',
 'of',
 'allergies',
 '.',
 'She',
 'used',
 'to',
 'have',
 'allergies',
 'when',
 'she',
 'lived',
 'in',
 'Seattle',
 'but',
 'she',
 'thinks',
 'they',
 'are',
 'worse',
 'here',
 '.',
 'In',
 'the',
 'past',
 ',',
 'she',
 'has',
 'tried',
 'Claritin',
 ',',
 'and',
 'Zyrtec',
 '.',
 'Both',
 'worked',
 'for',
 'short',
 'time',
 'but',
 'then',
 'seemed',
 'to',
 'lose',
 'effectiveness',
 '.',
 'She',
 'has',
 'used',
 'Allegra',
 'also',
 '.',
 'She',
 'used',
 'that',
 'last',
 'summer',
 'and',
 'she',
 'began',
 'using',
 'it',
 'again',
 'two',
 'weeks',
 'ago',
 '.',
 'It',
 'does',
 'not',
 'appear',
 'to',
 'be',
 'working',
 'very',
 'well',
 '.',
 'She',
 'has',
 'used',
 'over-the-counter',
 'sprays',
 'but',
 'no',
 'prescription',
 'nasal',
 'sprays',
 '.',
 'She',
 'does',
 'have',
 'asthma',
 'but',
 'doest',
 'not',
 'require',
 'daily',
 'medication',
 'f

In [23]:
# Lower-case every token in every row's token list, then show the first
# row as a quick sanity check.
trans_col_tok_lowC = trans_col_tokenize.apply(lambda tokens: list(map(str.lower, tokens)))
trans_col_tok_lowC.loc[0]

['subjective',
 ':',
 ',',
 'this',
 '23-year-old',
 'white',
 'female',
 'presents',
 'with',
 'complaint',
 'of',
 'allergies',
 '.',
 'she',
 'used',
 'to',
 'have',
 'allergies',
 'when',
 'she',
 'lived',
 'in',
 'seattle',
 'but',
 'she',
 'thinks',
 'they',
 'are',
 'worse',
 'here',
 '.',
 'in',
 'the',
 'past',
 ',',
 'she',
 'has',
 'tried',
 'claritin',
 ',',
 'and',
 'zyrtec',
 '.',
 'both',
 'worked',
 'for',
 'short',
 'time',
 'but',
 'then',
 'seemed',
 'to',
 'lose',
 'effectiveness',
 '.',
 'she',
 'has',
 'used',
 'allegra',
 'also',
 '.',
 'she',
 'used',
 'that',
 'last',
 'summer',
 'and',
 'she',
 'began',
 'using',
 'it',
 'again',
 'two',
 'weeks',
 'ago',
 '.',
 'it',
 'does',
 'not',
 'appear',
 'to',
 'be',
 'working',
 'very',
 'well',
 '.',
 'she',
 'has',
 'used',
 'over-the-counter',
 'sprays',
 'but',
 'no',
 'prescription',
 'nasal',
 'sprays',
 '.',
 'she',
 'does',
 'have',
 'asthma',
 'but',
 'doest',
 'not',
 'require',
 'daily',
 'medication',
 'f

In [25]:
# Download the NLTK stopwords corpus (if not already downloaded).
# The call is the cell's last expression, so its return value is echoed
# as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Atharva
[nltk_data]     Pawar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Import the NLTK stopwords
from nltk.corpus import stopwords

# Domain-specific stopwords to drop in addition to the standard English ones.
# TODO: the entries below are placeholders -- replace them with real terms,
# written in lowercase because the tokens were lower-cased before this step.
# Keep this a list of plain strings only: a literal `...` here would insert
# the Ellipsis object into the stopword set instead of a word.
custom_stopwords = ["medical_term1", "medical_term2", "medical_term3"]

# Standard English stopwords shipped with NLTK, as a set for O(1) lookups
english_stopwords = set(stopwords.words('english'))

# Union of the English stopwords and the custom domain-specific ones
all_stopwords = english_stopwords.union(custom_stopwords)

# Drop every token that is a stopword; punctuation tokens are not in the
# stopword set, so they are kept by this step.
trans_col_tok_lowC_stopW = trans_col_tok_lowC.apply(
    lambda x: [word for word in x if word not in all_stopwords]
)
trans_col_tok_lowC_stopW[0]

['subjective',
 ':',
 ',',
 '23-year-old',
 'white',
 'female',
 'presents',
 'complaint',
 'allergies',
 '.',
 'used',
 'allergies',
 'lived',
 'seattle',
 'thinks',
 'worse',
 '.',
 'past',
 ',',
 'tried',
 'claritin',
 ',',
 'zyrtec',
 '.',
 'worked',
 'short',
 'time',
 'seemed',
 'lose',
 'effectiveness',
 '.',
 'used',
 'allegra',
 'also',
 '.',
 'used',
 'last',
 'summer',
 'began',
 'using',
 'two',
 'weeks',
 'ago',
 '.',
 'appear',
 'working',
 'well',
 '.',
 'used',
 'over-the-counter',
 'sprays',
 'prescription',
 'nasal',
 'sprays',
 '.',
 'asthma',
 'doest',
 'require',
 'daily',
 'medication',
 'think',
 'flaring',
 'up.',
 ',',
 'medications',
 ':',
 ',',
 'medication',
 'currently',
 'ortho',
 'tri-cyclen',
 'allegra.',
 ',',
 'allergies',
 ':',
 ',',
 'known',
 'medicine',
 'allergies.',
 ',',
 'objective',
 ':',
 ',vitals',
 ':',
 'weight',
 '130',
 'pounds',
 'blood',
 'pressure',
 '124/78.',
 ',',
 'heent',
 ':',
 'throat',
 'mildly',
 'erythematous',
 'without',
 

In [28]:
import spacy

# Load spaCy's "en_core_web_sm" English pipeline once; the resulting `nlp`
# object is the callable that lemmatize_text() uses to process text.
nlp = spacy.load("en_core_web_sm")

In [29]:
# Lemmatization helper built on the module-level spaCy pipeline `nlp`
def lemmatize_text(text):
    """Return the spaCy lemma of each token found in a list of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document; they are joined with single spaces and the
        resulting string is passed through `nlp`.

    Returns
    -------
    list of str
        One lemma per token that spaCy produces. spaCy tokenizes the joined
        string itself, so the result is not aligned one-to-one with the
        input (e.g. hyphenated tokens come back as several pieces).
    """
    joined = " ".join(text)
    lemmas = []
    for tok in nlp(joined):
        lemmas.append(tok.lemma_)
    return lemmas

# Lemmatize every row's stopword-filtered token list, then inspect the
# result for the first transcription.
trans_col_tok_lowC_stopW_Lemm = trans_col_tok_lowC_stopW.map(lemmatize_text)
trans_col_tok_lowC_stopW_Lemm.loc[0]

['subjective',
 ':',
 ',',
 '23',
 '-',
 'year',
 '-',
 'old',
 'white',
 'female',
 'present',
 'complaint',
 'allergy',
 '.',
 'use',
 'allergy',
 'live',
 'seattle',
 'think',
 'bad',
 '.',
 'past',
 ',',
 'try',
 'claritin',
 ',',
 'zyrtec',
 '.',
 'work',
 'short',
 'time',
 'seem',
 'lose',
 'effectiveness',
 '.',
 'use',
 'allegra',
 'also',
 '.',
 'use',
 'last',
 'summer',
 'begin',
 'use',
 'two',
 'week',
 'ago',
 '.',
 'appear',
 'work',
 'well',
 '.',
 'use',
 'over',
 '-',
 'the',
 '-',
 'counter',
 'spray',
 'prescription',
 'nasal',
 'spray',
 '.',
 'asthma',
 'doest',
 'require',
 'daily',
 'medication',
 'think',
 'flare',
 'up',
 '.',
 ',',
 'medication',
 ':',
 ',',
 'medication',
 'currently',
 'ortho',
 'tri',
 '-',
 'cyclen',
 'allegra',
 '.',
 ',',
 'allergy',
 ':',
 ',',
 'know',
 'medicine',
 'allergy',
 '.',
 ',',
 'objective',
 ':',
 ',',
 'vital',
 ':',
 'weight',
 '130',
 'pound',
 'blood',
 'pressure',
 '124/78',
 '.',
 ',',
 'heent',
 ':',
 'throat',
 'm