In [27]:
import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import contractions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [28]:
df = pd.read_csv('/Users/jheelkamdar/Downloads/CS6120/A1/patient_notes.csv')
print("Original DataFrame:")
print(df.head())
print("\n")

Original DataFrame:
   pn_num  case_num                                         pn_history
0       0         0  17-year-old male, has come to the student heal...
1       1         0  17 yo male with recurrent palpitations for the...
2       2         0  Dillon Cleveland is a 17 y.o. male patient wit...
3       3         0  a 17 yo m c/o palpitation started 3 mos ago; \...
4       4         0  17yo male with no pmh here for evaluation of p...




In [29]:
df['pn_history'] = df['pn_history'].str.lower()
print("After Case Conversion:")
print(df['pn_history'].head())
print("\n")

After Case Conversion:
0    17-year-old male, has come to the student heal...
1    17 yo male with recurrent palpitations for the...
2    dillon cleveland is a 17 y.o. male patient wit...
3    a 17 yo m c/o palpitation started 3 mos ago; \...
4    17yo male with no pmh here for evaluation of p...
Name: pn_history, dtype: object




In [30]:
df['pn_history'] = df['pn_history'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
 

print("After Removing Punctuation:")
print(df['pn_history'].head())
print("\n")

After Removing Punctuation:
0    17yearold male has come to the student health ...
1    17 yo male with recurrent palpitations for the...
2    dillon cleveland is a 17 yo male patient with ...
3    a 17 yo m co palpitation started 3 mos ago \r\...
4    17yo male with no pmh here for evaluation of p...
Name: pn_history, dtype: object




In [31]:
import pkg_resources
from symspellpy import SymSpell, Verbosity
import time

# Load the pre-built dictionary
sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# Function for correcting spelling in a text
def correct_spelling(text):
    suggestions = sym_spell.lookup(text, Verbosity.CLOSEST, max_edit_distance=2)
    corrected_text = suggestions[0].term if suggestions else text
    return corrected_text

# Assuming df['pn_history'] contains your physician notes
start_time = time.time()
df['pn_history'] = df['pn_history'].apply(correct_spelling)
end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

# Print the corrected dataframe
print(df['pn_history'])


Execution time: 0.04101991653442383 seconds
0        17yearold male has come to the student health ...
1        17 yo male with recurrent palpitations for the...
2        dillon cleveland is a 17 yo male patient with ...
3        a 17 yo m co palpitation started 3 mos ago \r\...
4        17yo male with no pmh here for evaluation of p...
                               ...                        
42141    ms madden is a 20 yo female presenting w the w...
42142    a 20 yo f came complain a dull 810 headache th...
42143    ms madden is a 20yo female who presents with a...
42144    stephanie madden is a 20 year old woman compla...
42145    patient is a 20 yo f who presents with a heada...
Name: pn_history, Length: 42146, dtype: object


In [32]:
df['pn_history'] = df['pn_history'].str.replace(r'(\d{2})-(\d{2})-(\d{4})', 'date_placeholder')

df['pn_history'] = df['pn_history'].str.replace(r'\b\d+\b', 'number_placeholder')

df['pn_history'] = df['pn_history'].str.replace(r'\$\s?\d+(\.\d{2})?', 'currency_placeholder')
print("After Standardizing Formats:")
print(df['pn_history'].head())
print("\n")

After Standardizing Formats:
0    17yearold male has come to the student health ...
1    17 yo male with recurrent palpitations for the...
2    dillon cleveland is a 17 yo male patient with ...
3    a 17 yo m co palpitation started 3 mos ago \r\...
4    17yo male with no pmh here for evaluation of p...
Name: pn_history, dtype: object




In [33]:
# Define a dictionary of common contractions and their expansions
contractions_dict = {
    "can't": "cannot",
    "won't": "will not",
    "it's": "it is",
    "don't": "do not",
    "yo": "year old", 
    "y/o": "year old",
    "yearold": "year old",
    "mo": "month old",
    "m/o": "month old",
    "bp": "blood pressure",
    "c/o": "complaint of",
    "co": "complaint of",
    "pmh" : "past medical history",
    "psh" : "past surgical history"
}

# Function to replace contractions using regex
def replace_contractions(text):
    for contraction, expansion in contractions_dict.items():
        text = re.sub(fr'\b{contraction}\b', expansion, text)
    return text

# Apply the contractions replacement function to the 'physician_notes' column
df['pn_history'] = df['pn_history'].apply(replace_contractions)
print(df['pn_history'].head())
print("\n")

0    17yearold male has come to the student health ...
1    17 year old male with recurrent palpitations f...
2    dillon cleveland is a 17 year old male patient...
3    a 17 year old m complaint of palpitation start...
4    17yo male with no past medical history here fo...
Name: pn_history, dtype: object




In [34]:
def replace_gender_in_age(text):
    # Define a regex pattern to identify "m" or "f" before "year old"
    pattern = re.compile(r'\b([mf])\s+year\s+old\b', re.IGNORECASE)
    # Replace "m" with "male" and "f" with "female"
    updated_text = re.sub(pattern, lambda match: 'male' if match.group(1).lower() == 'm' else 'female', text)
    return updated_text

df['pn_history'] = df['pn_history'].apply(replace_gender_in_age)
print(df['pn_history'].head())
print("\n")

0    17yearold male has come to the student health ...
1    17 year old male with recurrent palpitations f...
2    dillon cleveland is a 17 year old male patient...
3    a 17 year old m complaint of palpitation start...
4    17yo male with no past medical history here fo...
Name: pn_history, dtype: object




In [35]:
vectorizer = CountVectorizer()
pn_history_vector = vectorizer.fit_transform(df['pn_history'])
print(pn_history_vector.shape)
feature_names = vectorizer.get_feature_names_out()
print("Number of features:", len(feature_names))

pn_history_df = pd.DataFrame(pn_history_vector.toarray(), columns=feature_names)
pn_history_df.head()

(42146, 64575)
Number of features: 64575


Unnamed: 0,00,000,0000,004am,00h,01,010,010510,011,0110,...,zeromonth,zexually,zigzag,ziminopril,zno,zolpidem,zone,zones,zopidem,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# ---------------------------------------
# Initialize stemmer and lemmatizer
# ---------------------------------------
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    # Remove special characters and numbers
    text = re.sub(r'\W|\d', ' ', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Apply stemming and lemmatization
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    
    return lemmatized_tokens

# ---------------------------------------
# Apply customized tokenizer
# ---------------------------------------

vectorizer = CountVectorizer(tokenizer=custom_tokenizer)
pn_history_vector = vectorizer.fit_transform(df['pn_history'])



In [37]:
stopwords = CountVectorizer(stop_words='english').get_stop_words()
len(stopwords)


# ------------------------------------------------------------------------------
# Adding words to the stop words list 
# (Note the choice of additional stop words is only for illustration)
# ------------------------------------------------------------------------------

nstopwords = list(stopwords) + ['abdomin', 'abdomen', 'abdonmin','age']
print(len(stopwords))
print(len(nstopwords))

# -------------------------------------------------------- 
# repeating the process with the new list of stopwords
# -------------------------------------------------------- 

vectorizer2 = CountVectorizer(tokenizer=custom_tokenizer, stop_words=nstopwords)


# Fit and transform the 'pn_history' column
pn_history_vector = vectorizer2.fit_transform(df['pn_history'])

# Get the feature names (tokens)
feature_names2 = vectorizer2.get_feature_names_out()


print("First few ORIGINAL words:", feature_names[0:100])

print("---------")

print("First few words after new stop list:", feature_names2[0:100])


318
322




First few ORIGINAL words: ['00' '000' '0000' '004am' '00h' '01' '010' '010510' '011' '0110' '0115lb'
 '0151' '01ppd' '02' '0202' '03' '03000400' '0311' '0319' '04' '040'
 '0400' '041' '04c' '04cage' '04currently' '04denies' '04h00' '04no'
 '04sa' '04sexually' '04than' '04tobsince' '04use' '05' '051' '0510'
 '0510ppd' '0515' '051hr' '051pach' '051pack' '051packday' '051packet'
 '051packs' '051packsday' '051pcks' '051pday' '051pk' '051pkday' '051ppd'
 '051ppd15' '051ppd15yo' '051ppd20' '051ppd20y' '051ppd20yr' '051ppd20yrs'
 '051ppdday' '051ppdx' '051ppdx20' '051ppdx20years' '051ppdx20yrs'
 '051ppf' '052' '052pkd' '052ppd' '0531' '054' '05h00' '05ppd'
 '05ppdx20yrs' '05to' '05x20yrs' '06' '06092017' '061317' '07' '075' '08'
 '0827' '09' '0930' '0at' '0cage' '0ccup' '0ct' '0f' '0g0p' '0monogamous'
 '0nce' '0ppd' '0r' '0s' '0sex' '0tobacc0' '0usually' '10' '100' '1000'
 '1000f']
---------
First few words after new stop list: ['aa' 'aabdomen' 'aabov' 'aaccompani' 'aaccord' 'aach' 'aactiv' '

In [40]:
# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words=nstopwords)

# Fit and transform the 'pn_history' column
pn_history_vector = vectorizer.fit_transform(df['pn_history'])

# Print the shape of the vectorized 'pn_history' column
print(pn_history_vector.shape)

# Get the feature names (tokens)
feature_names = vectorizer.get_feature_names_out()

# Print the number of features (tokens)
print("Number of features:", len(feature_names))

# Print the first 100 features
print(feature_names)



(42146, 50271)
Number of features: 50271
['aa' 'aabdomen' 'aabov' ... 'zone' 'zopidem' 'zzz']


In [None]:
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words=nstopwords, use_idf=False)

# Fit and transform the 'pn_history' column
pn_history_vector_count = vectorizer.fit_transform(df['pn_history'])

# Get the feature names (tokens)
feature_names = vectorizer.get_feature_names_out()

# Create DTM with columns as tokens and rows as documents
pn_history_df_count = pd.DataFrame(pn_history_vector_count.toarray(), columns=feature_names)
pn_history_df_count