# Step 1: Reading the CSV File
First, read the CSV file containing the physician's notes.

In [1]:
import pandas as pd

# Load the raw table; the processed text is written back to this same frame later.
df = pd.read_csv('patient_notes.csv')

# The free-text notes are in the 'pn_history' column. Every later step
# rebinds `notes` to its transformed version.
notes = df['pn_history']

# Step 2: Case Conversion
Convert all text to lower case to ensure uniformity.

In [2]:
# Lower-case every note through the pandas string accessor.
notes = notes.str.lower()

# Step 3: Handling Contractions
Expand contractions using the `contractions` library:

In [3]:
import contractions

def expand_contractions(text):
    """Return *text* with its contractions expanded by ``contractions.fix``.

    A thin named wrapper so the step can be passed to ``Series.apply``.
    """
    expanded = contractions.fix(text)
    return expanded

notes = notes.apply(expand_contractions)

# Step 4: Standardizing Formats
Standardize formats for dates, numbers, and currencies using regular expressions and Python's datetime library for dates.

In [4]:
import re
from dateutil.parser import parse

# Define a function to standardize date formats
def standardize_dates(text):
    """Return the date found in *text* formatted as ``YYYY-MM-DD``.

    *text* is handed to ``dateutil.parser.parse`` in fuzzy mode. If no valid
    date can be produced from it, *text* is returned unchanged, which makes
    the function safe to use as a ``re.sub`` replacement callback.
    """
    try:
        # Formatting stays inside the try so a failure while rendering the
        # date also falls back to the original text.
        return parse(text, fuzzy=True).strftime('%Y-%m-%d')
    except (ValueError, OverflowError):
        # dateutil raises ValueError (ParserError) for unparseable input and
        # OverflowError for values too large for the platform; either way
        # keep the original text instead of aborting the whole pipeline.
        return text

# Apply the function to standardize dates in each note.
# Only substrings made of three numeric fields separated by '/' or '-'
# (1-2 digits, 1-2 digits, 2-4 digits) are passed to standardize_dates;
# the rest of each note is left untouched.
notes = notes.apply(lambda x: re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', lambda match: standardize_dates(match.group()), x))

# Define a function to standardize age
def standardize_age(text):
    """Rewrite age abbreviations such as ``35yo`` as ``35 years old``.

    Recognised suffixes are ``yo``, ``y.o``, ``y.o.`` and ``y/o``, optionally
    separated from the number by a single whitespace character. All other
    text is returned unchanged.
    """
    # The final dot is escaped and optional, so the pattern no longer consumes
    # whatever character follows "y.o" (previously "35 y.o male" lost its
    # space). The trailing (?!\w) stops ordinary words that merely start with
    # "yo" (e.g. "2 young children") from being rewritten.
    return re.sub(r'(\d{1,2})\s?(?:yo|y\.o\.?|y/o)(?!\w)', r'\1 years old', text)

# Apply the function to standardize age abbreviations in each note
notes = notes.apply(standardize_age)


# Define a function to standardize numbers (remove commas from large numbers)
def standardize_numbers(text):
    """Remove thousands separators from numbers (``1,234,567`` -> ``1234567``).

    A comma is dropped only when it follows a digit and is followed by
    exactly three digits ending at a word boundary, so commas used as
    ordinary punctuation are left alone.
    """
    # Lookarounds match the comma itself without consuming the digits around
    # it, so every separator of a multi-group number is removed. A consuming
    # pattern would stop after the first comma.
    return re.sub(r'(?<=\d),(?=\d{3}\b)', '', text)

# Apply the function to standardize numbers in each note
# (runs after date and age handling, so those substrings are already rewritten)
notes = notes.apply(standardize_numbers)

# Define a function to standardize currency symbols to their text equivalents
def standardize_currencies(text):
    """Replace currency symbols with their three-letter code plus a space.

    ``$`` becomes ``USD ``, the pound sign ``GBP `` and the euro sign ``EUR ``.
    """
    # (pattern, replacement) pairs, applied in this order.
    symbol_codes = (
        (r'\$', 'USD '),
        (r'£', 'GBP '),
        (r'€', 'EUR '),
    )
    for pattern, code in symbol_codes:
        text = re.sub(pattern, code, text)
    return text

# Apply the function to standardize currencies in each note.
# This must run before punctuation removal, which would otherwise turn the
# currency symbols into spaces.
notes = notes.apply(standardize_currencies)

# Step 5: Removing Punctuation and Special Characters
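Replace non-alphanumeric characters with a space to prevent words from merging. Note that this also applies to the dates formatted in Step 4: their hyphens are replaced too (e.g. `2021-03-05` becomes `2021 03 05`).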

In [5]:
# Updated function to remove all punctuation and special characters
def remove_all_punctuation(text):
    """Replace each character that is neither a word character nor whitespace with a space.

    Word characters (``\\w``) include letters, digits and the underscore, so
    underscores are kept.
    """
    non_word = re.compile(r'[^\w\s]')
    return non_word.sub(' ', text)

# Apply the updated function to remove all punctuation from each note
notes = notes.apply(remove_all_punctuation)


# Step 6: Stemming and Lemmatization
After addressing the specific format standardizations, consider reducing words to their root forms

In [6]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the tokenizer model and the WordNet corpus used by the helpers below.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt'; add it here if word_tokenize raises LookupError. Confirm against
# the installed version.
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

# Module-level instances shared by stem_text and lemmatize_text.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_text(text):
    """Tokenize *text*, stem each token with ``stemmer`` and re-join with single spaces."""
    tokens = word_tokenize(text)
    return ' '.join(stemmer.stem(token) for token in tokens)

def lemmatize_text(text):
    """Tokenize *text*, lemmatize each token with ``lemmatizer`` and re-join with single spaces."""
    tokens = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Choose either stemming or lemmatization based on your analysis needs.
# Stemming is the active choice; swap which of the two lines below is
# commented out to lemmatize instead.
notes = notes.apply(stem_text)
# notes = notes.apply(lemmatize_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\52347\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\52347\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Step 7: Apply a Stop Word List
Remove common words that add little semantic value to reduce noise and focus on the meaningful content in the text.

In [7]:
from nltk.corpus import stopwords

# Fetch the stop-word corpus, then hold the English list as a set so the
# per-token membership test in remove_stop_words is a hash lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """Return *text* without the tokens that appear in ``stop_words``.

    Tokens are compared verbatim against the set and the survivors are
    re-joined with single spaces.
    """
    # NOTE(review): the notes are stemmed before this step, so stop words
    # whose stem differs from the listed form may not be filtered here.
    # Confirm the intended step order.
    kept = (token for token in word_tokenize(text) if token not in stop_words)
    return ' '.join(kept)

notes = notes.apply(remove_stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\52347\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Step 8: Correcting Typos and Spelling
Correct typos and spelling with autocorrect's `Speller`. To reduce processing time, this step is placed at the end of the pipeline and run with multithreading.

In [8]:
from autocorrect import Speller
from concurrent.futures import ThreadPoolExecutor

# Single English spell-checker instance, shared by every correct_spelling call.
spell = Speller(lang='en', fast=True)

def correct_spelling(text):
    """Return *text* after passing it through the module-level ``spell`` checker."""
    return spell(text)

# Function to apply spelling correction using multithreading
def apply_spell_correction_with_multithreading(series, num_workers=16):
    """Run ``correct_spelling`` over every item of *series* on a thread pool.

    Args:
        series: Iterable of texts to correct.
        num_workers: Maximum number of worker threads.

    Returns:
        A plain list of corrected texts, in the same order as *series*.
    """
    # NOTE(review): if the speller is CPU-bound, threads may give little
    # speed-up under the GIL. Confirm with a benchmark.
    pool = ThreadPoolExecutor(max_workers=num_workers)
    with pool:
        # map() yields results in submission order; list() drains them
        # before the pool is shut down.
        return list(pool.map(correct_spelling, series))

# Apply the multithreaded spelling correction.
# From here on `notes` is a plain list (the helper's return value), not a Series.
notes = apply_spell_correction_with_multithreading(notes)

# Step 9: Update DataFrame
Update your DataFrame with the processed notes.

In [9]:
# `notes` is a list at this point, so its values are assigned to the rows by position.
df['processed_pn_history'] = notes

# Step 10: Save Processed Data
Save the processed data for further analysis or use in subsequent machine learning models.

In [10]:
# Write the frame, including the new processed column, without the row index.
df.to_csv('processed_patient_notes.csv', index=False)
