In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK data this notebook relies on (needed on first run).
# word_tokenize uses the Punkt sentence tokenizer: NLTK 3.9+ loads it from the
# 'punkt_tab' resource, while older releases load it from 'punkt', so both are
# requested. 'stopwords' supplies the English stopword list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Define the text preprocessing function
def preprocess_text(text_list, csv_file):
    """
    Clean the pitch-deck strings and the descriptions loaded from a CSV file.

    Every string goes through the same steps: lowercase, drop every character
    that is not a-z or whitespace (digits and punctuation included), tokenize
    with NLTK's word_tokenize, remove NLTK English stopwords, and re-join the
    remaining tokens with single spaces.

    Missing values (None, or NaN as pandas produces for blank CSV cells) are
    cleaned to an empty string instead of raising; any other non-string value
    is converted with str() before cleaning.

    Args:
        text_list (list): List of extracted strings from the pitch deck.
        csv_file (str): Path to the CSV file. It must contain a 'Description'
            column; that is the only column this function reads or modifies.

    Returns:
        processed_text_list (list): Cleaned strings, one per entry of
            text_list, in the same order.
        cpt_df (pd.DataFrame): DataFrame loaded from csv_file with its
            'Description' column replaced by the cleaned text.

    Raises:
        KeyError: If the CSV has no 'Description' column.
    """
    # Load the CSV file into a DataFrame
    cpt_df = pd.read_csv(csv_file)

    # Built once per call rather than once per string: loading the stopword
    # corpus is the expensive, loop-invariant part of the cleaning.
    stop_words = set(stopwords.words('english'))
    non_letter_pattern = re.compile(r'[^a-z\s]')

    def clean_text(text):
        if not isinstance(text, str):
            # Blank CSV cells arrive as NaN (a float), which has no .lower();
            # treat every missing value as empty text.
            if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
                return ''
            text = str(text)

        # Lowercase first so the a-z character class below keeps all letters
        text = text.lower()

        # Remove special characters and numbers
        text = non_letter_pattern.sub('', text)

        # Tokenize, drop stopwords, and join back into a cleaned sentence
        words = word_tokenize(text)
        return ' '.join(word for word in words if word not in stop_words)

    # Apply preprocessing to the pitch deck text
    processed_text_list = [clean_text(text) for text in text_list]

    # Apply preprocessing to the descriptions in the DataFrame
    cpt_df['Description'] = cpt_df['Description'].apply(clean_text)

    return processed_text_list, cpt_df

In [None]:
# Location of the CSV file holding the CPT codes and their descriptions.
csv_file_path = 'cpt_codes.csv'

# Clean the pitch-deck strings and the CPT descriptions in one call.
# NOTE(review): pitch_deck_text is not defined in the visible cells; it is
# presumably produced by an earlier extraction step — confirm before running.
processed_text, processed_cpt_df = preprocess_text(
    text_list=pitch_deck_text,
    csv_file=csv_file_path,
)

# Show the cleaned pitch-deck strings, then the first rows of the CPT table.
print("Processed Text List from Pitch Deck:", processed_text, sep="\n")
print("\nProcessed CPT DataFrame:", processed_cpt_df.head(), sep="\n")