# The goal
The goal of this notebook is to test different preprocessors to improve the quality of the project name extraction process

## Sonnet's advice

In [1]:
%pip install nltk
%pip install regex

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import regex
from typing import List
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def preprocess_text(text: str, remove_stopwords: bool = True, lowercase: bool = True) -> str:
    # Convert to lowercase if specified
    if lowercase:
        text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords if specified
    if remove_stopwords:
        stop_words = set(stopwords.words('german'))
        tokens = [token for token in tokens if token not in stop_words]
    
    # Join the tokens back into a string
    processed_text = ' '.join(tokens)
    
    # Remove extra whitespace
    processed_text = ' '.join(processed_text.split())
    
    return processed_text

def prepare_input(input_text: str, project_names: List[str]) -> str:
    # Preprocess the input text
    processed_input = preprocess_text(input_text)
    
    # Add context from project names
    project_keywords = set()
    for name in project_names:
        project_keywords.update(preprocess_text(name).split())
    
    # Add relevant keywords from project names to the input if they're not already present
    additional_context = ' '.join([kw for kw in project_keywords if kw not in processed_input.split()])
    
    return f"{processed_input} {additional_context}".strip()

# Example usage
project_names = ["Project Alpha", "Beta System", "Gamma Framework"]
input_text = "We need to update the documentation for the alpha project."
prepared_input = prepare_input(input_text, project_names)
print(f"Original input: {input_text}")
print(f"Prepared input: {prepared_input}")

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/jovyan/nltk_data'
    - '/opt/conda/nltk_data'
    - '/opt/conda/share/nltk_data'
    - '/opt/conda/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## ChatGPT's advice

In [3]:
import re
import string
from nltk.corpus import stopwords

class TextPreprocessor:
    def __init__(self, stop_words_language="german"):
        self.stop_words = set(stopwords.words(stop_words_language))

    def preprocess(self, text: str) -> str:
        # Remove extra spaces
        text = text.strip()

        # Remove punctuation (except for meaningful ones like apostrophes)
        text = text.translate(str.maketrans('', '', string.punctuation.replace("'", "")))

        # Lowercase the text
        text = text.lower()

        # Remove stopwords
        text = ' '.join([word for word in text.split() if word not in self.stop_words])

        # Remove digits
        text = re.sub(r'\d+', '', text)

        return text


In [5]:
preprocessor = TextPreprocessor()
text = "PRACHT-ALTBAU SENDLINGER TOR Sehr geehrte Damen und Herren in ihrem Auszahlungsplan wird für den 30.07.2020 Verzugszinsen in Höhe von 24,53 € brutto und die Zahlung 6 mit 100 € ausgewiesen. Die 100 € wurden meinem Konto am 25.07.2020 gutgeschrieben. Eine Zahlung der Zinsen erfolgte durch den Dienstleister nicht. Bitte teilen Sie mir mit, wann die eingestellte Zinszahlung erfolgen wird. Vielen Danke für Ihre Rückmeldung. Mit freundlichen Grüßen Max Mustermann"
cleaned_text = preprocessor.preprocess(text)
print(cleaned_text)


prachtaltbau sendlinger tor geehrte damen herren auszahlungsplan  verzugszinsen höhe  € brutto zahlung   € ausgewiesen  € wurden konto  gutgeschrieben zahlung zinsen erfolgte dienstleister bitte teilen wann eingestellte zinszahlung erfolgen vielen danke rückmeldung freundlichen grüßen max mustermann
