Preprocessing pipeline

In [1]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from num2words import num2words

In [2]:
def text_preprocessing(input_text, verbose=True):
    """Run a tagged text snippet through an eight-step clean-up pipeline.

    The steps are applied in this order: tag/whitespace clean-up, lowercasing,
    digits to words, punctuation removal, dictionary-based spelling fixes,
    English stop-word removal, Porter stemming and WordNet lemmatization.

    Args:
        input_text: Raw string, possibly containing ``<...>`` marker tags and
            newline characters.
        verbose: When True (the default) the text is printed after every step,
            prefixed with ``Step N: <label>:``. Pass False to run silently.

    Returns:
        The processed text as a single space-separated string.
    """

    def decode_encoding(text):
        """Strip ``<...>`` markers and normalise whitespace."""
        # Replace each marker with a space (not an empty string) so the words
        # on either side of a tag are never glued together.
        decoded_text = re.sub(r"<.*?>", " ", text)
        # Collapse every whitespace run (newlines included) into one space.
        decoded_text = re.sub(r'\s+', ' ', decoded_text).strip()

        # Insert a period after "Employees details" when "Attached" follows it.
        return re.sub(r"Employees details\s*Attached", "Employees details. Attached", decoded_text)

    def lowercase_text(text):
        """Lowercase the whole string."""
        return text.lower()

    def digits_to_words(text):
        """Spell out ordinals (``2nd``) and whole numbers (``12``) as words."""

        def spell(number, kind):
            # num2words hyphenates compounds ("forty-two"); use a space instead
            # so the punctuation step does not fuse them into a single token.
            return num2words(number, to=kind).replace('-', ' ')

        # Ordinals first, otherwise the cardinal pass would consume their digits.
        text = re.sub(
            r'\b(\d+)(?:st|nd|rd|th)\b',
            lambda match: spell(int(match.group(1)), 'ordinal'),
            text,
        )
        # Convert each run of digits as one number, not digit by digit.
        return re.sub(r'\d+', lambda match: spell(int(match.group()), 'cardinal'), text)

    def remove_punctuation(text):
        """Drop every character that is neither a word character nor whitespace."""
        return re.sub(r'[^\w\s]', '', text)

    def correct_spelling(text):
        """Replace known misspellings token by token."""
        spelling_corrections = {
            "pairoll": "payroll",
            "healtcare": "healthcare"
        }
        return ' '.join(spelling_corrections.get(word, word) for word in text.split())

    def remove_stopwords(text):
        """Remove tokens found in NLTK's English stop-word list."""
        stop_words = set(stopwords.words('english'))
        return ' '.join(word for word in text.split() if word not in stop_words)

    def perform_stemming(text):
        """Apply the Porter stemmer to every token."""
        stemmer = PorterStemmer()
        return ' '.join(stemmer.stem(word) for word in text.split())

    def perform_lemmatization(text):
        """Apply the WordNet lemmatizer to every token."""
        # NOTE: this step receives already-stemmed tokens, so it may leave the
        # text unchanged.
        lemmatizer = WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    # (label printed in verbose mode, transformation) in execution order.
    steps = [
        ("Decoding text", decode_encoding),
        ("Lowercasing text", lowercase_text),
        ("Digits to words", digits_to_words),
        ("Punctuation and special characters' removal", remove_punctuation),
        ("Spelling corrections", correct_spelling),
        ("Stopword removal", remove_stopwords),
        ("Stemming", perform_stemming),
        ("Lemmatizing", perform_lemmatization),
    ]

    processed_text = input_text
    for step_number, (label, transform) in enumerate(steps, start=1):
        processed_text = transform(processed_text)
        if verbose:
            print(f"Step {step_number}: {label}:", processed_text)

    return processed_text

In [3]:
# Demo: run the pipeline on a sample message that contains marker tags, a
# newline, a digit, two ordinals and two misspellings.
input_text = "<SUBJECT LINE> Employees details<END><BODY TEXT>Attached are 2 files,\n1st one is pairoll, 2nd is healtcare!<END>"
# text_preprocessing prints each intermediate step itself; only the final
# string is returned.
result = text_preprocessing(input_text)
print("Final output:", result)

Step 1: Decoding text: Employees details. Attached are 2 files, 1st one is pairoll, 2nd is healtcare!
Step 2: Lowercasing text: employees details. attached are 2 files, 1st one is pairoll, 2nd is healtcare!
Step 3: Digits to words: employees details. attached are two files, first one is pairoll, second is healtcare!
Step 4: Punctuation and special characters' removal: employees details attached are two files first one is pairoll second is healtcare
Step 5: Spelling corrections: employees details attached are two files first one is payroll second is healthcare
Step 6: Stopword removal: employees details attached two files first one payroll second healthcare
Step 7: Stemming: employe detail attach two file first one payrol second healthcar
Step 8: Lemmatizing: employe detail attach two file first one payrol second healthcar
Final output: employe detail attach two file first one payrol second healthcar


NER & POS

In [4]:
import spacy


In [5]:
# Loaded spaCy pipelines keyed by model name, so each model is loaded only once.
_NLP_PIPELINES = {}


def _get_nlp(model_name):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _NLP_PIPELINES:
        _NLP_PIPELINES[model_name] = spacy.load(model_name)
    return _NLP_PIPELINES[model_name]


def perform_ner_and_pos(text, model_name="en_core_web_sm"):
    """Extract named entities and part-of-speech tags from ``text`` with spaCy.

    Args:
        text: The string to analyse.
        model_name: Name of the spaCy model to use. The pipeline is cached
            after the first call, so repeated calls do not reload it.

    Returns:
        A tuple ``(ner_output, pos_output)`` where ``ner_output`` is a list of
        ``(entity_text, entity_label)`` pairs taken from ``doc.ents`` and
        ``pos_output`` is a list of ``(token_text, pos_tag)`` pairs, one per
        token in the document.
    """
    nlp = _get_nlp(model_name)

    # Process the input text
    doc = nlp(text)

    # Named Entity Recognition (NER)
    ner_output = [(ent.text, ent.label_) for ent in doc.ents]

    # Part-of-Speech (POS) tagging
    pos_output = [(token.text, token.pos_) for token in doc]

    return ner_output, pos_output

In [6]:
# Demo sentence mixing organisation names with times and relative dates.
# NOTE: this rebinds the notebook-level name `input_text` used by the
# preprocessing demo above.
input_text = "The companies that would be releasing their quarterly reports tomorrow are Microsoft, 4pm, Google, 4pm, and AT&T, 6pm."
ner_result, pos_result = perform_ner_and_pos(input_text)

# Each NER item is an (entity text, entity label) pair; print it as "text LABEL".
print("NER Output:")
for ent in ner_result:
    print(f"{ent[0]} {ent[1]}")

# Each POS item is a (token text, POS tag) pair; print the raw tuple.
print("\nPOS Output:")
for token_pos in pos_result:
    print(token_pos)

NER Output:
quarterly DATE
tomorrow DATE
Microsoft ORG
4pm TIME
Google ORG
4pm TIME
AT&T ORG
6pm TIME

POS Output:
('The', 'DET')
('companies', 'NOUN')
('that', 'PRON')
('would', 'AUX')
('be', 'AUX')
('releasing', 'VERB')
('their', 'PRON')
('quarterly', 'ADJ')
('reports', 'NOUN')
('tomorrow', 'NOUN')
('are', 'AUX')
('Microsoft', 'PROPN')
(',', 'PUNCT')
('4', 'NUM')
('pm', 'NOUN')
(',', 'PUNCT')
('Google', 'PROPN')
(',', 'PUNCT')
('4', 'NUM')
('pm', 'NOUN')
(',', 'PUNCT')
('and', 'CCONJ')
('AT&T', 'PROPN')
(',', 'PUNCT')
('6', 'NUM')
('pm', 'NOUN')
('.', 'PUNCT')
