# Cleaning Raw Data (Without Regex Pattern)

In [2]:
import re

noised_text = "teh qwik brwon fx jmps ovr teh lazi dag!!!"

def preprocessing_step(text):
  text = text.lower() # lowercase all senctences
  text = re.sub('teh', 'the', text)
  text = re.sub('dag','dog', text)
  text = re.sub('qwik', 'quick', text)
  text = re.sub('jmps', 'jumps', text)
  text = re.sub('brwon', 'brown', text)
  text = re.sub('lazi', 'lazy', text)
  text = re.sub('fx', 'fox', text)
  text = re.sub('ovr', 'over', text)
  return text

print(preprocessing_step(noised_text))


the quick brown fox jumps over the lazy dog!!!


In [3]:
# 1. The quick brown fox jumps over the lazy dog
noised_text = "teh qwik brwon fx jmps ovr teh lazi dag!!!"

# Preprocessing
def preprocessing_step_optimized(text):
    text = text.lower()
    replacements = {
        'teh': 'the',
        'dag': 'dog',
        'qwik': 'quick',
        'jmps': 'jumps',
        'brwon': 'brown',
        'lazi': 'lazy',
        'fx': 'fox',
        'ovr': 'over'
    }
    # Use a regex pattern to find words and replace them if in the replacements dictionary
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in replacements.keys()) + r')\b')
    text = pattern.sub(lambda x: replacements[x.group(0)], text)
    return text

print(preprocessing_step_optimized(noised_text))

the quick brown fox jumps over the lazy dog!!!


In [4]:
# using regex pattern
raw_text = "D@ta prepr0cessing iz essenntial 4 accurte NLP modls"

#2. Data preprocessing is essential for accurate NLP models.


# Preprocessing
def clean_text_with_regex(text):
  text = text.lower()
  reg = r"[^a-zA-Z\s]"
  cleaned_text = re.sub(reg, '', text)
  replacement = {
      "dta":'data',
      "preprcessing":'preprocessing',
      "essenntial":'essential',
      "accurte":'accurate',
      "modls":'models',
      'iz': 'is'
  }

  pattern = re.compile(r'\b('+ '|'.join(re.escape(key) for key in replacement.keys())+ r')\b')
  cleaned_text = pattern.sub(lambda x: replacement[x.group(0)], cleaned_text)
  cleaned_text = re.sub(r"\s+", " ", cleaned_text)
  return cleaned_text

clean_text_with_regex(raw_text)

'data preprocessing is essential accurate nlp models'

In [5]:
#3. He couldn’t believe how effective the algorithm was.
raw_text = "He cudnt beleev how effctiv the algorithem wuz"

# Preprocessing
def prep_step(txt):
  txt = txt.lower()
  replacement = {
      'cudnt': 'could not',
      'effctiv': 'effective',
      'algorithem': 'algorithm',
      'wuz': 'was',
      'beleev': 'believe'
  }
  pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in replacement.keys()) + r')\b')
  txt = pattern.sub(lambda x: replacement[x.group(0)], txt)
  return txt

cleaned_text3 = prep_step(raw_text)
print(cleaned_text3)

he could not believe how effective the algorithm was


In [6]:
#4. Natural language understanding is a core task in AI.
raw_text =  "Naturall lngg understanting's a coor tazk in AI"

# Preprocessing
def clean_text(text):
  text = text.lower()
  replacement = {
      'Naturall':'Natural',
      'lngg': 'language',
      "understanting's": 'understanding is',
      'coor': 'core',
      'tazk': 'task'
  }
  pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in replacement.keys()) + r')\b')
  text = pattern.sub(lambda x: replacement[x.group(0)], text)
  return text

clean_text4 = clean_text(raw_text)

In [7]:
#5. Tokenization splits text into smaller units for analysis.

raw_text = 'Tokenisashun spl1tz txt into smoller unitts fr analysys'

# Preprocessing
def clean_text(text):
  text = text.lower()
  replacement = {
      'tokenisashun': 'tokenization',
      'spl1tz': 'splits',
      'txt': 'text',
      'into': 'for',
      'smoller': 'smaller',
      'unitts': 'units',
      'fr': 'for',
      'analysys': 'analysis'
  }

  pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in replacement.keys())+r')\b')
  text = pattern.sub(lambda x: replacement[x.group(0)], text)
  return text

clean_text5 = clean_text(raw_text)

# Using Regex Pattern

In [13]:
# Focus: Remove non-alphabetic characters, preserve meaningful tokens.
raw_data ='@rguments%%can-b-#illogical...!!'
reg = '[^\w\s+]'
replacement = {
    'rguments' : 'arguments',
    'b': 'be'
}

clean_txt = re.sub(reg, ' ', raw_data)
pattern = re.compile(r'\b(' + "|".join(re.escape(key) for key in replacement.keys()) + r')\b')
clean_txt = pattern.sub(lambda x: replacement[x.group(0)], clean_txt)
print(clean_txt)

 arguments  can be  illogical     


In [14]:
# Word correction via \b() groups and replacements.
raw_data = 'Speling missteaks izz nott alweys obvius'
replacement = {
    'missteaks': 'mistakes',
    'izz': 'is',
    'alweys': 'always',
    'nott': 'not',
    'Speling':'Spelling'
}
pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in replacement.keys()) + r')\b')
clean_txt = pattern.sub(lambda x: replacement[x.group(0)], raw_data)
print(clean_txt)

Spelling misspelling is not always obvius


In [20]:
# Detect and reduce excessive character repetition (hint: (.)\1{2,}).
raw_data = 'Repetitionnnnn issssss an errrrorrrr...'
regex = r'(.)\1{2,}'
clean_data = re.sub(regex, r'\1', raw_data)
print(clean_data)

Repetition is an eror.


In [23]:
# Extract words amidst numbers/underscores using \w, \d, and boundaries.
raw_data = '123Repl4ce_th3nClean567!'
regex = r'\w+'
clean_data = re.sub(regex, 'Replace then Clean', raw_data)
print(clean_data)

Replace then Clean!


In [31]:
# Smart tokenization—preserve underscores in named entities, split on dashes/punctuation elsewhere.
raw_data = 'do_not-split_me.but-split_this one'
regex = r'\W+'
clean_data = re.sub(regex, ' ', raw_data)
print(clean_data)

do_not split_me but split_this one
