# English Text Preprocessor Demo

### Environment Setup

In [1]:
import sqlite3
import sys
from pathlib import Path

import pandas as pd

# Make the project's code directory importable.
# NOTE(review): this previously read ".../code" (three dots), which names a
# literal directory called "..." instead of the parent-relative "../code".
# Assumed to be a typo, mirroring DATA_DIR below -- confirm the repo layout.
CODE_DIR = Path("../code").resolve()
if str(CODE_DIR) not in sys.path:  # keep re-runs of this cell from stacking duplicates
    sys.path.append(str(CODE_DIR))

# Project-local imports: these must come after the sys.path tweak above
from preprocess_en import (
    clean_text,
    tokenize_text,
    merge_contractions,
    remove_stopwords,
    lemmatize_tokens,
    process_file,
)
from utils import read_text_file, write_text_file

# File paths (relative to the notebook's working directory)
DATA_DIR = Path("../data")
INPUT_FILE = DATA_DIR / "sample_en.txt"
DB_FILE = DATA_DIR / "processed_results.db"

### Preview Sample Text

In [2]:
# Read the sample file and show its first five entries as a bulleted list
lines = read_text_file(INPUT_FILE)
print("Original Texts:\n")
for line in lines[:5]:
    print("-", line)

Orignal Texts:

- Artificial intelligence is transforming the world rapidly!
- NASA's new rover landed on Mars in 2021.
- @user I totally disagree — that's not the point. http://example.com
- Big data and ML are changing industries.
- Python, Java, and C++ are popular programming languages.


### Preprocess Sample Text

In [3]:
# Use the third sample line (index 2) for the step-by-step walkthrough below
text = lines[2]
print("Selected Text: {}".format(text))

Selected Text: @user I totally disagree — that's not the point. http://example.com


In [4]:
# Step 1: run the selected line through clean_text
cleaned = clean_text(text)
print("Cleaned Text: {}".format(cleaned))

Cleaned Text: i totally disagree that s not the point


In [5]:
# Step 2: split the cleaned text into tokens
tokens = tokenize_text(cleaned)
print("Tokens: {}".format(tokens))

Tokens: ['i', 'totally', 'disagree', 'that', 's', 'not', 'the', 'point']


In [6]:
# Step 3: drop stopwords from the token list
filtered_tokens = remove_stopwords(tokens)
print("Filtered Tokens: {}".format(filtered_tokens))

Filtered Tokens: ['totally', 'disagree', 'point']


In [7]:
# Step 4: lemmatize the tokens that survived stopword removal
lemmas = lemmatize_tokens(filtered_tokens)
print("Lemmas: {}".format(lemmas))

Lemmas: ['totally', 'disagree', 'point']


### Process and Store Entire File

In [8]:
# Run the whole pipeline over the sample file (process_file takes the path as a str).
# NOTE(review): DB_FILE is not passed to process_file; presumably it writes to
# this location by default -- confirm against preprocess_en.
process_file(str(INPUT_FILE))
print("Processed File Stored in: {}".format(DB_FILE))

Original: Artificial intelligence is transforming the world rapidly!
Cleaned: artificial intelligence is transforming the world rapidly
Tokens: ['artificial', 'intelligence', 'transforming', 'world', 'rapidly']
Lemmas: ['artificial', 'intelligence', 'transform', 'world', 'rapidly']
----------------------------------------
Original: NASA's new rover landed on Mars in 2021.
Cleaned: nasa s new rover landed on mars in 2021
Tokens: ['nasa', 'new', 'rover', 'landed', 'mars', '2021']
Lemmas: ['nasa', 'new', 'rover', 'land', 'mar', '2021']
----------------------------------------
Original: @user I totally disagree — that's not the point. http://example.com
Cleaned: i totally disagree that s not the point
Tokens: ['totally', 'disagree', 'point']
Lemmas: ['totally', 'disagree', 'point']
----------------------------------------
Original: Big data and ML are changing industries.
Cleaned: big data and ml are changing industries
Tokens: ['big', 'data', 'ml', 'changing', 'industries']
Lemmas: ['big'

### Inspect DB Results

In [12]:
# Load the first ten rows of the `processed` table into a DataFrame.
# try/finally guarantees the connection is closed even if the query raises
# (for example when the table does not exist yet).
conn = sqlite3.connect(DB_FILE)
try:
    df = pd.read_sql_query("SELECT * FROM processed LIMIT 10;", conn)
finally:
    conn.close()

df

Unnamed: 0,id,original_text,cleaned_text,tokens,lemmas
0,1,Artificial intelligence is transforming the wo...,artificial intelligence is transforming the wo...,artificial intelligence transforming world rap...,artificial intelligence transform world rapidly
1,2,NASA's new rover landed on Mars in 2021.,nasa s new rover landed on mars in 2021,nasa new rover landed mars 2021,nasa new rover land mar 2021
2,3,@user I totally disagree — that's not the poin...,i totally disagree that s not the point,totally disagree point,totally disagree point
3,4,Big data and ML are changing industries.,big data and ml are changing industries,big data ml changing industries,big datum ml change industry
4,5,"Python, Java, and C++ are popular programming ...",python java and c are popular programming lang...,python java c popular programming languages,python java c popular programming language
5,6,The quick brown fox jumps over the lazy dog.,the quick brown fox jumps over the lazy dog,quick brown fox jumps lazy dog,quick brown fox jump lazy dog
6,7,COVID-19 vaccines are effective and widely dis...,covid 19 vaccines are effective and widely dis...,covid 19 vaccines effective widely distributed,covid 19 vaccine effective widely distribute
7,8,“Hello world!” is the classic first program fo...,hello world is the classic first program for b...,hello world classic first program beginners,hello world classic first program beginner
8,9,Check out this link: https://www.example.org,check out this link,check link,check link
9,10,Email me at john.doe@example.com for details.,email me at john doe com for details,email john doe com details,email john doe com detail


### Visualize Token Count

In [13]:
# Count tokens per row: each `tokens` value is a whitespace-separated string,
# so the count is the number of pieces produced by str.split()
df["token_count"] = df["tokens"].map(lambda joined: len(joined.split()))
df[["original_text", "token_count"]]

Unnamed: 0,original_text,token_count
0,Artificial intelligence is transforming the wo...,5
1,NASA's new rover landed on Mars in 2021.,6
2,@user I totally disagree — that's not the poin...,3
3,Big data and ML are changing industries.,5
4,"Python, Java, and C++ are popular programming ...",6
5,The quick brown fox jumps over the lazy dog.,6
6,COVID-19 vaccines are effective and widely dis...,6
7,“Hello world!” is the classic first program fo...,6
8,Check out this link: https://www.example.org,2
9,Email me at john.doe@example.com for details.,5
