In [23]:
import re #For regular expression searching
import os
import string 
from bs4 import BeautifulSoup #For Deleting HTML tag
import emoji 
import pandas as pd
import nltk #For Tokenization and Lemmatization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy #For More accurate Lemmatization
from tqdm.notebook import tqdm 

In [3]:
#Several necessary packages
#nltk
try:
    nltk.download('punkt',quiet=True) #for word_toknize,quiet mode
    nltk.download('stopwords',quiet=True) #Download dictionary of stopwords
    nltk.download('wordnet',quiet=True) #Dataset of word lemmatization 
except Exception as e:
    print(f'Wrong for downloading:{e}')

#Spacy Faster and more accurate lemmatization
try:
    nlp = spacy.load('en_core_web_sm',disable=["parser","ner"]) #Don't need to download parser and ner
except:
    nlp = None
    
stop_words = set(stopwords.words('english')) #Transfer list to set, O(n) --> O(1)
lemmatizer = WordNetLemmatizer()

In [4]:
print(nlp)

<spacy.lang.en.English object at 0x16c4653d0>


In [5]:
def pre_clean(text):
    """Pre-clean function: 
    like removing HTML tag; Emoji and special punctuation"""
    if not isinstance(text,str) or not text.strip(): #if text is str format or only whitespace
        return ""
    # remove html tag
    text = BeautifulSoup(text,'html.parser').get_text() 
    
    # remove emoji to null string
    text = emoji.replace_emoji(text,replace = '')
    
    #remove special character string
    text = re.sub(r'[^\w\s.,!?\'"-]', '', text)
    #we define legal character,any character not in list will be removed
    
    #remove several whitespace/tab to single whitespace
    text = re.sub(r'\s+',' ',text.strip())
    
    #remove several whitespace/tab before puncuation
    text = re.sub(r'\s+([.,!?])',r'\1',text)
    text = re.sub(r'\s+',' ',text.strip())
    
    text = re.sub(r'www.\w+.com','',text)
    
    return text

In [6]:
str_example = 'any %&((() üòÅ))   student $@***! www.example.com'
print(pre_clean(str_example))

any student! 


In [None]:
# def clean_english_text(
#     text,
#     batch_size = 500, #less memory
#     n_process  = 4,
#     use_spacy_lemmatize:bool = True,
#     remove_stopwords:bool = True,
#     show_progress = True): #return iteration
#     """Clean English text and return both cleaned result and statistics
#     """
#     if isinstance(text,str):
#         text = [text]
        
#     if nlp is None:print('NLP no download')
#     # Use nlp.pipe() for batch processing
#     #nlp.pipe() us designed for large-scale text processing
#     pipe_iterator = nlp.pipe(
#         text,
#         batch_size=batch_size,
#         n_process = n_process,
#         disable = ['parser','ner'] if use_spacy_lemmatize else ['tagger','parser','ner','lemmatizer']
#     )
    
#     #show progress
#     iterator = tqdm(pipe_iterator,total = len(text),desc = "Cleaning(pipe)",unit="text") \
#     if show_progress else pipe_iterator
    
#     total_stats = {
#     "total_original_chars": 0,
#     "total_cleaned_chars": 0,
#     "total_removed_chars": 0,
#     "total_original_words": 0,
#     "total_cleaned_words": 0,
#     "removed_ratio": 0.0}
    
#     #main iteration
#     for doc in iterator:
#         original_text = doc.text
#         original_len = len(doc.text)
#         original_words = len(original_text.split())
        
#         if use_spacy_lemmatize:    #Token lemmatization
#             tokens = [
#                 token.lemma_
#                 for token in doc  #nlp.pipe() will tokenize the doc automatically
#                 if not token.is_punct
#                 and not token.is_space
#                 and token.text.strip()
#             ]
#         else:
#             tokens = word_tokenize(original_text.lower())
#             tokens = [lemmatizer.lemmatize(t) for t in tokens]
        
#         #remove all stopwards
#         if remove_stopwords:
#             tokens = [t for t in tokens if t not in stop_words and len(t)>1]
#         cleaned = ' '.join(tokens)      #recombine all tokens
#         cleaned_len = len(cleaned)
#         cleaned_words = len(tokens)
        
#         stats = {'removed_chars':original_len - cleaned_len,
#                  "remove_ratio":round((original_len-cleaned_len)/original_len,2)\
#                  if original_len >0 else 0.0}
        
#         #sum of all cleaned
#         total_stats["total_original_chars"] += original_len
#         total_stats["total_cleaned_chars"] += cleaned_len
#         total_stats["total_removed_chars"] += stats["removed_chars"]
#         total_stats["total_original_words"] += original_words
#         total_stats["total_cleaned_words"] += cleaned_words
        
#         yield cleaned

#     if total_stats["total_original_chars"] > 0:
#         total_stats["removed_ratio"] = round(
#             total_stats["total_removed_chars"] / total_stats["total_original_chars"] * 100,
#             1
#         )
#     print("\n" + "‚ïê" * 60)
#     print("Text Cleaning Summary (Pipe Mode):")
#         # Handle case where texts is a generator (no len())
#     print(f"Total texts processed : {len(text):,}" if hasattr(text, '__len__') else "Unknown (streaming)")
#     print(f"Total original characters : {total_stats['total_original_chars']:,}")
#     print(f"Total cleaned characters : {total_stats['total_cleaned_chars']:,}")
#     print(f"Total characters removed : {total_stats['total_removed_chars']:,} ({total_stats['removed_ratio']}%)")
#     print(f"Total original words : {total_stats['total_original_words']:,}")
#     print(f"Total cleaned words : {total_stats['total_cleaned_words']:,}")
#     print("‚ïê" * 60 + "\n")

In [52]:
def clean_english_text(
    text,
    batch_size = 500, #less memory
    n_process  = 4,
    use_spacy_lemmatize:bool = False,
    remove_stopwords:bool = True,
    show_progress = True): #return iteration
    """Clean English text and return both cleaned result and statistics
    Use nltk because efficency
    """
    if isinstance(text,str):
        text = [text]
        
    if nlp is None:print('NLP no download')
    
    #show progress
    iterator = tqdm(text,total = len(text),desc = "Cleaning",unit="text") \
    if show_progress else text
    
    total_stats = {
    "total_original_chars": 0,
    "total_cleaned_chars": 0,
    "total_removed_chars": 0,
    "total_original_words": 0,
    "total_cleaned_words": 0,
    "removed_ratio": 0.0}
    
    #main iteration
    for original_text in iterator:
        original_len = len(original_text)
        original_words = len(original_text.split())
        
        #Word Lemmatization
        tokens = word_tokenize(original_text.lower())
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
        
        #remove all stopwards
        if remove_stopwords:
            tokens = [t for t in tokens if t not in stop_words and len(t)>1]
        cleaned = ' '.join(tokens)      #recombine all tokens
        cleaned_len = len(cleaned)
        cleaned_words = len(tokens)
        
        stats = {'removed_chars':original_len - cleaned_len,
                 "remove_ratio":round((original_len-cleaned_len)/original_len,2)\
                 if original_len >0 else 0.0}
        
        #sum of all cleaned
        total_stats["total_original_chars"] += original_len
        total_stats["total_cleaned_chars"] += cleaned_len
        total_stats["total_removed_chars"] += stats["removed_chars"]
        total_stats["total_original_words"] += original_words
        total_stats["total_cleaned_words"] += cleaned_words
        
        yield cleaned

    if total_stats["total_original_chars"] > 0:
        total_stats["removed_ratio"] = round(
            total_stats["total_removed_chars"] / total_stats["total_original_chars"] * 100,
            1
        )
    print("\n" + "‚ïê" * 60)
    print("Text Cleaning Summary (Pipe Mode):")
        # Handle case where texts is a generator (no len())
    print(f"Total texts processed : {len(text):,}" if hasattr(text, '__len__') else "Unknown (streaming)")
    print(f"Total original characters : {total_stats['total_original_chars']:,}")
    print(f"Total cleaned characters : {total_stats['total_cleaned_chars']:,}")
    print(f"Total characters removed : {total_stats['total_removed_chars']:,} ({total_stats['removed_ratio']}%)")
    print(f"Total original words : {total_stats['total_original_words']:,}")
    print(f"Total cleaned words : {total_stats['total_cleaned_words']:,}")
    print("‚ïê" * 60 + "\n")

In [49]:
test_texts = [
    "Check out my new video! https://youtu.be/dQw4w9WgXcQ üòéüî• @user123",
    "The quick brown fox jumps over the lazy dog!!!",
    "<p>Hello <b>world</b>!!!</p> Visit www.example.com for more info.",
    "I can't believe it's already 2026... unbelievable!!! ‚ù§Ô∏èüöÄ",
    "This is a very very long sentence with lots of the and is and are to test stopwords removal.",
    "",  
    "   \n\t   ",  
    "Python is awesome! Python is great! Python Python Python."  
]
test_texts = [pre_clean(i) for i in test_texts]
gen = clean_english_text(test_texts,n_process=1)
for exp_text in gen:
    print(exp_text)

Cleaning:   0%|          | 0/8 [00:00<?, ?text/s]

check new video httpsyoutu.bedqw4w9wgxcq user123
quick brown fox jump lazy dog
hello world visit info
ca n't believe 's already 2026 ... unbelievable
long sentence lot test stopwords removal


python awesome python great python python python

‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 8
Total original characters : 339
Total cleaned characters : 234
Total characters removed : 105 (31.0%)
Total original words : 57
Total cleaned words : 36
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê



In [45]:
file_path = '/Users/fusiyuan/Desktop/HKU Courses/Text Analytics and Natural Language Processing/MFIN7036 Code and Data-20251210/my code/Group work/bitcoin_comments_sorted.parquet'
df = pd.read_parquet(file_path,columns=['datetime','body','score'], engine='pyarrow')

In [50]:
tqdm.pandas()
df_sample = df.head(100).copy()
df_sample['pre_cleaned'] = df_sample['body'].progress_apply(pre_clean)
pre_cleaned_list = list(df_sample['pre_cleaned'])
gen = clean_english_text(pre_cleaned_list,n_process=1,show_progress = True)
cleaned_results = []
for term in tqdm(gen,total = len(pre_cleaned_list),desc = 'Collecting Result'):    #End = len(pre_cleaned_list)
    cleaned_results.append(term)

df_sample['Cleaned_body'] = cleaned_results

print(df_sample[['body','pre_cleaned','Cleaned_body']][:10])

  0%|          | 0/100 [00:00<?, ?it/s]

Collecting Result:   0%|          | 0/100 [00:00<?, ?it/s]

Cleaning:   0%|          | 0/100 [00:00<?, ?text/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100
Total original characters : 55,483
Total cleaned characters : 37,394
Total characters removed : 18,089 (32.6%)
Total original words : 9,413
Total cleaned words : 5,221
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

                                                body  \
0  Interesting, it uses IRC as a high level proto...   
1  No - the richest person will be the one with t...   
2  &gt;a public list of all the previous transact...   
3  No, that's not how bitcoin works, check out th...   
4  It's weird how Ron Paul gets money so incredib...   
5  Some reddit thought on Bitcoin [here](http://w...   
6  Scroll dow

In [53]:
import warnings
from bs4 import MarkupResemblesLocatorWarning
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
import pandas as pd
import os
import gc

chunk_size = 100000
final_out_put_path = '/Users/fusiyuan/Desktop/HKU Courses/bitcoin_comments_sorted_cleaned_full.parquet'
output_dir = '/Users/fusiyuan/Desktop/HKU Courses/cleaned_chunks/'
os.makedirs(output_dir, exist_ok=True)  
total_rows = len(df)
total_chunks = (total_rows + chunk_size - 1)//chunk_size
for chunk_idx in tqdm(range(total_chunks),desc= 'Overall Progress(Chunks)',unit = 'chunk'):
    start = chunk_idx * chunk_size
    end = min(start + chunk_size,total_rows)
    chunk_path = f"{output_dir}chunk_{chunk_idx+1:03d}.parquet"
    
    print('Pre cleaned')
    df_chunk = df.iloc[start:end].copy()
    df_chunk['pre_cleaned'] = df_chunk['body'].progress_apply(pre_clean)
    
    pre_cleaned_list = df_chunk['pre_cleaned'].tolist()
    
    print('Spacy clean')
    gen = clean_english_text(pre_cleaned_list,show_progress=False)
    cleaned_results = []
    for term in tqdm(gen,total = len(pre_cleaned_list),desc = 'Collecting Result',dynamic_ncols=False):
        cleaned_results.append(term)#End = len(pre_cleaned_list)
    df_output_chunk = df_chunk[['datetime', 'score']].copy()
    df_output_chunk['cleaned_body'] = cleaned_results
    
    df_output_chunk.to_parquet(chunk_path,engine='pyarrow',index = False)
    print(f"This chunk finish:{start} - {end} rows;")
    
    del df_chunk, pre_cleaned_list, cleaned_results, df_output_chunk
    gc.collect()

Overall Progress(Chunks):   0%|          | 0/79 [00:00<?, ?chunk/s]

Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 48,375,742
Total cleaned characters : 32,005,042
Total characters removed : 16,370,700 (33.8%)
Total original words : 8,403,265
Total cleaned words : 4,621,360
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:0 - 100000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 45,892,662
Total cleaned characters : 30,328,418
Total characters removed : 15,564,244 (33.9%)
Total original words : 7,958,793
Total cleaned words : 4,378,374
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:100000 - 200000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 43,147,762
Total cleaned characters : 28,352,214
Total characters removed : 14,795,548 (34.3%)
Total original words : 7,549,764
Total cleaned words : 4,133,887
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:200000 - 300000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 43,431,847
Total cleaned characters : 28,622,801
Total characters removed : 14,809,046 (34.1%)
Total original words : 7,572,709
Total cleaned words : 4,161,768
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:300000 - 400000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 44,785,243
Total cleaned characters : 29,532,523
Total characters removed : 15,252,720 (34.1%)
Total original words : 7,790,598
Total cleaned words : 4,290,219
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:400000 - 500000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 41,687,207
Total cleaned characters : 27,629,289
Total characters removed : 14,057,918 (33.7%)
Total original words : 7,229,670
Total cleaned words : 4,006,002
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:500000 - 600000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 43,257,497
Total cleaned characters : 28,829,279
Total characters removed : 14,428,218 (33.4%)
Total original words : 7,449,525
Total cleaned words : 4,141,546
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:600000 - 700000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 35,073,693
Total cleaned characters : 24,264,856
Total characters removed : 10,808,837 (30.8%)
Total original words : 5,787,242
Total cleaned words : 3,337,598
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:700000 - 800000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 37,024,217
Total cleaned characters : 25,440,756
Total characters removed : 11,583,461 (31.3%)
Total original words : 6,161,856
Total cleaned words : 3,528,900
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:800000 - 900000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 40,201,693
Total cleaned characters : 27,135,009
Total characters removed : 13,066,684 (32.5%)
Total original words : 6,853,356
Total cleaned words : 3,866,256
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:900000 - 1000000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 43,997,450
Total cleaned characters : 29,625,858
Total characters removed : 14,371,592 (32.7%)
Total original words : 7,517,443
Total cleaned words : 4,222,857
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:1000000 - 1100000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Text Cleaning Summary (Pipe Mode):
Total texts processed : 100,000
Total original characters : 54,410,009
Total cleaned characters : 37,367,973
Total characters removed : 17,042,036 (31.3%)
Total original words : 9,037,400
Total cleaned words : 5,160,152
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

This chunk finish:1100000 - 1200000 rows;
Pre cleaned


  0%|          | 0/100000 [00:00<?, ?it/s]

Spacy clean


Collecting Result:   0%|          | 0/100000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [18]:
print("nlp ÊòØÂê¶Âä†ËΩΩÊàêÂäüÔºü", nlp is not None)
print("nlp Á±ªÂûã:", type(nlp) if nlp is not None else "None")

nlp ÊòØÂê¶Âä†ËΩΩÊàêÂäüÔºü True
nlp Á±ªÂûã: <class 'spacy.lang.en.English'>


In [55]:
def pre_clean_vader(text):
    """Pre-clean function: 
    like removing HTML tag; Emoji and special punctuation"""
    if not isinstance(text,str) or not text.strip(): #if text is str format or only whitespace
        return ""
    # remove html tag
    text = BeautifulSoup(text,'html.parser').get_text() 
    
    #vader could identify the emoji
    # remove emoji to null string 
    # text = emoji.replace_emoji(text,replace = '')
    
    #remove special character string
    text = re.sub(r'[^\w\s.,!?\'"-]', '', text)
    #we define legal character,any character not in list will be removed
    
    #remove several whitespace/tab to single whitespace
    text = re.sub(r'\s+',' ',text.strip())
    
    #remove several whitespace/tab before puncuation
    text = re.sub(r'\s+([.,!?])',r'\1',text)
    text = re.sub(r'\s+',' ',text.strip())
    
    text = re.sub(r'www.\w+.com','',text)
    
    return text

In [56]:
output_file_path_vader = '/Users/fusiyuan/Desktop/HKU Courses/Text Analytics and Natural Language Processing/MFIN7036 Code and Data-20251210/my code/data_cleaned_vader.parquet'
df_sample = df.copy()
df_sample['cleaned_body'] = df_sample['body'].progress_apply(pre_clean_vader)
df_clean_vader = df_sample[['datetime','cleaned_body','score']]
df_clean_vader.to_parquet(output_file_path_vader,engine= 'pyarrow',index = False)
print("Finished")

  0%|          | 0/7862669 [00:00<?, ?it/s]

Finished
