In [14]:
import re
from bs4 import BeautifulSoup

# Function to check that text survives a UTF-8 round trip
def fix_encoding(text):
    """Return ``text`` after a UTF-8 encode/decode round trip, or a sentinel.

    Args:
        text: The string to validate.

    Returns:
        The round-tripped string (equal to ``text``) when it can be encoded as
        UTF-8, otherwise the literal string ``'Encoding Error'``.
    """
    try:
        return text.encode('utf-8').decode('utf-8')
    except UnicodeError:
        # UnicodeError is the common base of UnicodeEncodeError and
        # UnicodeDecodeError. The encode step is the one that can fail here
        # (e.g. on lone surrogates), which the narrower UnicodeDecodeError
        # handler never caught.
        return 'Encoding Error'

# Function to remove HTML tags
def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content."""
    parsed_markup = BeautifulSoup(text, "html.parser")
    return parsed_markup.get_text()

# Function to remove URLs
def remove_urls(text):
    """Delete URLs from ``text``.

    Two shapes are removed, each running up to the next whitespace character:
    anything starting with ``http://`` or ``https://``, and anything starting
    at a word boundary with ``www`` (optionally followed by digits) and a dot.

    Requiring ``://`` or ``www.`` keeps ordinary words that merely contain
    "http" or "www" (for example "httpd" or "awww") intact.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
    """
    return re.sub(r'https?://\S+|\bwww\d*\.\S+', '', text)

# Function to remove emails
def remove_emails(text):
    """Delete every whitespace-delimited token that contains an ``@`` with
    at least one non-space character on each side of it."""
    email_like = re.compile(r'\S+@\S+')
    return email_like.sub('', text)

# Function to remove non-ASCII characters
def remove_non_ascii(text):
    """Replace each run of characters outside ``\\x00``-``\\x7F`` with one space."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub(' ', text)

# Function to remove overly long runs of word characters (20 or more)
def remove_long_nonsense_sequences(text):
    """Delete every standalone run of 20 or more word characters (``\\w``).

    The run must be delimited by word boundaries on both sides, so shorter
    tokens are left untouched.
    """
    overlong_token = re.compile(r'\b\w{20,}\b')
    return overlong_token.sub('', text)

# Function to remove backslash followed by single quote and two digits or letters
def remove_special_text(text):
    """Delete each occurrence of a backslash, a single quote, and the two
    word characters that follow them."""
    escaped_pair = re.compile(r"\\'\w{2}")
    return escaped_pair.sub('', text)

# Function to remove long whitespace sequences
def remove_long_whitespace(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character on its own (including a lone newline or
    tab) is left as it is.
    """
    whitespace_run = re.compile(r'\s{2,}')
    return whitespace_run.sub(' ', text)

# Function to clean text
def clean_text_pipeline(text):
    """Run ``text`` through every cleaning step in a fixed order.

    Each stage receives the previous stage's output. Whitespace collapsing
    runs last because the earlier stages replace matches with an empty
    string or a space and can leave gaps behind.

    Args:
        text: The raw string to clean.

    Returns:
        The string produced by the final stage.
    """
    stages = (
        fix_encoding,
        remove_html_tags,
        remove_urls,
        remove_emails,
        remove_non_ascii,
        remove_long_nonsense_sequences,
        remove_special_text,
        remove_long_whitespace,
    )
    for stage in stages:
        text = stage(text)
    return text

# Read the contents of a text file
def read_text_file(file_path):
    """Open ``file_path`` as UTF-8 text and return everything in it as one string."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

# Write the cleaned content to a new text file
def write_text_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file at that path is
    overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.write(content)

# Example usage: clean a single monthly news file and preview the result
input_file_path = "C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/News/monthly/April_2022.txt"
output_file_path = "C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/Cleaned_text/April_2022.txt"

# Read the text to be cleaned from the input file
raw_text = read_text_file(input_file_path)

# Clean the text
cleaned_text = clean_text_pipeline(raw_text)

# Write the cleaned text to the new file
write_text_file(output_file_path, cleaned_text)

# Print the first 100 whitespace-separated words as a quick sanity check
words = cleaned_text.split()
first_100_words = ' '.join(words[:100])
print(first_100_words)


USA Today Online USATONL English Copyright 2022 USA Today Online. Provided by ProQuest Information and Learning. All rights reserved. U.S. employers added a booming 431,000 jobs in March as tumbling COVID-19 cases more than offset concerns about soaring inflation and the war in Ukraine. The unemployment rate fell from 3.8% to 3.6%, the Labor Department said Friday. That puts it just above the 50-year low of 3.5% just before the pandemic upended the economy in March 2020. Economists surveyed by Bloomberg had estimated that 440,000 jobs were added last month. The economy has now added more than 400,000 jobs a


In [16]:
import re
from bs4 import BeautifulSoup
import os

# Function to check that text survives a UTF-8 round trip
def fix_encoding(text):
    """Return ``text`` after a UTF-8 encode/decode round trip, or a sentinel.

    Args:
        text: The string to validate.

    Returns:
        The round-tripped string (equal to ``text``) when it can be encoded as
        UTF-8, otherwise the literal string ``'Encoding Error'``.
    """
    try:
        return text.encode('utf-8').decode('utf-8')
    except UnicodeError:
        # UnicodeError is the common base of UnicodeEncodeError and
        # UnicodeDecodeError. The encode step is the one that can fail here
        # (e.g. on lone surrogates), which the narrower UnicodeDecodeError
        # handler never caught.
        return 'Encoding Error'

# Function to remove HTML tags
def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content."""
    parsed_markup = BeautifulSoup(text, "html.parser")
    return parsed_markup.get_text()

# Function to remove URLs
def remove_urls(text):
    """Delete URLs from ``text``.

    Two shapes are removed, each running up to the next whitespace character:
    anything starting with ``http://`` or ``https://``, and anything starting
    at a word boundary with ``www`` (optionally followed by digits) and a dot.

    Requiring ``://`` or ``www.`` keeps ordinary words that merely contain
    "http" or "www" (for example "httpd" or "awww") intact.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
    """
    return re.sub(r'https?://\S+|\bwww\d*\.\S+', '', text)

# Function to remove emails
def remove_emails(text):
    """Delete every whitespace-delimited token that contains an ``@`` with
    at least one non-space character on each side of it."""
    email_like = re.compile(r'\S+@\S+')
    return email_like.sub('', text)

# Function to remove non-ASCII characters
def remove_non_ascii(text):
    """Replace each run of characters outside ``\\x00``-``\\x7F`` with one space."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub(' ', text)

# Function to remove overly long runs of word characters (20 or more)
def remove_long_nonsense_sequences(text):
    """Delete every standalone run of 20 or more word characters (``\\w``).

    The run must be delimited by word boundaries on both sides, so shorter
    tokens are left untouched.
    """
    overlong_token = re.compile(r'\b\w{20,}\b')
    return overlong_token.sub('', text)

# Function to remove backslash followed by single quote and two digits or letters
def remove_special_text(text):
    """Delete each occurrence of a backslash, a single quote, and the two
    word characters that follow them."""
    escaped_pair = re.compile(r"\\'\w{2}")
    return escaped_pair.sub('', text)

# Function to remove long whitespace sequences
def remove_long_whitespace(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character on its own (including a lone newline or
    tab) is left as it is.
    """
    whitespace_run = re.compile(r'\s{2,}')
    return whitespace_run.sub(' ', text)

# Function to clean text
def clean_text_pipeline(text):
    """Run ``text`` through every cleaning step in a fixed order.

    Each stage receives the previous stage's output. Whitespace collapsing
    runs last because the earlier stages replace matches with an empty
    string or a space and can leave gaps behind.

    Args:
        text: The raw string to clean.

    Returns:
        The string produced by the final stage.
    """
    stages = (
        fix_encoding,
        remove_html_tags,
        remove_urls,
        remove_emails,
        remove_non_ascii,
        remove_long_nonsense_sequences,
        remove_special_text,
        remove_long_whitespace,
    )
    for stage in stages:
        text = stage(text)
    return text


# Read the contents of a text file
def read_text_file(file_path):
    """Open ``file_path`` as UTF-8 text and return everything in it as one string."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

# Write the cleaned content to a new text file
def write_text_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file at that path is
    overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.write(content)

# Clean every text file in a folder
def clean_text_files_in_folder(input_folder, output_folder):
    """Clean each ``.txt`` file in ``input_folder`` and save it to ``output_folder``.

    Every regular file whose name ends in ``.txt`` is read, passed through
    ``clean_text_pipeline``, and written under the same file name in
    ``output_folder``. Progress is reported with ``print``.

    Args:
        input_folder: Directory containing the raw ``.txt`` files.
        output_folder: Directory that receives the cleaned files; created
            (including parents) when it does not exist.

    Returns:
        None.
    """
    # exist_ok=True replaces the check-then-create pair, so there is no window
    # in which the folder can appear between the check and the creation.
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns names in arbitrary order; sorting keeps the
    # processing order (and the printed log) stable between runs.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".txt"):
            continue

        input_file_path = os.path.join(input_folder, filename)
        # A sub-directory whose name ends in ".txt" cannot be opened as a file.
        if not os.path.isfile(input_file_path):
            continue
        output_file_path = os.path.join(output_folder, filename)

        # Announce the file before reading it so a failed read is attributable.
        print(f"Processing file: {input_file_path}")
        raw_text = read_text_file(input_file_path)

        # Clean the text
        cleaned_text = clean_text_pipeline(raw_text)

        # Write the cleaned text to the matching file in the output folder
        write_text_file(output_file_path, cleaned_text)
        print(f"Saved cleaned file: {output_file_path}")

# Example usage: clean every monthly news file into the Cleaned_text folder
input_folder = "C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/News/monthly"
output_folder = "C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/Cleaned_text"

clean_text_files_in_folder(input_folder, output_folder)

Processing file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/News/monthly\April_2020.txt
Saved cleaned file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/Cleaned_text\April_2020.txt
Processing file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/News/monthly\April_2021.txt
Saved cleaned file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/Cleaned_text\April_2021.txt
Processing file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/News/monthly\April_2022.txt
Saved cleaned file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/Cleaned_text\April_2022.txt
Processing file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/News/monthly\April_2023.txt
Saved cleaned file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/Cleaned_text\April_2023.txt
Processing file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/News/monthly\August_2020.txt
Saved cleaned file: C:/Users/22749/Desktop/UoG/Fintech/Dissertation/Data/Cleaned_text\August_2020.txt
Pro