# Scraping a Wikipedia page

In [5]:
import requests
from bs4 import BeautifulSoup
from langdetect import detect_langs
import re
import requests


In [6]:
# Load the list of inappropriate words, one entry per line.
# The encoding is pinned (assumes en.txt is UTF-8/ASCII) so the result does not
# depend on the platform default. Entries are stripped and lower-cased because
# they are later compared against lower-cased tokens; blank lines are dropped.
with open('en.txt', 'r', encoding='utf-8') as file:
    bad_words = {line.strip().lower() for line in file if line.strip()}

In [7]:
def check_english(text):
    """Report whether ``text`` is detected as English with high confidence.

    Returns a ``(ok, message)`` tuple: ``(True, '')`` when ``detect_langs``
    yields an ``'en'`` candidate whose probability exceeds 0.99, otherwise
    ``(False, reason)``. Any exception raised while detecting is caught and
    reported through the message instead of propagating.
    """
    try:
        confident_english = any(
            candidate.lang == 'en' and candidate.prob > 0.99
            for candidate in detect_langs(text)
        )
    except Exception as e:
        return False, f'Language detection failed: {e}'

    if confident_english:
        return True, ''
    return False, 'Text is not in English or confidence is below 99%.'

In [8]:
def check_cleanliness(text, words=None):
    """Check ``text`` for whole-word matches against a list of forbidden words.

    Args:
        text: The text to inspect. It is lower-cased and tokenized into
            word-character runs before comparison, so matching is
            case-insensitive on the text side and only whole words count.
        words: Optional iterable of forbidden words to check against.
            When omitted, the module-level ``bad_words`` set is used.

    Returns:
        ``(True, '')`` if no forbidden word occurs in the text, otherwise
        ``(False, message)`` where the message names the alphabetically
        first offending word.
    """
    forbidden = bad_words if words is None else set(words)
    words_in_text = set(re.findall(r'\b\w+\b', text.lower()))

    # Intersect instead of scanning the whole word list, and sort so the
    # reported word is the same on every run (set iteration order is not).
    offending = sorted(words_in_text.intersection(forbidden))
    if offending:
        return False, f'Text contains inappropriate whole word: {offending[0]}'
    return True, ''

In [9]:
def _extract_sentences(paragraphs):
    """Return, for each paragraph that survives filtering, its kept sentences.

    A paragraph is skipped when it has fewer than 5 words. Bracketed spans
    such as ``[1]`` are then removed, the paragraph is split after ``.``,
    ``!`` or ``?`` followed by whitespace (a heuristic, not a full sentence
    tokenizer), and sentences with fewer than 3 words are discarded.
    Paragraphs left without any sentence are omitted from the result.
    """
    kept = []
    for paragraph in paragraphs:
        if len(paragraph.split()) < 5:
            continue
        paragraph = re.sub(r'\[.*?\]', '', paragraph)
        sentences = [
            sentence
            for sentence in re.split(r'(?<=[.!?])\s+', paragraph)
            if len(sentence.split()) >= 3
        ]
        if sentences:
            kept.append(sentences)
    return kept


def crawl_wikipedia(term):
    """Download the English Wikipedia article for ``term`` and quality-check it.

    The article's ``<p>`` elements are extracted, short paragraphs, bracketed
    spans and short sentences are filtered out, and the remaining text is
    checked for sentence count, language and inappropriate words.

    Args:
        term: Article title to look up. Spaces are turned into underscores
            and the title is percent-encoded before it is placed in the URL
            (existing ``%`` escapes are left untouched).

    Returns:
        A ``(text, errors)`` tuple. ``text`` is the filtered article text with
        one line per paragraph; ``errors`` is a list of human-readable
        messages, empty when every check passed. If the page cannot be
        fetched, ``text`` is ``""`` and ``errors`` holds a single
        ``Request Error`` entry.
    """
    from urllib.parse import quote

    ############ data acquisition #######################
    # compose the url for the search term; keep '/', ':' and '%' so that
    # sub-pages, namespaces and already-encoded titles keep working
    title = quote(term.replace(" ", "_"), safe="/:%")
    url = f"https://en.wikipedia.org/wiki/{title}"

    # get the response from the server; a timeout avoids hanging forever and
    # the status check keeps error pages from being parsed as articles
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        return "", [f"Request Error: {exc}"]

    ############# text extraction #######################
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = [node.get_text().strip() for node in soup.find_all('p')]

    ############# paragraph, bracket and word number filters ###########
    # The text is rebuilt from the sentences that pass the filters. Deleting
    # short fragments with str.replace would also remove every other
    # occurrence of the same characters elsewhere in the text.
    sentences_by_paragraph = _extract_sentences(paragraphs)
    text = "".join(
        " ".join(sentences) + "\n" for sentences in sentences_by_paragraph
    )

    errors = []

    ############# sentence number filter #################
    sentence_count = sum(len(sentences) for sentences in sentences_by_paragraph)
    if sentence_count < 10:
        errors.append("Document has less than 10 sentences.")

    ############# language filter ########################
    is_english, error_msg = check_english(text)
    if not is_english:
        errors.append(f'Language Error: {error_msg}')

    ############# bad word filter ########################
    is_clean, error_msg = check_cleanliness(text)
    if not is_clean:
        errors.append(f'Cleanliness Error: {error_msg}')

    return text, errors

In [None]:
from xml.sax.saxutils import escape

# define the term to be searched
term = "car"
data, error_messages = crawl_wikipedia(term)

# Append the result to an XML file as one <document> element holding the term
# (used as the id), the text, the errors and the source website.
# Every value is XML-escaped so that '&', '<' and '>' in the article text
# cannot corrupt the markup, and the file is written as UTF-8 so non-ASCII
# characters do not depend on the platform's default encoding.
# NOTE: the file accumulates sibling <document> elements without a single
# root element, so it must be wrapped before a strict XML parser can read it.
with open("wiki.xml", "a", encoding="utf-8") as file:
    file.write(
        "<document>\n"
        f"<id>{escape(term)}</id>\n"
        f"<text>{escape(data)}</text>\n"
        f"<errors>{escape(str(error_messages))}</errors>\n"
        f"<website>https://en.wikipedia.org/wiki/{escape(term)}</website>\n"
        "</document>\n"
    )


# Print errors
for error in error_messages:
    print(error)