## BBC 기사를 크롤링하는 파이썬 코드
- https://www.bbc.com 기사를 크롤링합니다.
- 웹사이트를 돌아다니며 자동으로 기사 링크를 수집합니다.
- 크롤링 결과는 bbc-out.txt 파일에 저장됩니다.

In [1]:
from google.colab import drive
drive.mount('/content/mnt')

Drive already mounted at /content/mnt; to attempt to forcibly remount, call drive.mount("/content/mnt", force_remount=True).


### 1. 데이터셋 크롤링

In [21]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
import re
import time
import os


if not os.path.exists('/content/mnt/My Drive/bbc-urls.txt'):
    print('init bbc-urls.txt')
    with open('/content/mnt/My Drive/bbc-urls.txt', 'w', encoding='utf8') as f:
        f.write('https://www.bbc.com/')

if not os.path.exists('/content/mnt/My Drive/bbc-visited.txt'):
    print('init bbc-visited.txt')
    with open('/content/mnt/My Drive/bbc-visited.txt', 'w', encoding='utf8') as f:
        f.write('')

def is_visited(url):
    # 방문했던 주소인지 여부 반환
    with open('/content/mnt/My Drive/bbc-visited.txt', 'r', encoding='utf8') as f:
        raw = f.read()
    return url in raw


def get_soup(url):
    # html 요청
    response = requests.get(url)

    if response.status_code != 200:
        print('!!!', '[', response.status_code, ']', '\t', url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    with open('/content/mnt/My Drive/bbc-visited.txt', 'a+', encoding='utf8') as f:
        f.write('\n' + url)
    return soup


def is_true_url(url):
    # 목표 cnn article 주소인지 여부 반환
    # return re.search('https\:\/\/www.cnn.com\/\d{4}\/\d{2}\/\d{2}\/.+', url)
    #return '.usatoday.com/' in url and '//' in url and 'twitter.com' not in url and 'facebook.com' not in url and 'onlinestore.usatoday.com' not in url
    return 'www.bbc.co' in url and 'learningenglish' not in url


def get_article(soup):
    # pure article 추출 및 저장
    results = []
    tmp = ''
    for child in soup.select('article > div'):

        if child.attrs.get('data-component') in ['text-block']:
            lower_text = child.text.lower()
            if lower_text.startswith('read more from'):
                break
            if ('facebook' in lower_text or 'instagram' in lower_text or 'twitter' in lower_text) and '@' in lower_text and 'follow' in lower_text:
                break
            tmp += '\n' + child.text
        elif child.attrs.get('data-component') in ['crosshead-block']:
            results.append(tmp)
            tmp = ''

    if tmp != '':
        results.append(tmp)

    with open('/content/mnt/My Drive/bbc-out.txt', 'a+', encoding='utf8') as f:
        f.write('\n\n[SEP]\n\n' + '\n\n[SEP]\n\n'.join(results))


def get_candidate_urls(base_url, soup):
    # 다음 방문할 주소를 반환한다.
    # 이미 방문했던 주소는 제외
    urls = [urljoin(base_url, link.get('href')) for link in soup.find_all('a')]
    urls = [url for url in urls if url is not None]
    urls = [url for url in urls if is_true_url(url)]
    urls = [url for url in urls if not is_visited(url)]
    urls = ['https:' + url if url.startswith('//') else url for url in urls]
    return urls


def do(urls):
    urls = list(set(urls))
    urls = [url for url in urls if is_true_url(url)]
    url = urls[0]
    print(url)
    urls = urls[1:]

    if is_visited(url):
        return urls

    if len(urls) <= 500 or (re.search(r'\d{8}$', url) and 'news/' in url and 'bbc.co' in url):
        try:
            soup = get_soup(url)
        except Exception as e:
            print('==get_soup(url)==')
            print(e)
            return urls
        if re.search(r'\d{8}$', url) and 'news/' in url and 'bbc.co' in url:
            try:
                print('articling..')
                get_article(soup)
            except Exception as e:
                print(e)
        if len(urls) <= 500:
            urls.extend(get_candidate_urls(url, soup))
    urls = list(set(urls))
    return urls


for _ in range(1000):
    while True:
        try:
            with open('/content/mnt/My Drive/bbc-urls.txt', 'r', encoding='utf8') as f:
                urls = f.read().split('\n')
            break
        except OSError as e:
            print('OSError', e, '\tbbc-url.txt r in while true')
            time.sleep(5)

    urls = do(urls)
    while True:
        try:
            with open('/content/mnt/My Drive/bbc-urls.txt', 'w', encoding='utf8') as f:
                f.write('\n'.join(urls))
            break
        except OSError as e:
            print('OSError', e, '\tbbc-url.txt w in while true')
            time.sleep(5)


init bbc-urls.txt
init bbc-visited.txt
https://www.bbc.com/
https://www.bbc.co.uk/contact
https://www.bbc.com/persian/
http://www.bbc.co.uk/cbbc/topics/cbbc-help
http://www.bbc.co.uk/cbbc/findoutmore/web-help-general-web-faqs?collection=cbbc-help
http://www.bbc.co.uk/cbeebies/shows/octonauts
https://www.bbc.co.uk/contact#Radio
https://www.bbc.com/news/world-us-canada-40360953
articling..
https://www.bbc.co.uk/aboutthebbc
https://www.bbc.com/news/newsbeat-57977584
articling..
http://www.bbc.co.uk/news/20039682
articling..
https://www.bbc.co.uk/aboutthebbc/governance/charter
https://www.bbc.com/news/uk-england-dorset-58015659
articling..
https://www.bbc.com/culture/article/20210729-ten-films-to-watch-this-august
https://www.bbc.com/news
https://www.bbc.com/sport/formula1
http://www.bbc.co.uk/cbeebies/joinin
https://www.bbc.com/news/av/world-us-canada-40360953
articling..
https://www.bbc.com/news/technology-58027356
articling..
http://www.bbc.co.uk/cbeebies/shows/bluey
https://www.bbc.co.

In [22]:
!head '/content/mnt/My Drive/bbc-out.txt' -n 100



[SEP]



[SEP]


Team GB's swimming heroics at Tokyo 2020 might have inspired you to get off the sofa, grab your goggles and do a few lengths of your local pool.
But what separates an Olympic champion from us mere mortals? 
According to the performance psychologist for gold medallist Adam Peaty, the answer is mental strength. 
Bill Beswick's worked with the 26-year-old breaststroke machine since he was 17 and says Adam's winning mindset is the best he's seen. 
Bill tells Radio 1 Newsbeat there are two major factors to becoming a champion: being physically capable and having the attitude "to compete, train and win everyday".
"You can get away with less talent and a great attitude but you can't get away with a bad attitude."
Adam, from Uttoxeter, Staffordshire, is one of the fastest swimmers in history, and won his first Olympic title at Rio 2016.
No-one has even come close to his world record time of 56.88 seconds.
Adam follows a mental ritual to get in the zone ahead of his 6am and 6

### 2. 데이터셋 제작
- 뉴스 기사를 전처리합니다.
- bbc-dataset/ 디렉토리에 텍스트 파일로 저장합니다.

In [30]:
import os


if not os.path.isdir('/content/mnt/My Drive/bbc-dataset'):
    os.mkdir('/content/mnt/My Drive/bbc-dataset')


total = 0
tmp_n = 0
file_name = 0
samples = []

out_path = '/content/mnt/My Drive/bbc-out.txt'
if os.path.isfile(out_path):
    with open(out_path, 'r', encoding='utf8') as f:
        raw = f.read()
    for doc in raw.split('[SEP]'):
        doc = doc.split('— CNBC')[0].split('— CNN')[0].replace('’', "'")
        while doc.endswith('\n'):
            doc = doc[:-1]
        lines = doc.split('\n')
        if '@' in lines[-1] and '.com' in lines[-1]:
            lines = lines[:-1]
        doc = '\n'.join(lines)
        doc = doc.replace('\n', ' ')
        while '  ' in doc:
            doc = doc.replace('  ', ' ')
        while doc.startswith(' '):
            doc = doc[1:]

        length = doc.count(' ')
        if 100 <= length <= 800:
            h = doc[:64] + str(len(doc)) + doc[-64:]
            if h in samples:
                continue
            samples.append(h)
            tmp_n += 1
            with open(f"/content/mnt/My Drive/bbc-dataset/{file_name}.txt", 'a+', encoding='utf8') as f:
                if tmp_n == 1:
                    f.write(doc)
                else:
                    f.write('\n'+doc)
            if tmp_n >= 1000:
                file_name += 1
                tmp_n = 0
            total += 1
            print(total, end='\r')

print('-----------------------------------')
print('total', '\t', total)


-----------------------------------
total 	 228
