In [1]:
from os import listdir
from bs4 import BeautifulSoup
import re
from datetime import datetime


def load_text(filename):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decode error), which the manual close() did not.
    with open(filename, encoding='utf-8') as file:
        return file.read()

def split_story(doc):
    """Split a raw story document into its body and its highlight list.

    Everything before the first ``@highlight`` marker is the story body;
    each segment after a marker is one highlight.

    Args:
        doc: Full text of a story file.

    Returns:
        A ``(story, highlights)`` tuple. ``story`` is the text preceding the
        first marker, returned unstripped. ``highlights`` is a list of
        whitespace-stripped, non-empty highlight strings. When the document
        has no marker, the whole document is the story and the list is empty.
    """
    marker = '@highlight'
    index = doc.find(marker)
    if index == -1:
        # find() signals "not found" with -1. Slicing with that value would
        # cut the last character off the story and report it as a highlight.
        return doc, []
    story = doc[:index]
    stripped = (segment.strip() for segment in doc[index:].split(marker))
    # Filter after stripping so whitespace-only segments do not survive as ''.
    highlights = [segment for segment in stripped if segment]
    return story, highlights

# Matches dates such as "Jan 5, 2010" or "September 12, 2013" for the years
# 2007-2015. Compiled once instead of on every get_date() call, and written
# as raw strings so that \s and \d are regex escapes rather than (invalid)
# string escapes.
_DATE_PATTERN = re.compile(
    r"(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|"
    r"Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|"
    r"Dec(ember)?)\s+\d{1,2},\s+20(07|08|09|10|11|12|13|14|15)")


def get_date(filename_str, download_directory):
    """Return the first date-like string found in a downloaded HTML page.

    The page ``<download_directory>/<filename_str>.html`` is parsed with
    BeautifulSoup (lxml parser) and its serialised markup is searched for the
    first "<Month> <day>, <year>" occurrence with a year from 2007 to 2015.

    Args:
        filename_str: File name of the page without the ``.html`` extension.
        download_directory: Directory containing the downloaded pages.

    Returns:
        The matched date text (e.g. ``"March 3, 2012"``), or ``None`` when no
        date is found or the page cannot be searched.

    Raises:
        OSError: If the HTML file cannot be opened.
    """
    filename = download_directory + "/" + filename_str + '.html'
    # Read inside a context manager so the handle is released even if
    # reading or parsing fails.
    with open(filename, 'rb') as file:
        html = file.read()
    soup = BeautifulSoup(html, "lxml")
    try:
        match = _DATE_PATTERN.search(str(soup))
    except Exception:
        # Best effort, as before: an ordinary failure while serialising or
        # searching the page means "no date". Unlike a bare `except:`, this
        # lets KeyboardInterrupt through so a long run can be stopped.
        return None
    return match.group() if match else None

def _format_elapsed(delta):
    """Format a timedelta as '<minutes>M:<seconds>S' using its full duration."""
    # total_seconds() includes whole days, which the .seconds attribute drops.
    minutes, seconds = divmod(int(delta.total_seconds()), 60)
    return f'{minutes}M:{seconds}S'


def load_stories(story_directory, download_directory, max_stories=None):
    """Load every story file together with its highlights and page date.

    For each file in ``story_directory`` the text is read with ``load_text``,
    split with ``split_story``, and the date is looked up with ``get_date``
    in ``download_directory`` under the same base name. Progress is printed
    every 1000 files.

    Args:
        story_directory: Directory containing the ``.story`` files.
        download_directory: Directory containing the matching ``.html`` pages.
        max_stories: Optional non-negative cap on the number of files to
            process (useful for quick trial runs). ``None`` processes all.

    Returns:
        A dict mapping each file's base name (``.story`` suffix removed) to
        ``{'date': ..., 'story': ..., 'highlights': ...}``.
    """
    suffix = '.story'
    names = listdir(story_directory)
    if max_stories is not None:
        names = names[:max_stories]
    # Listing the directory once avoids re-scanning it on every progress line.
    total = len(names)

    stories = {}
    initial = datetime.now()
    initial_time = initial.strftime("%H:%M:%S")
    print(f'Beginning Extraction at {initial_time}')
    for i, name in enumerate(names):
        if i % 1000 == 0:
            e_time = _format_elapsed(datetime.now() - initial)
            print(f'ET {e_time} - Currently Loaded {i} Stories ({(i/total*100):.2f}% Complete)')
        filename = story_directory + "/" + name
        text = load_text(filename)
        story, highlights = split_story(text)
        # str.strip('.story') removes any of the characters . s t o r y from
        # both ends, not the suffix; cut the exact suffix instead.
        filename_str = name[:-len(suffix)] if name.endswith(suffix) else name
        date = get_date(filename_str, download_directory)
        stories[filename_str] = {'date': date, 'story':story, 'highlights':highlights}

    tot_time = _format_elapsed(datetime.now() - initial)
    print(f'Total Loaded Stories {len(stories)} in {tot_time}')
    return stories

In [2]:
# Directory of the story text files; each one holds the article body
# followed by its `@highlight` sections.
story_directory = './cnn/stories'
# Directory of the downloaded `.html` pages that get_date() scans for a date string.
download_directory = './cnn/downloads'
# Long-running step: every story file is read and its HTML page parsed.
# The recorded run below logged roughly 35 s per 1000 stories.
cnn_stories = load_stories(story_directory, download_directory)

Beginning Extraction at 11:02:36
ET 0M:0S - Currently Loaded 0 Stories (0.00% Complete)
ET 0M:35S - Currently Loaded 1000 Stories (1.08% Complete)
ET 1M:10S - Currently Loaded 2000 Stories (2.16% Complete)
ET 1M:42S - Currently Loaded 3000 Stories (3.24% Complete)
ET 2M:16S - Currently Loaded 4000 Stories (4.32% Complete)
ET 2M:52S - Currently Loaded 5000 Stories (5.40% Complete)
ET 3M:31S - Currently Loaded 6000 Stories (6.48% Complete)
ET 4M:8S - Currently Loaded 7000 Stories (7.56% Complete)
ET 4M:44S - Currently Loaded 8000 Stories (8.64% Complete)
ET 5M:22S - Currently Loaded 9000 Stories (9.72% Complete)
ET 6M:1S - Currently Loaded 10000 Stories (10.80% Complete)
ET 6M:38S - Currently Loaded 11000 Stories (11.88% Complete)
ET 7M:14S - Currently Loaded 12000 Stories (12.96% Complete)
ET 7M:53S - Currently Loaded 13000 Stories (14.04% Complete)
ET 8M:31S - Currently Loaded 14000 Stories (15.12% Complete)
ET 9M:8S - Currently Loaded 15000 Stories (16.20% Complete)
ET 9M:46S - Curren

In [3]:
import pandas as pd

# orient='index' turns each outer dict key (the story's base file name) into
# a row label and the inner keys ('date', 'story', 'highlights') into columns.
cnn_stories_df = pd.DataFrame.from_dict(cnn_stories, orient='index')

In [4]:
# Sanity check of row count and per-column null counts. `date` may contain
# nulls because get_date() returns None when it cannot extract a date.
cnn_stories_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 92579 entries, 638ba1352bdf405a8f5bd681d7fe5c928686afff to 818fcac70cccea5a042a0f44eef23cd6c3e415b3
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        92140 non-null  object
 1   story       92579 non-null  object
 2   highlights  92579 non-null  object
dtypes: object(3)
memory usage: 2.8+ MB


In [5]:
from pickle import dump

# Persist both the raw dict and the DataFrame so later sessions can skip the
# slow extraction step. Each file is opened in a `with` block so it is flushed
# and closed deterministically; passing a bare open(...) to dump() leaves the
# handle open until it happens to be garbage-collected.
# NOTE: pickle files should only ever be loaded from a trusted source.
with open('cnn_stories.pkl', 'wb') as stories_file:
    dump(cnn_stories, stories_file)
with open('cnn_stories_df.pkl', 'wb') as frame_file:
    dump(cnn_stories_df, frame_file)