In [10]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [11]:
# HTML edition of the book hosted on gutenberg.ca.
url = 'https://gutenberg.ca/ebooks/hemingwaye-oldmanandthesea/hemingwaye-oldmanandthesea-00-h.html'
# Bound the wait and fail loudly on HTTP errors, so a stalled connection does
# not hang the kernel and an error page is never parsed as if it were the book.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [12]:
# Parse the raw response bytes with the standard-library HTML parser backend.
soup = BeautifulSoup(markup=r.content, features='html.parser')

In [13]:
# Plain text of every <p> element found in the page.
# `find_all` is the current spelling; `findAll` is a legacy camelCase alias.
clean_p = [p.get_text() for p in soup.find_all('p')]

In [14]:
def chunk_p(para, max_chunk_size):
    """Split a paragraph into word-aligned chunks of at most ``max_chunk_size`` characters.

    Parameters
    ----------
    para : str
        The paragraph to split.
    max_chunk_size : int
        Maximum number of characters allowed in each returned chunk.

    Returns
    -------
    list of str
        * If ``para`` is shorter than ``max_chunk_size`` it is returned
          unchanged (original whitespace preserved) as a one-element list.
        * Otherwise the paragraph is split on whitespace and the words are
          packed greedily, joined by single spaces, so that every chunk is
          at most ``max_chunk_size`` characters long.
        * Words are never broken: a single word longer than
          ``max_chunk_size`` is emitted as its own (over-long) chunk.
        * A paragraph of at least ``max_chunk_size`` characters that contains
          no words yields an empty list.
    """
    # Short paragraphs need no splitting and keep their exact formatting.
    if len(para) < max_chunk_size:
        return [para]

    chunks = []
    current_words = []
    current_length = 0  # always equals len(' '.join(current_words))

    for word in para.split():
        # Length of the current chunk if this word were added, including the
        # separating space when the chunk already holds at least one word.
        projected = current_length + len(word) + (1 if current_words else 0)

        # Check the limit *before* adding the word so a chunk never overshoots.
        if current_words and projected > max_chunk_size:
            chunks.append(' '.join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length = projected

    # Flush whatever is left after the last word.
    if current_words:
        chunks.append(' '.join(current_words))

    return chunks
    

In [15]:
# One row per paragraph: the text with leading/trailing newlines removed, its
# length, whether it is longer than 280 characters, and its paragraph number
# (the row index at this stage).
df = (
    pd.DataFrame({'text': clean_p})
    .assign(text=lambda frame: frame['text'].str.strip('\n'))
    .assign(
        char_count=lambda frame: frame['text'].str.len(),
        over_twitter_length=lambda frame: frame['char_count'] > 280,
        p_num=lambda frame: frame.index,
    )
)

In [16]:
# Break every paragraph into pieces, count the pieces, then give each piece
# its own row (the paragraph-level columns are repeated on every piece).
# NOTE(review): this rebinds `df` to the exploded frame, so running the cell a
# second time would chunk the already-exploded rows again -- confirm it is only
# ever run once after the paragraph dataframe cell.
df = (
    df
    .assign(text_chunked=lambda frame: frame['text'].apply(chunk_p, max_chunk_size=220).to_list())
    .assign(chunk_tot=lambda frame: [len(pieces) for pieces in frame['text_chunked']])
    .explode('text_chunked')
    # 1-based position of each piece within its paragraph
    .assign(chunk_num=lambda frame: frame.groupby(['p_num']).cumcount() + 1)
    .reset_index(drop=True)
)

# Pieces of a multi-part paragraph carry an "x/n" marker plus the hashtag;
# single-piece paragraphs get the hashtag on its own line.
# NOTE(review): a paragraph whose text is empty still produces a hashtag-only
# entry here -- confirm whether such rows should be tweeted.
text_to_tweet = [
    f'{piece} {position}/{total} #hemingway' if total > 1 else f'{piece} \n\n#hemingway'
    for piece, position, total in zip(df['text_chunked'], df['chunk_num'], df['chunk_tot'])
]

df = df.assign(
    text_to_tweet=text_to_tweet,
    # length of the final tweet text, marker and hashtag included
    text_to_tweet_char_count=lambda frame: frame['text_to_tweet'].str.len(),
    # left blank here; presumably filled in when a row is tweeted -- TODO confirm
    tweeted_timestamp='',
)

In [19]:
# Make sure the output directory exists so the writes below cannot fail on a
# fresh checkout.
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(data_dir / 'old_man_and_the_sea.csv', index=False)

# The timestamps file must only be written once: writing it again would
# overwrite the recorded timestamp list. Guard the write so re-running this
# cell (or Restart & Run All) leaves an existing file untouched.
timestamps_path = data_dir / 'old_man_and_the_sea_timestamps.csv'
if not timestamps_path.exists():
    df.to_csv(timestamps_path, index=False)

In [18]:
# Preview the first 50 tweet rows.
df.head(n=50)

Unnamed: 0,text,char_count,over_twitter_length,p_num,text_chunked,chunk_tot,chunk_num,text_to_tweet,text_to_tweet_char_count,tweeted_timestamp
0,ERNEST HEMINGWAY,16,False,0,ERNEST HEMINGWAY,1,1,ERNEST HEMINGWAY \n\n#hemingway,29,
1,"CHARLES SCRIBNER'S SONS, NEW YORK\n\n1952",39,False,1,"CHARLES SCRIBNER'S SONS, NEW YORK\n\n1952",1,1,"CHARLES SCRIBNER'S SONS, NEW YORK\n\n1952 \n\n...",52,
2,"COPYRIGHT, 1952, BY\n\nERNEST HEMINGWAY\n\nPri...",203,False,2,"COPYRIGHT, 1952, BY\n\nERNEST HEMINGWAY\n\nPri...",1,1,"COPYRIGHT, 1952, BY\n\nERNEST HEMINGWAY\n\nPri...",216,
3,TO CHARLIE SCRIBNER\n\nAND\n\nTO MAX PERKINS,40,False,3,TO CHARLIE SCRIBNER\n\nAND\n\nTO MAX PERKINS,1,1,TO CHARLIE SCRIBNER\n\nAND\n\nTO MAX PERKINS \...,53,
4,,0,False,4,,1,1,\n\n#hemingway,13,
5,THE OLD MAN AND THE SEA,23,False,5,THE OLD MAN AND THE SEA,1,1,THE OLD MAN AND THE SEA \n\n#hemingway,36,
6,He was an old man who fished alone in a skiff ...,742,True,6,He was an old man who fished alone in a skiff ...,4,1,He was an old man who fished alone in a skiff ...,234,
7,He was an old man who fished alone in a skiff ...,742,True,6,parents had told him that the old man was now ...,4,2,parents had told him that the old man was now ...,232,
8,He was an old man who fished alone in a skiff ...,742,True,6,the boy sad to see the old man come in each da...,4,3,the boy sad to see the old man come in each da...,232,
9,He was an old man who fished alone in a skiff ...,742,True,6,"patched with flour sacks and, furled, it looke...",4,4,"patched with flour sacks and, furled, it looke...",97,
