In [1]:
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm, trange
import time
import pickle
import datetime
import pandas as pd

In [2]:
def get_text_by_id(id_):
    """Download one AO3 work and return its body text and publication date.

    The work is requested with ``view_full_work=true``.

    Parameters
    ----------
    id_ : str or int
        Work id, substituted into the ``/works/{}`` URL.

    Returns
    -------
    tuple of (str, str)
        ``(text, published_date)``. ``text`` is the text of every
        ``div.userstuff.module[role=article]`` element, each followed by a
        newline; when the page has none of those, it is the text of the first
        ``div.userstuff``. ``published_date`` is the text of the
        ``dd.published`` element inside the stats block, or ``''`` when the
        page has none.

    Raises
    ------
    AssertionError
        If the page has no ``dd.stats`` block or no work text at all.
        (Kept as ``AssertionError`` so existing callers see the same type.)
    requests.RequestException
        If the HTTP request fails or exceeds the timeout.
    """
    url = 'https://archiveofourown.org/works/{}?view_full_work=true'
    headers = {'user-agent': 'bot (sj784@cornell.edu)'}

    # Without a timeout a single stalled connection blocks the scrape forever.
    r = requests.get(url.format(id_), headers=headers, timeout=60)
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')

    stats = soup.find('dd', class_='stats')
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if stats is None:
        raise AssertionError(f'can not find stats for {id_}')

    # Look the node up once and use that same node, so the guard and the
    # dereference can never disagree.
    published = stats.find('dd', class_='published')
    published_date = published.get_text() if published is not None else ''

    chapters = soup.find_all('div', class_="userstuff module", role='article')
    if chapters:
        output = ''.join(chapter.get_text() + '\n' for chapter in chapters)
        return output, published_date

    body = soup.find('div', class_='userstuff')
    if body is not None:
        return body.get_text(), published_date

    raise AssertionError(f'can not find text for {id_}')

def extract_info(id_, html):
    """Build the record for one work from its listing blurb plus its full text.

    Parameters
    ----------
    id_ : str
        Work id; passed to ``get_text_by_id`` to download the body text.
    html : bs4 element
        The listing entry for the work (an ``li[role=article]`` in the caller).

    Returns
    -------
    dict
        Keys: ``id, title, author, rating, fandoms, tags, warning, pairing,
        comments, kudos, hits, relationships, characters, summary, text,
        published_date, timestamp``. ``comments``, ``kudos`` and ``hits`` are
        the element text when present and the integer ``0`` otherwise.

    Raises
    ------
    AttributeError, IndexError
        If the blurb lacks the heading, fandoms heading, stats list, or has
        fewer than three ``span.text`` elements.
    AssertionError
        Propagated from ``get_text_by_id``.
    """
    stats = html.find('dl', class_='stats')

    def _stat(name):
        # Text of the <dd class=name> counter, or 0 when the blurb omits it.
        node = stats.find('dd', class_=name)
        return node.get_text() if node is not None else 0

    def _texts(name, class_name):
        return [tag.get_text() for tag in html.find_all(name, class_=class_name)]

    # Parse the cheap, local blurb first: if it is malformed we fail before
    # spending a network request on the full work.
    heading_lines = html.find('h4', class_='heading').get_text().split('\n')
    # The code reads these spans positionally: [0] rating, [1] warning, [2] pairing.
    symbols = html.find_all('span', class_="text")
    rating, warning, pairing = (symbols[i].get_text() for i in range(3))
    fandoms = [tag.get_text() for tag in html.find('h5', class_='fandoms heading').find_all('a')]
    summary_node = html.find('blockquote', class_='userstuff summary')
    summary = summary_node.get_text() if summary_node is not None else ''

    text, published_date = get_text_by_id(id_)

    return {
        'id': id_,
        'title': heading_lines[1],
        'author': heading_lines[-2],
        'rating': rating,
        'fandoms': fandoms,
        'tags': _texts('li', 'freeforms'),
        'warning': warning,
        'pairing': pairing,
        'comments': _stat('comments'),
        'kudos': _stat('kudos'),
        # Guarded like comments/kudos; previously a missing counter raised.
        'hits': _stat('hits'),
        'relationships': _texts('li', 'relationships'),
        'characters': _texts('li', 'characters'),
        'summary': summary,
        'text': text,
        'published_date': published_date,
        'timestamp': datetime.datetime.now()
    }
    

def is_valid(html):
    """Decide whether a listing entry is worth scraping.

    An entry qualifies when the text of its last ``span.text`` is
    ``'Complete Work'``, its ``dd.language`` is ``'English'`` and its
    ``dd.words`` count is strictly between 1000 and 25000.

    Parameters
    ----------
    html : bs4 element
        One listing entry.

    Returns
    -------
    bool
        ``True`` when all three conditions hold. ``False`` otherwise,
        including when any of the required elements is missing or the word
        count is empty or not a number.
    """
    status_spans = html.find_all('span', class_="text")
    stats = html.find('dl', class_='stats')
    # An entry that lacks the expected markup is simply not valid. Raising
    # here would abort the whole page in the caller.
    if not status_spans or stats is None:
        return False

    words_node = stats.find('dd', class_='words')
    language_node = stats.find('dd', class_='language')
    if words_node is None or language_node is None:
        return False

    try:
        # Counts are rendered with thousands separators, e.g. "12,345".
        words_count = int(words_node.get_text().replace(',', ''))
    except ValueError:
        # Covers the empty string as well as any non-numeric text.
        return False

    return (
        status_spans[-1].get_text() == 'Complete Work'
        and language_node.get_text() == 'English'
        and 1000 < words_count < 25000
    )

    
def get_ids_by_tag(tag, num_required, existed, page_offset=0, delay=300, max_errors=3):
    """Walk a tag's work listing and scrape works that pass ``is_valid``.

    Despite the name, this returns full records (from ``extract_info``), not
    bare ids.

    Parameters
    ----------
    tag : str
        Tag as it appears in the URL path (the caller supplies any
        percent-encoding).
    num_required : int
        Stop once at least this many records have been collected. The result
        may hold more, because the page in progress is always finished.
    existed : container of ids
        Ids to skip. Ids parsed from the listing are strings, so membership
        only matches if this container holds strings too.
    page_offset : int, optional
        Scraping starts at listing page ``1 + page_offset``.
    delay : int or float, optional
        Seconds to sleep after a failed page.
    max_errors : int, optional
        Give up once more than this many pages in a row have failed.

    Returns
    -------
    list of dict
        Records collected so far; also returned when giving up early.
    """
    url = 'https://archiveofourown.org/tags/{}/works?page={}'
    headers = {'user-agent': 'bot (sj784@cornell.edu)'}
    output = []

    page = 1 + page_offset
    # Must live outside the loop: resetting it at the top of every iteration
    # made the "give up" branch unreachable, so a run of bad pages looped
    # forever.
    consecutive_errors = 0

    while True:
        page_count = 0

        try:
            # The request is inside the try so a connection error is retried
            # instead of escaping and discarding everything in `output`.
            r = requests.get(url.format(tag, page), headers=headers, timeout=60)
            soup = BeautifulSoup(r.text, 'html.parser')

            works = soup.find_all('ol', class_='work index group')
            if not works:
                raise AssertionError(f'no work listing found on page {page}')

            for fanfic in works[0].find_all('li', role='article'):
                id_ = fanfic.find('h4', class_='heading').find('a', href=True)['href'].split('/')[-1]

                if id_ in existed:
                    continue
                if not is_valid(fanfic):
                    continue

                output.append(extract_info(id_, fanfic))
                page_count += 1

            print(f'Scraped {page_count} from page {page}. Total: {len(output)}')
            consecutive_errors = 0

            if len(output) >= num_required:
                break

            page += 1

        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still
        # stops the scrape.
        except Exception as exc:
            print(f'Page {page} failed ({exc!r}). Will try again in {delay} sec')
            consecutive_errors += 1

            if consecutive_errors > max_errors:
                print('Waited too long. Exit')
                return output

            time.sleep(delay)
            # NOTE(review): the original skips ahead 5 pages after a failure
            # rather than retrying the same page; kept as-is. Confirm that
            # dropping those pages is intended.
            page += 5

    print(f'Job complete. Scraped {len(output)} [{tag}] fanfics in total. Exit')
    return output

In [2]:
# Load the current database frame and collect the ids it already contains.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('data/ao3_db.p', 'rb') as db_file:
    existed_df = pickle.load(db_file)
old_ids = set(existed_df['id'].tolist())
len(old_ids)

97278

In [5]:
# Collect the ids stored in the ver1 snapshot.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# NOTE(review): this rebinds `existed_df`, which later cells append to and
# save as data/ao3_db.p. When run top to bottom, that save is based on this
# snapshot rather than on data/ao3_db.p. Confirm which frame is intended.
with open('data/ao3_ver1_full.p', 'rb') as snapshot_file:
    existed_df = pickle.load(snapshot_file)
old_ids2 = set(existed_df['id'].tolist())

In [7]:
# Ids present in both snapshots.
common = old_ids.intersection(old_ids2)
# Ids that appear in old_ids2 but not in old_ids.
in_old_not_in_new = old_ids2.difference(old_ids)

In [9]:
# Every id that appears in either snapshot.
u = old_ids.union(old_ids2)

In [10]:
# Number of distinct ids in the union `u` of old_ids and old_ids2.
len(u)

148106

In [None]:
# NOTE(review): this cell was left unfinished (no colon, no body), which is a
# SyntaxError. It is completed minimally so the notebook parses; the file is
# opened and closed without being read.
with open('data/corpus.line', 'rb') as corpus_file:
    pass  # TODO: decide what should be read from the corpus file

In [None]:
# Eight passes over the same tag list. Pass k starts 100 listing pages deeper
# than the previous one (page offsets 1000, 1100, ..., 1700).
scraped_ids = set()
data = []
tags_to_scrape = [
    'Slow%20Build',
    'Violence',
    'Love',
    'Family',
    'Friendship',
    'Relationship(s)',
    'Deviates%20From%20Canon',
    'Humor',
]
for iter_ in range(8):
    print('Iteration', iter_+1)
    page_offset = 1000 + iter_ * 100
    for tag in tags_to_scrape:
        # Fold everything scraped so far into the skip set, so a work that
        # carries several of these tags is fetched only once.
        old_ids.update(scraped_ids)
        data.extend(get_ids_by_tag(tag, 100, old_ids, page_offset))
        scraped_ids = {d['id'] for d in data}
    print(f'Scraped {len(data)} so far')

Iteration 1
Scraped 1 from page 1001. Total: 1
Scraped 0 from page 1002. Total: 1
Scraped 0 from page 1003. Total: 1
Scraped 1 from page 1004. Total: 2
Scraped 0 from page 1005. Total: 2
Scraped 0 from page 1006. Total: 2
Scraped 1 from page 1007. Total: 3
Scraped 2 from page 1008. Total: 5
Scraped 1 from page 1009. Total: 6
Scraped 2 from page 1010. Total: 8
Scraped 4 from page 1011. Total: 12
Scraped 2 from page 1012. Total: 14
Scraped 1 from page 1013. Total: 15
Scraped 4 from page 1014. Total: 19
Scraped 3 from page 1015. Total: 22
Scraped 1 from page 1016. Total: 23
Scraped 1 from page 1017. Total: 24
Scraped 1 from page 1018. Total: 25
Scraped 3 from page 1019. Total: 28
Scraped 4 from page 1020. Total: 32
Scraped 2 from page 1021. Total: 34
Scraped 1 from page 1022. Total: 35
Scraped 2 from page 1023. Total: 37
Scraped 1 from page 1024. Total: 38
Scraped 0 from page 1025. Total: 38
Scraped 2 from page 1026. Total: 40
Scraped 0 from page 1027. Total: 40
Scraped 0 from page 1028. 

In [60]:
# Number of records collected by the scraping loop.
len(data)

3872

In [61]:
def add_scraped_to_df(df, scraped_raw):
    """Return ``df`` with the usable scraped records appended.

    Records whose ``'text'`` is 1000 characters or shorter are dropped before
    appending.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing frame; not modified.
    scraped_raw : list of dict
        Scraped records, each with at least a ``'text'`` key.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. When ``scraped_raw`` is empty,
        this is ``df`` with its index reset.
    """
    scraped_df = pd.DataFrame(scraped_raw)
    # An empty record list yields a frame with no 'text' column; there is
    # nothing to filter or append in that case.
    if scraped_df.empty:
        return df.reset_index(drop=True)

    long_enough = scraped_df['text'].str.len() > 1000
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([df, scraped_df.loc[long_enough]], ignore_index=True)

In [62]:
# Append only records whose id is not already in the frame, so re-running this
# cell cannot add the same works twice.
known_ids = set(existed_df['id'].tolist())
new_records = [record for record in data if record['id'] not in known_ids]
if new_records:
    existed_df = add_scraped_to_df(existed_df, new_records)

In [63]:
# Row count of the frame after the scraped records were appended.
len(existed_df)

97878

In [64]:
# Show the updated frame as the cell result for a visual sanity check.
existed_df

Unnamed: 0,id,title,author,rating,fandoms,tags,warning,pairing,comments,kudos,hits,relationships,characters,summary,text,published_date,timestamp
0,31166018,"Don't Worry, You Are More Than Enough;",Bang_Daddy_Chan,Not Rated,[Stray Kids (Band)],"[Hwang Hyunjin is a Mess, Hwang Hyunjin is a P...","Choose Not To Use Archive Warnings, No Archive...","M/M, Multi, Other",4,52,502,"[Hwang Hyunjin/Yang Jeongin | I.N, Bang Chan/H...","[Hwang Hyunjin, Yang Jeongin | I.N, Lee Minho ...","\nThis spot where Hyunjin was now, used to be ...",Hyunjin couldn't focus. It wasn't something th...,2021-05-08,2021-05-15 18:01:46.777424
1,28520001,About That Kind of Desire...,Mina_chan95,Mature,[King of Fighters],"[you know I had to do it to 'em, Kyo didn't le...",Graphic Depictions Of Violence,Gen,14,5,47,[],"[Kusanagi Kyou, Kusanagi (King of Fighters), K...",\n[Sequel to Perishing Little Flame on Winding...,\nChapter Text\nCouple months have passed sinc...,2021-01-04,2021-05-15 18:01:46.995844
2,30910631,❃ 𝐄𝐏𝐇𝐄𝐌𝐄𝐑𝐀𝐋 || ᴘɪᴇᴛʀᴏ ᴍᴀxɪᴍᴏꜰꜰ ❃,MiniSized,Teen And Up Audiences,"[Marvel Cinematic Universe, Marvel]","[Avengers: Age of Ultron (Movie), Fluff, Angst...","Graphic Depictions Of Violence, Major Characte...",No category,0,18,416,[Pietro Maximoff/Reader],"[Wanda Maximoff, Pietro Maximoff, Vision (Marv...",\n⁽ᵃᵈʲ‧⁾ ˡᵃˢᵗⁱⁿᵍ ᶠᵒʳ ᵃ ᵛᵉʳʸ ˢʰᵒʳᵗ ᵃᵐᵒᵘⁿᵗ ᵒᶠ ᵗⁱ...,"\nChapter Text\n Sokovia, Europe HYDRA Resea...",2021-04-26,2021-05-15 18:01:47.270095
3,31165844,Warmth part II,"for JoeyWrites, jordypordy",Not Rated,[NieR: Automata (Video Game)],"[Angst, Hurt/Comfort, Fluff, Homoeroticism, En...",Choose Not To Use Archive Warnings,M/M,6,6,48,[9S/801S (NieR: Automata)],"[9S (NieR: Automata), 801S (NieR: Automata)]",\ntwo star-crossed lovers have one final chanc...,\nHe never comes back. This much 801S knows. E...,2021-05-08,2021-05-15 18:01:48.614115
4,31165823,love in the dark,ellyxts,Explicit,[Haikyuu!!],"[Angst, literally just angst, No Fluff, sorry ...",No Archive Warnings Apply,M/M,8,16,331,[Miya Atsumu/Sakusa Kiyoomi],"[Sakusa Kiyoomi, Miya Atsumu]",\nthe argument grew from nowhere into a sudden...,"""the person ya were, the one i fell in love wi...",2021-05-08,2021-05-15 18:01:48.796242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97873,30860549,the road less traveled by,andalucite,Teen And Up Audiences,[Leverage],"[POV Parker (Leverage), Alternate Universe - C...",No Archive Warnings Apply,"F/M, M/M, Multi",42,103,481,"[Alec Hardison/Parker/Eliot Spencer, Parker & ...","[Sophie Devereaux (Leverage), Nathan Ford, Dam...",\nParker has an itch between her shoulder blad...,Parker stands on the edge of the tallest build...,2021-04-23,2021-05-21 14:35:20.913222
97874,30860621,Something there,for sweeetbabe,Teen And Up Audiences,[Harry Potter - J. K. Rowling],"[Family Fluff, Developing Relationship, Fluff ...",No Archive Warnings Apply,"F/M, Gen",12,43,346,"[James Potter/Lily Evans Potter, Hermione Gran...","[James Potter, Lily Evans Potter, Sirius Black...",\nThere is something going on between Ron and ...,\nHarry looks at the letter Errol just droppe...,2021-04-23,2021-05-21 14:35:21.083597
97875,30847931,WILD,for lumosinlove,Teen And Up Audiences,[Relic Keel - lumosinlove],"[Pining, Mutual Pining, Songfic, Based on the ...",No Archive Warnings Apply,No category,2,7,44,[Luke Deveaux/Saint (lumosinlove)],"[Saint (lumosinlove), Luke Deveaux (lumosinlove)]",\nSaint and Luke are in a friends with benefit...,\nTrying hard not to fallOn the way homeYou we...,2021-04-22,2021-05-21 14:40:36.595024
97876,30847880,pharmakon,sincethestars,Teen And Up Audiences,"[Minecraft (Video Game), Dream SMP - Fandom, V...","[Tubbo-centric, Character Study, Relationship ...",No Archive Warnings Apply,Gen,4,116,668,"[Toby Smith | Tubbo & TommyInnit, Ranboo & Tob...","[Toby Smith | Tubbo, TommyInnit (Video Bloggin...","\nat one point they wander to the farm, rootin...","something is bothering tommy.well, more than u...",2021-04-22,2021-05-21 14:40:36.773876


In [65]:
# Save the updated frame over the database file. The `with` block guarantees
# the handle is flushed and closed even if pickling raises part-way.
with open('data/ao3_db.p', 'wb') as db_file:
    pickle.dump(existed_df, db_file)