In [22]:
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm, trange
import time
import pickle
import datetime
import pandas as pd

In [95]:
def get_text_by_id(id_):
    url = 'https://archiveofourown.org/works/{}?view_full_work=true'
    headers = {'user-agent': 'bot (sj784@cornell.edu)'}
    
    r = requests.get(url.format(id_), headers=headers)
    soup = BeautifulSoup(r.text)
    
    assert soup != None
        
    stats = soup.find('dd', class_='stats')
    
    assert stats != None
    
    published_date = stats.find('dl', class_='stats').find('dd', class_='published').get_text() \
        if stats.find('dd', class_='published') else ''
    
    output = ''
    if soup.find_all('div', class_="userstuff module", role='article'):
        for chapter in soup.find_all('div', class_="userstuff module", role='article'):
            output += chapter.get_text()+'\n'
    
    elif soup.find('div', class_='userstuff'):
        output += soup.find('div', class_='userstuff').get_text()
    
    else:
        # print(f'can not find text for {id_}')
        raise AssertionError
        
    return output, published_date

def extract_info(id_, html):
    stats = html.find('dl', class_='stats')
    text, published_date = get_text_by_id(id_)
    return {
        'id': id_, # id
        'title': html.find('h4', class_='heading').get_text().split('\n')[1], # title
        'author': html.find('h4', class_='heading').get_text().split('\n')[-2],
        'rating': html.find_all('span', class_="text")[0].get_text(), # rating
        'fandoms': [tag.get_text() for tag in html.find('h5', class_='fandoms heading').find_all('a')], # fandoms
        'tags': [tag.get_text() for tag in html.find_all('li', class_='freeforms')], # tags
        'warning': html.find_all('span', class_="text")[1].get_text(), # warning
        'pairing': html.find_all('span', class_="text")[2].get_text(), # pairing,
        'comments': stats.find('dd', class_='comments').get_text() if stats.find('dd', class_='comments') else 0,
        'kudos': stats.find('dd', class_='kudos').get_text() if stats.find('dd', class_='kudos') else 0,
        'hits': stats.find('dd', class_='hits').get_text(),
        'relationships': [tag.get_text() for tag in html.find_all('li', class_='relationships')],
        'characters': [tag.get_text() for tag in html.find_all('li', class_='characters')],
        'summary': html.find('blockquote', class_='userstuff summary').get_text() if html.find('blockquote', class_='userstuff summary') else '',
        'text': text, # content
        'published_date': published_date,
        'timestamp': datetime.datetime.now()
    }
    

def is_valid(html):
    status = html.find_all('span', class_="text")[-1].get_text()
    
    stats = html.find('dl', class_='stats')
    words_count = stats.find('dd', class_='words').get_text().replace(',','')
    
    if not words_count:
        return False
    
    language = stats.find('dd', class_='language').get_text() 
    
    return status == 'Complete Work' and language == 'English' \
        and int(words_count) > 1000 and int(words_count) < 25000

    
def get_ids_by_tag(tag, num_required, existed, page_offset=0):
    url = 'https://archiveofourown.org/tags/{}/works?page={}'
    headers = {'user-agent': 'bot (sj784@cornell.edu)'}
    output = []
    
    page = 1 + page_offset
    delay = 300
    
    
    while True:
        page_count = 0

        r = requests.get(url.format(tag, page), headers=headers)
        soup = BeautifulSoup(r.text)
        
        try:
            error_cyle = 0
            
            works = soup.find_all('ol', class_='work index group')
            assert len(works) > 0
            
            for fanfic in works[0].find_all('li', role='article'):
                id_ = fanfic.find('h4', class_='heading').find('a', href=True)['href'].split('/')[-1]
                
                if id_ in existed:
                    continue
                if not is_valid(fanfic): 
                    continue
                
                output.append(extract_info(id_, fanfic))
                page_count += 1

            print(f'Scraped {page_count} from page {page}. Total: {len(output)}')
            
            if len(output) >= num_required:
                break 
            
            page += 1
        
        except AssertionError:
            print(f'Timed out. Will try again in {delay} sec')
            error_cyle += 1
            
            if error_cyle > 3:
                print('Waited too long. Exit')
                return output
            
            time.sleep(delay)
            page += 5
        

    
    print(f'Job complete. Scraped {len(output)} [{tag}] fanfics in total. Exit')
    return output

In [108]:
existed_df = pickle.load(open('data/ao3_db.p', 'rb'))
old_ids = set(existed_df['id'].tolist())
len(old_ids)

14056

In [None]:
scraped_ids = set()
data = []
for iter_ in range(10):
    print('Iteration', iter_+1)
    for tag in ['Angst', 'Fluff', 'Smut', 'Romance', 'Alternate%20Canon', 
                'Alternate%20Universe', 'Relationship(s)', 
                'Hurt*s*Comfort', 'Sexual%20Content']:
        old_ids |= scraped_ids
        data += get_ids_by_tag(tag, 100, old_ids, 3000 + iter_ * 100)
        scraped_ids = set([d['id'] for d in data])
    print(f'Scraped {len(data)} so far')

Iteration 1
Scraped 9 from page 3001. Total: 9
Scraped 6 from page 3002. Total: 15
Scraped 6 from page 3003. Total: 21
Scraped 11 from page 3004. Total: 32
Scraped 11 from page 3005. Total: 43
Scraped 7 from page 3006. Total: 50
Scraped 6 from page 3007. Total: 56
Scraped 5 from page 3008. Total: 61
Scraped 6 from page 3009. Total: 67
Scraped 13 from page 3010. Total: 80
Scraped 6 from page 3011. Total: 86
Scraped 3 from page 3012. Total: 89
Scraped 10 from page 3013. Total: 99
Timed out. Will try again in 300 sec
Scraped 7 from page 3019. Total: 113
Job complete. Scraped 113 [Angst] fanfics in total. Exit
Scraped 8 from page 3001. Total: 8
Scraped 5 from page 3002. Total: 13
Scraped 5 from page 3003. Total: 18
Scraped 7 from page 3004. Total: 25
Scraped 5 from page 3005. Total: 30
Scraped 3 from page 3006. Total: 33
Scraped 5 from page 3007. Total: 38
Scraped 6 from page 3008. Total: 44
Scraped 4 from page 3009. Total: 48
Scraped 12 from page 3010. Total: 60
Scraped 6 from page 3011. 

In [104]:
def add_scraped_to_df(df, scraped_raw):
    scraped_df = pd.DataFrame(scraped_raw)
    mask = scraped_df['text'].str.len() > 1000
    scraped_df = scraped_df.loc[mask]
    return df.append(scraped_df, ignore_index=True)

In [105]:
existed_df = add_scraped_to_df(existed_df, data)

In [106]:
len(existed_df)

14187

In [109]:
existed_df

Unnamed: 0,id,title,author,rating,fandoms,tags,warning,pairing,comments,kudos,hits,relationships,characters,summary,text,published_date,timestamp
0,31166018,"Don't Worry, You Are More Than Enough;",Bang_Daddy_Chan,Not Rated,[Stray Kids (Band)],"[Hwang Hyunjin is a Mess, Hwang Hyunjin is a P...","Choose Not To Use Archive Warnings, No Archive...","M/M, Multi, Other",4,52,502,"[Hwang Hyunjin/Yang Jeongin | I.N, Bang Chan/H...","[Hwang Hyunjin, Yang Jeongin | I.N, Lee Minho ...","\nThis spot where Hyunjin was now, used to be ...",Hyunjin couldn't focus. It wasn't something th...,2021-05-08,2021-05-15 18:01:46.777424
1,28520001,About That Kind of Desire...,Mina_chan95,Mature,[King of Fighters],"[you know I had to do it to 'em, Kyo didn't le...",Graphic Depictions Of Violence,Gen,14,5,47,[],"[Kusanagi Kyou, Kusanagi (King of Fighters), K...",\n[Sequel to Perishing Little Flame on Winding...,\nChapter Text\nCouple months have passed sinc...,2021-01-04,2021-05-15 18:01:46.995844
2,30910631,❃ 𝐄𝐏𝐇𝐄𝐌𝐄𝐑𝐀𝐋 || ᴘɪᴇᴛʀᴏ ᴍᴀxɪᴍᴏꜰꜰ ❃,MiniSized,Teen And Up Audiences,"[Marvel Cinematic Universe, Marvel]","[Avengers: Age of Ultron (Movie), Fluff, Angst...","Graphic Depictions Of Violence, Major Characte...",No category,0,18,416,[Pietro Maximoff/Reader],"[Wanda Maximoff, Pietro Maximoff, Vision (Marv...",\n⁽ᵃᵈʲ‧⁾ ˡᵃˢᵗⁱⁿᵍ ᶠᵒʳ ᵃ ᵛᵉʳʸ ˢʰᵒʳᵗ ᵃᵐᵒᵘⁿᵗ ᵒᶠ ᵗⁱ...,"\nChapter Text\n Sokovia, Europe HYDRA Resea...",2021-04-26,2021-05-15 18:01:47.270095
3,31165844,Warmth part II,"for JoeyWrites, jordypordy",Not Rated,[NieR: Automata (Video Game)],"[Angst, Hurt/Comfort, Fluff, Homoeroticism, En...",Choose Not To Use Archive Warnings,M/M,6,6,48,[9S/801S (NieR: Automata)],"[9S (NieR: Automata), 801S (NieR: Automata)]",\ntwo star-crossed lovers have one final chanc...,\nHe never comes back. This much 801S knows. E...,2021-05-08,2021-05-15 18:01:48.614115
4,31165823,love in the dark,ellyxts,Explicit,[Haikyuu!!],"[Angst, literally just angst, No Fluff, sorry ...",No Archive Warnings Apply,M/M,8,16,331,[Miya Atsumu/Sakusa Kiyoomi],"[Sakusa Kiyoomi, Miya Atsumu]",\nthe argument grew from nowhere into a sudden...,"""the person ya were, the one i fell in love wi...",2021-05-08,2021-05-15 18:01:48.796242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14182,30246960,"""Better Than the Credits"" Shirou x reader (sli...",Genderfluid_insomniac,Mature,[BNA: Brand New Animal (Anime)],"[Fluff, Smut, Fluff and Smut, theyre both over...",Choose Not To Use Archive Warnings,"F/M, Gen, Multi",0,2,125,[Ogami Shirou/Reader],"[Kagemori Michiru, Ogami Shirou]",\nShirou and Y/N have the house alone as are w...,Shirou and Y/N were sitting around on a Friday...,2021-03-24,2021-05-16 12:22:03.753465
14183,30246939,all up in the air,for yanak324,Explicit,[A Song of Ice and Fire - George R. R. Martin],"[fuck i forgot how to tag fics, Alternate Univ...",Choose Not To Use Archive Warnings,F/M,2,8,125,[Lyanna Mormont/Rickon Stark],"[Rickon Stark, Lyanna Mormont]",\nRickon and Lyanna cross paths at their high ...,\n\n\n\nEven amongst the chatter of familiar s...,2021-03-24,2021-05-16 12:22:03.957424
14184,30246948,cosmical laughter,VisionaryPowerhouse,Explicit,[Voltron: Legendary Defender],"[Astral Plane Fucking, hell yeah, relationship...",Choose Not To Use Archive Warnings,M/M,0,18,396,[Keith/Shiro (Voltron)],"[Keith (Voltron), Shiro (Voltron), Black Lion ...",\nKeith has a little surprise encounter in the...,As if Keith wasn’t already thinking about how ...,2021-03-24,2021-05-16 12:22:04.115457
14185,30246921,Imagine This....,MrsParkJimin18,Explicit,[No Fandom],"[Sexual Fantasy, Lesbian Sex, Oral Sex, Vagina...",Choose Not To Use Archive Warnings,F/F,6,27,1057,[Reader/Undisclosed],[Reader],\nJust a little scenario to use your imaginati...,"\nIt’s a quiet Saturday night, the only plans ...",2021-03-25,2021-05-16 12:22:04.280040


In [107]:
pickle.dump(existed_df, open('data/ao3_db.p', 'wb'))