In [1]:
from bs4 import BeautifulSoup, SoupStrainer
import httplib2
import re
import pandas as pd
from itertools import chain
import time 
from google.colab import files

In [2]:
# Empty results table: the scraping loop below adds one row per SCP entry.
column_names = ['code', 'title', 'text', 'image captions', 'rating', 'state', 'tags', 'link']
scp_df = pd.DataFrame(columns=column_names)

In [3]:
# Single httplib2 client object, reused for every page request made by the scraping loop below.
http = httplib2.Http()

In [None]:
# Scrape every SCP entry page (SCP-001 .. SCP-6999) and store one row per entry in `scp_df`.
# Entry URLs all follow one pattern: the entry number, zero-padded to at least three digits.

FIRST_SCP = 1
LAST_SCP = 6999
ENTRIES_PER_PAUSE = 240   # pause after this many entries (based on api requests per minute limit)
PAUSE_SECONDS = 60
BASE_URL = 'https://scp-wiki.wikidot.com'
RATING_CLASS = 'number prw54353'
CAPTION_CLASS = 'scp-image-caption'

# Directory ("series") pages already downloaded, keyed by series number, so each
# one is requested once instead of once per entry.
_directory_cache = {}


def _scp_code(num):
  """Return the entry number as a string, zero-padded to at least three digits."""
  return str(num).zfill(3)


def _fetch_soup(page_url):
  """Request `page_url` with the shared `http` client and parse the body with html.parser."""
  # httplib2 returns (response headers, body content); only the body is needed here.
  _response, content = http.request(page_url)
  return BeautifulSoup(content, 'html.parser')


def _split_paragraphs(page_soup):
  """Return (paragraph_texts, caption_texts) for the <p> tags inside #page-content.

  A <p> counts as an image caption when the first class of its parent element is
  'scp-image-caption'. Both lists are empty when the page has no #page-content.
  The paragraphs are partitioned into two new lists rather than removed from the
  list being iterated, so no paragraph is skipped.
  """
  content = page_soup.find(id='page-content')
  if content is None:
    return [], []
  paragraphs, captions = [], []
  for p_tag in content.find_all('p'):
    parent_classes = p_tag.parent.attrs.get('class')
    if parent_classes and parent_classes[0] == CAPTION_CLASS:
      captions.append(p_tag.text)
    else:
      paragraphs.append(p_tag.text)
  return paragraphs, captions


def _rating_text(page_soup):
  """Return the text of the rating element, or None when the page has none."""
  rating_tag = page_soup.find(class_=RATING_CLASS)
  return rating_tag.text if rating_tag is not None else None


def _page_tags(page_soup):
  """Return the list of tag names in div.page-tags, or None when the page has no tag box."""
  tag_box = page_soup.find('div', {'class': 'page-tags'})
  tag_span = tag_box.find('span') if tag_box is not None else None
  if tag_span is None:
    return None
  return [a_tag.text for a_tag in tag_span.find_all('a')]


def _join_tags(*tag_lists):
  """Merge tag lists into one space-separated string with commas removed.

  Lists that are None are ignored; returns None when every list is None.
  Duplicates are dropped and first-seen order is kept, so the result is deterministic.
  """
  present = [tags for tags in tag_lists if tags is not None]
  if not present:
    return None
  unique = dict.fromkeys(name for tags in present for name in tags)
  return ' '.join(unique).replace(',', '')


def _directory_soup(num):
  """Return the parsed directory page of the series containing entry `num`, downloading it at most once."""
  series = num // 1000 + 1
  if series not in _directory_cache:
    if series == 1:
      dir_url = '{}/scp-series'.format(BASE_URL)
    else:
      dir_url = '{}/scp-series-{}'.format(BASE_URL, series)
    _directory_cache[series] = _fetch_soup(dir_url)
  return _directory_cache[series]


def _find_title(dir_soup, num):
  """Return the text title (not the SCP number code) of entry `num` from its directory page.

  Some entries have no code, or several codes, on the directory page. When the
  entry's own link is missing, the title is taken from the list item after the
  previous entry, or failing that from the item before the next entry.
  Returns None when none of these can be found.
  """
  def entry_link(k):
    # Directory links carry the same zero-padded code as the entry URLs.
    return dir_soup.find('a', string='SCP-{}'.format(_scp_code(k)))

  own = entry_link(num)
  if own is not None:
    return own.parent.text.split(' - ')[-1]

  before = entry_link(num - 1)
  if before is not None:
    item = before.parent.find_next_sibling()
    if item is not None:
      return item.text.split(' - ')[-1]

  after = entry_link(num + 1)
  if after is not None:
    item = after.parent.find_previous_sibling()
    if item is not None:
      # drops the code part of titles that do have a code here
      return item.text.split(' - ')[-1]
  return None


def _format_text(paragraphs):
  """Join non-blank paragraphs with ' \\n ' inside literal double quotes; None for an empty list."""
  if not paragraphs:
    return None
  kept = [part.strip() for part in paragraphs if part.strip()]
  return '"{}"'.format(' \n '.join(kept))


# Rows are collected here and turned into the DataFrame once at the end. If the loop
# is interrupted, the rows scraped so far are still available in this list.
scraped_rows = []

for n in range(FIRST_SCP, LAST_SCP + 1):
  # timeout to limit load on the site
  if n % ENTRIES_PER_PAUSE == 0:
    print(n)
    time.sleep(PAUSE_SECONDS)

  num_as_str = _scp_code(n)

  # requesting page for current/target scp
  url = '{}/scp-{}'.format(BASE_URL, num_as_str)
  soup = _fetch_soup(url)

  state = None
  rating = None
  alt_tags = None        # tags from the adult / restricted version of the page, if any
  content_soup = soup    # page whose paragraphs are used as the entry text

  # age-restricted entries show a warning page; the real content is at a separate url
  if soup.find('div', {'id': 'u-adult-warning'}) is not None:
    state = "age restricted"
    url = '{}/adult:scp-{}/noredirect/true'.format(BASE_URL, num_as_str)
    content_soup = _fetch_soup(url)
    rating = _rating_text(content_soup)
    alt_tags = _page_tags(content_soup)

  # p tags include image captions, which are split out here; may be other similar cases left in
  text, captions = _split_paragraphs(content_soup)

  # a few pages have no rating, primarily the ones that are 'blocked' or similar;
  # fall back to the main page when the adult page had none
  if rating is None:
    rating = _rating_text(soup)

  if soup.find(lambda x: x.name == "a" and "clearance credentials" in x.text):
    state = "restricted"
    # add /offset/1 after regular scp url , need to check if works for all
    # note this assumes a page can't be both restricted as adult content and as needing clearance, v likely but not certain
    url = url + '/offset/1'
    restr_soup = _fetch_soup(url)
    text, restr_captions = _split_paragraphs(restr_soup)
    # captions found on the notification page are kept, with the restricted page's added after them
    captions.extend(restr_captions)
    alt_tags = _page_tags(restr_soup)

  # getting tags - some have underscores at the start and are not displayed on the page, leaving them in for now
  main_tags = _page_tags(soup)
  if not state:
    tags = _join_tags(main_tags)
  else:
    # just in case useful tags are spread across the two pages, tags from the
    # notification/warning page are combined with those from the content page
    tags = _join_tags(main_tags, alt_tags)

  # title comes from the directory page for this entry's series
  title = _find_title(_directory_soup(n), n)

  # quotes to preserve commas and not cause csv issues - joining up p tags with newlines.
  # Formatting happens once here, for every kind of page.
  # NOTE(review): DataFrame.to_csv already quotes fields that contain commas or newlines, so
  # these literal quotes end up escaped in the file; kept so the output format is unchanged.
  text = _format_text(text)
  captions = '"{}"'.format(' \n '.join(captions)) if captions else None

  # setting status - active, deleted, blocked, age restricted, clearance required
  if state:  # age restricted and restricted will already have been assigned
    pass
  elif text is None:
    state = "missing"
  elif "This page may have been moved or deleted" in text:
    state = "deleted"
  elif title is not None and "[Blocked]" in title:
    state = "blocked"
  else:
    state = "active"  # this catches anomalies too, eg pages with no content; can filter after as most columns will be None

  scraped_rows.append([
    'SCP-{}'.format(num_as_str),
    '"{}"'.format(title) if title is not None else None,
    text,
    captions,
    rating,
    state,
    tags,
    url,
  ])

# Build the results table in one step, using the column layout set up for scp_df earlier.
scp_df = pd.DataFrame(scraped_rows, columns=scp_df.columns)


# Write the scraped table to a CSV file (row index left out).
csv_name = 'scp6999morecolsall.csv'
scp_df.to_csv(csv_name, index=False)

# Auto download after saving - wait first, to try and ensure the large file
# is actually available/complete before the download starts.
time.sleep(500)
files.download(csv_name)