In [1]:
# random scp generator
import random

def rand_scp_num(x = 1, max_num = 5999):
    ''' generates x random scp designations, formatted the way they appear in an scp page url
    Args:
        x = 1 (int): integer for amount of scp numbers you want returned
        max_num = 5999 (int): largest scp number that may be drawn (inclusive)
    Returns:
        scp_nums (list): a list of x strings for numbers from 1 to max_num; numbers below
            1000 are zero-padded to three digits (e.g. "007", "070"), larger numbers are
            left unchanged (e.g. "2171")
    '''
    # every number is drawn independently, so the same scp can appear more than once
    # zfill(3) turns 7 into "007" and 70 into "070" and leaves four digit numbers alone
    return [str(random.randint(1, max_num)).zfill(3) for _ in range(x)]

In [2]:
# function to input scp number into and return soup object for that scp's webpage
import requests
import json
from bs4 import BeautifulSoup

def get_scp_soup(scp, timeout = 30):
    '''gets Beautiful Soup of the given scp number
    Args:
        scp (int or str): the number of the scp; values shorter than three characters
            are zero-padded (70 -> "070") to match the numbers produced by rand_scp_num
        timeout = 30 (float): seconds to wait for the server before giving up
    Returns:
        scp_soup (Beautiful Soup object): soup object from the scp html
    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched in time
    '''
    # pages below 1000 are addressed with three digits (scp-070), so pad plain ints like 70
    page = str(scp).zfill(3)
    url = f'https://scp-wiki.wikidot.com/scp-{page}'

    # without a timeout a stalled connection would block the notebook forever
    html = requests.get(url, timeout = timeout).text

    # name the parser explicitly: otherwise bs4 picks whichever parser happens to be
    # installed, which can give different trees on different machines (and a warning)
    scp_soup = BeautifulSoup(html, "html.parser")

    return scp_soup

In [59]:
def get_scp_maintext(num, scp_soup):
    '''gets the main story text from the scp soup object
    Args:
        num (int or str): the number of the scp the soup is for
        scp_soup (Beautiful Soup object): soup object for the scp's webpage
    Returns:
        scp_story (str): the story from the scp webpage, or '' when the page has no
            page-content div or no "Item #: SCP-XXXX" header
    '''
    # get the text using div and page-content from the soup
    content = scp_soup.find("div", id = "page-content")
    if content is None:
        return ''
    scp_fulltext = content.text

    # the page text has things before and after the actual story:
    #   - the story is assumed to start where the item is named (Item #: SCP-XXXX)
    #   - the story is assumed to end at the footer link to the previous scp (« SCP-XXXX - 1)
    # numbers below 1000 are written with three digits on the page, so both markers are
    # zero-padded (for "070" the footer reads "« SCP-069", not "« SCP-69")
    designation = str(num).zfill(3)
    previous = str(int(num) - 1).zfill(3)

    start = scp_fulltext.find(f'Item #: SCP-{designation}')
    if start == -1:
        # no header found: callers treat the empty string as "no story available"
        return ''

    # only look for the footer after the header; if it is missing keep everything up to
    # the end of the text instead of slicing with -1 (which would drop the last character)
    stop = scp_fulltext.find(f'« SCP-{previous}', start)
    if stop == -1:
        stop = len(scp_fulltext)

    scp_story = scp_fulltext[start : stop]

    return scp_story
    

In [64]:

# demo: collect the attributes of SCP-2171 and show them as a one-row table
# NOTE: get_dict and pd are only defined/imported in cells further down this notebook,
# so on a fresh kernel those cells have to be run before this one
d = get_dict(2171)
pd.DataFrame([d])

Unnamed: 0,Number,Class,Pos Ratings,Pos Rating Rate,D-Class,Tags,Text,Sub-Objects,Word Count,Sentiment
0,2171,Euclid Safe,90,0.714286,False,"[acoustic, autonomous, extradimensional, human...",Item #: SCP-2171\nObject Class: Euclid Safe\nS...,[1],1597,


In [4]:
# gets positive and overall rating for the scp
def get_rating(soup):
    '''gets the rating numbers for the scp from the url of its rating image
    Args:
        soup (Beautiful Soup object): soup for the scp
    Return:
        ratings (list): list with two ints: the value of the "rating" parameter and the
            value of the "rating_votes" parameter found in the image link
    Raises:
        ValueError: if no element with class "image" carries both parameters
    '''
    # look through every class=image element instead of trusting a fixed position
    # (the rating box is not guaranteed to be the 8th image on every page)
    for image in soup.find_all(class_ = "image"):
        src = image.get("src") or ""

        # split the link into its key=value pieces and pick the two we need by name,
        # so the order of the parameters in the link does not matter
        values = {}
        for part in src.replace("?", "&").split("&"):
            key, sep, value = part.partition("=")
            if sep and key in ("rating", "rating_votes"):
                values[key] = value

        if len(values) == 2:
            try:
                return [int(values["rating"]), int(values["rating_votes"])]
            except ValueError:
                # parameters present but not numeric: keep looking at the other images
                continue

    raise ValueError("no image with rating and rating_votes parameters was found")

In [63]:
import string
def get_subobjects(num, text):
    '''gets a list of the identifiers of all subobjects (SCP-XXXX-1, SCP-XXXX-A, ...) for an scp
    Args:
        num (int or str): the number of the scp
        text (str): the text of the main scp story
    Return:
        sub_objects (list): the identifiers found, as strings: numbers first in numeric
            order (any number of digits, e.g. "1", "2", "10"), then single uppercase
            letters in alphabetical order; None if the text mentions no subobject
    '''
    import re

    # numbers below 1000 are written with three digits in the text (SCP-070-1)
    designation = re.escape(str(num).zfill(3))

    # a subobject is the designation followed by "-" and either a number or one uppercase
    # letter; the lookahead stops "SCP-XXXX-A" from matching inside "SCP-XXXX-ARC" while
    # still accepting a following space, punctuation, newline or the end of the text
    pattern = re.compile(rf'SCP-{designation}-([0-9]+|[A-Z])(?![0-9A-Za-z])')
    found = set(pattern.findall(text))

    if not found:
        return None

    numbers = sorted((s for s in found if s.isdigit()), key = lambda s: (int(s), s))
    letters = sorted(s for s in found if not s.isdigit())
    return numbers + letters

In [6]:
def get_tags(soup):
    ''' gets the tags on the bottom of the page for the scp
    Args:
        soup (Beautiful Soup object): the beautiful soup object for the scp
    Returns:
        tags (list): a list of the tag names as strings; empty if the page has no
            element with class "page-tags"
    '''
    # get the part of the soup that has the page tags
    scp_tags = soup.find(class_ = "page-tags")
    if scp_tags is None:
        return []

    # each tag is a link (<a>) inside that element; keep the text of every link, except
    # links without text and tags whose name starts with "_"
    tags = []
    for tag in scp_tags.find_all("a"):
        name = tag.text
        # startswith is safe on an empty string, unlike indexing name[0]
        if name and not name.startswith("_"):
            tags.append(name)

    return tags

In [7]:
def get_class(text):
    '''returns class of SCP based on SCP text
    Args:
        text (str): main text of the SCP page
    Return:
        class (str): what follows "Object Class:" up to the end of that line
            (e.g. 'Euclid'), or None if the text has no "Object Class:" label
    '''
    _, label, rest = text.partition("Object Class:")
    if not label:
        return None

    # skip the single separator (space or line break) after the colon, but only if there
    # is one, so "Object Class:Safe" does not lose its first letter
    if rest[:1].isspace():
        rest = rest[1:]

    # the class runs to the end of the line; when it is the last line there is no "\n"
    # and the whole remainder is returned
    return rest.split("\n", 1)[0]

In [25]:
def get_dict(num):
    ''' get a dictionary with information about different attributes for an scp
    Args:
        num (int or str): number of scp

    Returns: 
       dic_scp (dict): dictionary with the keys "Number", "Class", "Pos Ratings",
           "Pos Rating Rate", "D-Class", "Tags", "Text", "Sub-Objects", "Word Count"
           and "Sentiment" (always None here)
    '''
    # get the scp's soup (Beautiful Soup object)
    soup = get_scp_soup(num)

    # get rating of scp: [rating, number of votes]
    ratings = get_rating(soup)

    # get the tags of the scp
    tags = get_tags(soup)

    # get the main text of the scp and everything that is derived from it
    text = get_scp_maintext(num, soup)
    scp_class = get_class(text)
    word_count = len(text.split())

    # get list of sub objects
    sub_objects = get_subobjects(num, text)

    # get boolean of if D-Class is mentioned in text
    dclass = "D-Class" in text

    # a page without any votes has no meaningful rate; avoid dividing by zero
    rating, votes = ratings[0], ratings[1]
    rating_rate = rating / votes if votes else None

    # create dic with the scp attributes, to eventually add to dataframe
    dic_scp = {"Number" : num, "Class" : scp_class, "Pos Ratings" : rating, "Pos Rating Rate" : rating_rate,
               "D-Class" : dclass, "Tags" : tags, "Text" : text, "Sub-Objects" : sub_objects,
               "Word Count" : word_count, "Sentiment" : None}
    return dic_scp

In [60]:
import pandas as pd

def get_random_dataframe(x=1):
    '''creates dataframe with attributes for a number of random scps
    Args:
        x=1 (int): optional number of how many scps to put in the dataframe
    Return:
        df_scp (df): dataframe indexed by "Number" with one row per random scp whose
            text is not empty; scps with empty text are reported with a printed
            message and left out, so the frame can have fewer than x rows
    '''
    columns = ["Number", "Class", "Pos Ratings", "Pos Rating Rate", "Sub-Objects",
               "D-Class", "Tags", "Text", "Word Count", "Sentiment"]

    # collect one dictionary per scp and build the dataframe once at the end;
    # DataFrame.append no longer exists in pandas 2.0 and copied the frame on every row
    records = []

    # create x random scp numbers and get the dictionary of attributes for each
    for num in rand_scp_num(x):
        scp_dict = get_dict(num)

        # only keep the scp if there is text
        if scp_dict["Text"] == '':
            print(f'Access to SCP-{num} is restricted')
        else:
            records.append(scp_dict)

    # passing columns keeps the column order fixed and still works when records is empty
    df_scp = pd.DataFrame(records, columns = columns)

    # set the index as the SCP number in the dataframe
    df_scp = df_scp.set_index("Number")

    return df_scp
    

In [65]:
# build a table for 10 randomly drawn scps; scps whose text comes back empty are
# reported as restricted by get_random_dataframe and are not included in the table
# NOTE: no random seed is set, so every run draws a different set of scps
df_scp = get_random_dataframe(10)
df_scp

Access to SCP-2999 is restricted
Access to SCP-4225 is restricted
Access to SCP-5452 is restricted


Unnamed: 0_level_0,Class,Pos Ratings,Pos Rating Rate,Sub-Objects,D-Class,Tags,Text,Word Count,Sentiment
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4328,Euclid,22,0.289474,[1],False,"[euclid, extradimensional, portal, scp]",Item #: SCP-4328\nObject Class: Euclid\nSpecia...,1124,
5609,Keter,45,0.737705,,False,"[biological, extraterrestrial, insect, jam-con...",Item #: SCP-5609\nObject Class: Keter\n\n\n\n\...,514,
70,Safe,165,0.48105,,False,"[alive, autonomous, humanoid, mechanical, safe...",Item #: SCP-070\nObject Class: Safe\nSpecial C...,824,
447,Safe,824,0.780303,"[1, 2]",False,"[cadaver, ectoentropic, liquid, safe, scp, sph...",Item #: SCP-447\nObject Class: Safe\nSpecial C...,1000,
3172,Safe,92,0.884615,,False,"[location, plant, safe, scp, telepathic, westh...",Item #: SCP-3172\nObject Class: Safe\nSpecial ...,1330,
4192,Safe,301,0.893175,,False,"[electromagnetic, microscopic, miniature, revi...",Item #: SCP-4192\nObject Class: Safe\n\n\nSite...,498,
4882,Euclid,157,0.928994,,False,"[broken-god, computer, electronic, euclid, met...",Item #: SCP-4882\nObject Class: Euclid\n\n\nDe...,1831,
