# CSV Creator
This code contains the pipeline used to create the CSV file of all the SCPs. It takes a very long time to run for all the SCPs, because every SCP page is downloaded and parsed one after another.

## Pipeline Overview

#### Webscraping 
- 'get_scp_soup()'
    - gets soup object for a given scp number
- 'get_scp_maintext()'
    - gets the main text from the scraped scp
- 'get_class()'
    - gets the verbatim class of the scraped scp
- 'get_class_booleans()'
    - gets 0/1 flags for the six tracked classes (Safe, Euclid, Keter, Thaumiel, Anomalous, Neutralized)
- 'get_rating()'
    - gets the positive and overall ratings from the scraped scp
- 'get_subobjects()'
    - gets a space-separated string of the subobjects (A-Z and 1-100) that appear in the scraped scp's text
- 'get_tags()'
    - gets the tags for the scraped scp

#### Formatting Data
- 'get_dict()'
    - gets the dictionary with attributes for a certain scp
- 'scp_num()'
    - changes number to be readable by the scp wiki html

In [1]:
# all necessary imports
import requests
import json
from bs4 import BeautifulSoup
import string
import pandas as pd

## Pipeline for getting the Data

In [2]:
def get_scp_soup(scp):
    '''gets Beautiful Soup of the given scp number
    Args:
        scp (int or str): the number of the scp, written the way the wiki url
            expects it (e.g. '045' or 173)
    Returns:
        scp_soup (Beautiful Soup object): soup object from the scp html
    Raises:
        requests.RequestException: if the request fails, times out, or the wiki
            answers with an HTTP error status (e.g. a missing scp page)
    '''
    url = f'https://scp-wiki.wikidot.com/scp-{scp}'

    # a timeout keeps one stalled connection from hanging the whole scrape
    response = requests.get(url, timeout=30)

    # fail here on error pages instead of parsing them as if they were an scp
    response.raise_for_status()

    # name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "no parser" warning)
    scp_soup = BeautifulSoup(response.text, "html.parser")

    return scp_soup

In [3]:
def get_scp_maintext(num, scp_soup):
    '''gets the main story text from the scp soup object
    Args:
        num (int or str): the number of the scp the soup is for, written the way
            it appears on the page (e.g. '045' or 173)
        scp_soup (Beautiful Soup object): soup object for the scp's webpage
    Returns:
        scp_story (str): string that contains the story from the scp webpage,
            or '' if the page is not in a recognised format
    Raises:
        ValueError: if num cannot be converted to an integer
    '''
    # get the text using div and page-content from the soup
    content = scp_soup.find("div", id="page-content")
    if content is None:
        # no content container at all, so there is no story to extract
        return ""
    scp_fulltext = content.text

    # the story is assumed to end where the footer links to the previous scp
    # (« SCP-XXXX - 1); the wiki pads numbers below 100 with zeros, so the
    # previous number has to be padded too or the footer is never found
    previous = str(int(num) - 1).zfill(3)
    stop = scp_fulltext.find(f'« SCP-{previous}')
    if stop == -1:
        # no footer: keep everything to the end (a raw -1 would cut the last char)
        stop = len(scp_fulltext)

    # the story is assumed to start where the item is named; the usual header is
    # tried first, then the special-case spellings some scps use
    start_markers = (f'Item #: SCP-{num}', f'Item#: {num}', f'Item#: SCP-{num}')
    for marker in start_markers:
        start = scp_fulltext.find(marker)
        if start != -1 and start < stop:
            return scp_fulltext[start:stop]

    # not in any known format
    return ""

In [4]:
# gets positive and overall rating for the scp
def get_rating(soup):
    '''reads the positive and overall rating out of the rating-box image link
    Args:
        soup (Beautiful Soup object): soup for the scp
    Return:
        ratings (list): [positive rating, overall rating], both as int
    '''
    # the 8th element with class "image" is taken to be the rating box;
    # its src link carries the ratings as "&"-separated fields
    rating_src = soup.find_all(class_="image")[7]["src"]
    fields = rating_src.split("&")

    # 4th field holds the rating, 5th field holds the number of votes
    positive = int(fields[3].replace("rating=", ""))
    overall = int(fields[4].replace("rating_votes=", ""))

    return [positive, overall]

In [5]:
# gets sub-objects
def get_subobjects(num, text):
    '''finds which subobjects (SCP-<num>-<label>) are mentioned in an scp's text
    Args:
        num (int or str): the number of the scp
        text (str): the text of the main scp story
    Return:
        sub_objects (str): the labels of all subobjects found, separated by
            spaces (letters first, then numbers), or the word "None" if there
            are none- the word is used for similarity analysis
    '''
    # every label that is checked for: A-Z followed by 1-100
    candidates = [*string.ascii_uppercase, *range(1, 101)]

    # a label counts only when it is followed by a space, so that
    # e.g. SCP-X-1 is not reported just because SCP-X-10 appears
    found = [str(label) for label in candidates if f'SCP-{num}-{label} ' in text]

    return " ".join(found) if found else "None"

In [6]:
# make tags string seperated by spaces

def get_tags(soup):
    ''' gets the tags on the bottom of the page for the scp
    Args:
        soup (Beautiful Soup object): the beautiful soup object for the scp
    Returns:
        tags (str): the visible tags separated by spaces ('' if there are none)
    '''
    # get the part of the soup that has the page tags
    scp_tags = soup.find(class_="page-tags")
    if scp_tags is None:
        # page has no tag section at all
        return ""

    # every tag is its own <a> element inside the page-tags container
    tag_names = (anchor.text for anchor in scp_tags.find_all("a"))

    # tags that start with "_" are not wanted; empty link text is skipped as
    # well, since indexing its first character used to raise an IndexError
    return " ".join(
        name for name in tag_names if name and not name.startswith("_")
    )

In [7]:
def get_class(text):
    '''returns class of SCP based on SCP text
    Args:
        text (str): main text of the SCP page (as returned by get_scp_maintext())
    Return:
        class (str): string of SCP class (e.g. 'Euclid'), or None if the text
            has neither an "Object Class:" nor a "Containment Class:" entry
    '''
    # "Object Class:" is the usual label; "Containment Class:" is the fallback
    for label in ("Object Class:", "Containment Class:"):
        _, found, remainder = text.partition(label)
        if not found:
            continue

        # the class is the first line of text after the label; stripping the
        # whitespace (instead of skipping a fixed number of characters) works
        # whether the label is followed by a space, a newline or nothing, and
        # whether or not the class line ends with a newline
        value = remainder.lstrip().split("\n", 1)[0].rstrip()
        if value:
            return value

    return None

In [8]:
def get_class_booleans(class_str):
    '''turns the scraped class text into 0/1 flags, one per tracked class
    Args:
        class_str (str): text of class directly from web-scrape
    Return:
        safe, euclid, keter, thaumiel, anom, neutral (each 0 or 1): 1 when the
            matching class name appears in class_str; several can be 1 at once
    '''
    safe = int("Safe" in class_str)
    euclid = int("Euclid" in class_str)
    keter = int("Keter" in class_str)
    thaumiel = int("Thaumiel" in class_str)
    anom = int("Anomalous" in class_str)
    # the second spelling is matched literally as well
    neutral = int("Neutralized" in class_str or "NEUUtrallized" in class_str)

    return safe, euclid, keter, thaumiel, anom, neutral

In [9]:
def get_dict(num):
    ''' scrapes one scp and collects its attributes into a dictionary
    Args:
        num (str or int): number of scp

    Returns:
       dic_scp (dict): one entry per attribute of the scp (number, ratings, tags,
           sub-objects, word count, class flags and 0/1 flags for certain words
           appearing in the text)
    '''
    # scrape the page, then pull the page-level attributes out of the soup
    soup = get_scp_soup(num)
    pos_rating, all_rating = get_rating(soup)
    tags = get_tags(soup)

    # everything below is derived from the main text of the scp
    text = get_scp_maintext(num, soup)
    safe, euclid, keter, thaumiel, anom, neutral = get_class_booleans(get_class(text))
    word_count = len(text.split())
    sub_objects = get_subobjects(num, text)

    def mentions(*phrases):
        # 1 if any of the given phrases appears in the text, otherwise 0
        return int(any(phrase in text for phrase in phrases))

    # key order is the column order of the final dataframe
    return {
        "Number": num,
        "Pos Ratings": pos_rating,
        "All Ratings": all_rating,
        "Tags": tags,
        "Sub-Objects": sub_objects,
        "Word Count": word_count,
        "Safe": safe,
        "Euclid": euclid,
        "Keter": keter,
        "Thaumiel": thaumiel,
        "Anomalous": anom,
        "Neutral": neutral,
        "D-Class": mentions("D-Class", "D-class", "Personnel D-"),
        "Containment Breach": mentions("containment breach", "Containment breach"),
        "Addendum": mentions("Addendum"),
        "Task Force": mentions("Mobile Task Force"),
        "O5 Council": mentions("O5"),
        "Agent": mentions("agent", "Agent"),
        "[EXPUNGED]": mentions("EXPUNGED"),
        "[REDACTED]": mentions("REDACTED"),
        "Blacked Out": mentions("█"),
    }

## Make CSV

In [10]:
# empty dataframe that will hold one row per successfully scraped scp
scp_df = pd.DataFrame()

In [11]:
def scp_num(num):
    '''changes number to be readable by the scp wiki html
    Args:
        num (int or str): number
    Returns:
        n (str): number in format readable by scp wiki html- padded with zeros
            to at least 3 digits (so 45 becomes 045, 1000 stays 1000)

    '''
    # drop any zeros already in front, then pad back up to 3 digits; unlike
    # removing zeros one at a time, this never touches zeros inside the number
    # and cannot loop forever on numbers with 5 or more digits
    return str(num).lstrip("0").zfill(3)

In [15]:
# every scp number to scrape: 1-99 as zero-padded strings ('001' ... '099',
# the form the wiki url needs), followed by 100-7999 as plain integers
num_list = [scp_num(i) for i in range(1, 100)] + list(range(100, 8000))

In [17]:
# loop through all the numbers, keep a row only if every attribute could be scraped
scp_rows = []
skipped = []
for i in num_list:
    try:
        scp_rows.append(get_dict(i))
    except Exception as err:
        # best effort: an scp that cannot be scraped is left out, but it is
        # recorded instead of vanishing silently; catching Exception (not a bare
        # except) also lets Ctrl-C stop this very long loop
        skipped.append((i, repr(err)))

print(f"scraped {len(scp_rows)} scps, skipped {len(skipped)}")

# build the dataframe in one go: DataFrame.append was removed in pandas 2.0,
# and appending row by row copies the whole frame on every iteration
scp_df = pd.DataFrame(scp_rows)
scp_df.head()

# save the dataframe to a csv
scp_df.to_csv("all_scps_final.csv")