In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import numpy as np
import webbrowser  

In [2]:
df = pd.read_csv("all_scps_final.csv")
df.head()

Unnamed: 0,Number,Tags,Sub-Objects,Pos Ratings,All Ratings,Word Count,Safe,Euclid,Keter,Thaumiel,...,Neutral,D-Class,Containment Breach,Addendum,Task Force,O5 Council,Agent,[EXPUNGED],[REDACTED],Blacked Out
0,2.0,alive euclid featured scp structure transfigur...,,1832.0,2206.0,857.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,3.0,alive biological computer director-moose eucli...,1 2,824.0,1056.0,1789.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,4.0,euclid mind-affecting portal scp spacetime str...,1 2 7 12 13 14,1154.0,1340.0,1452.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
3,5.0,adaptive safe scp,,710.0,1268.0,373.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.0,liquid location medical rewrite safe scp self-...,,686.0,1212.0,491.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [3]:
# create own tokenizer to keep single characters
import re
def my_tokenizer(text):
    # create a space between special characters 
    text=re.sub("(\\W)"," \\1 ",text)
    # split based on whitespace
    return re.split("\\s+",text)

# vectorize the Tags and Sub-Objects columns, save them as arrays
vectorizer = CountVectorizer(tokenizer = my_tokenizer)
objects_array = vectorizer.fit_transform(df["Sub-Objects"])
objects_array = objects_array.toarray()

vectorizer = CountVectorizer()
tags_array = vectorizer.fit_transform(df["Tags"])
tags_array = tags_array.toarray()

# make the arrays into lists and add them into their own columns in a new dataframe
df_array = pd.DataFrame()
df_array["Number"] = df["Number"]
df_array = df_array.set_index("Number")
df_array["Tags Array"] = list(tags_array)
df_array["Objects Array"] = list(objects_array)

df_array.head()

Unnamed: 0_level_0,Tags Array,Objects Array
Number,Unnamed: 1_level_1,Unnamed: 2_level_1
2.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [4]:
# number array matrix
# all x features to be counted
x_feat_list = ["Pos Ratings", "All Ratings", "Word Count", "Safe", "Euclid", "Keter", "Thaumiel", "Anomalous", "Neutral", 
             "D-Class", "Containment Breach", "Addendum", "Task Force", "O5 Council", "Agent", "[EXPUNGED]", "[REDACTED]", "Blacked Out"]

# normalize the x features in new dataframe
df_norm = pd.DataFrame()
for feat in x_feat_list:
    df_norm[feat] = df[feat] / df[feat].std()

    
# extract data into numpy format
num_array = df_norm.values

# add numpy array to df_array
df_array["Num Array"] = list(num_array)
df_array.head()

Unnamed: 0_level_0,Tags Array,Objects Array,Num Array
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6.989315534162876, 7.657336364724377, 0.47474..."
3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[3.143665938946621, 3.665524569877127, 0.99103..."
4.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[4.402658365951942, 4.651328526169839, 0.80435..."
5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2.7087412823447825, 4.4014063964054895, 0.206..."
6.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2.6171781967443954, 4.207022517699884, 0.2719..."


In [5]:
def get_similarities(scp, df_array):
    ''' # gets cosin similarities between given scp and all other scps, returns scps with closest similarities
    Args:
        scp (int): the number of the scp you want similar ones to
        df_array (dataframe): a dataframe with arrays representing the tags, subobjects, and other values from the original 
                                dataframe containing all scps
    Returns:
        similarities (dataframe): the similarity scores for each scp to the given scp
    '''
    # get vectors for the given scp
    tag_vec = df_array.loc[scp]["Tags Array"]
    obj_vec = df_array.loc[scp]["Objects Array"]
    num_vec = df_array.loc[scp]["Num Array"]
   
    # get similarity to the given scp for each scp in the dataframe, store in dataframe with scp number
    # create dataframe with a column as the scp number
    similarities = pd.DataFrame(columns = ["Number", "Similarity"])
    
    # for each scp, calculate the similarity score and put into Series
    for idx, row in df_array.iterrows():
       # get vectors for current scp
        tag2_vec = df_array.loc[idx]["Tags Array"]
        obj2_vec = df_array.loc[idx]["Objects Array"]
        num2_vec = df_array.loc[idx]["Num Array"]
        
        # calculate similarities for given scp and each scp
        tag_sim = cosine_similarity(tag_vec.reshape(1, -1), tag2_vec.reshape(1, -1))
        obj_sim = cosine_similarity(obj_vec.reshape(1, -1), obj2_vec.reshape(1, -1))
        num_sim = cosine_similarity(num_vec.reshape(1, -1), num2_vec.reshape(1, -1))
        
        # calculate overall similarity by adding
        # weight the factors as desired- objects are worth less, tags are worth a lot
        overall_sim = (1*tag_sim) + (0.5*obj_sim) + (0.6*num_sim)
        
        # add similarites together and put into dataframe
        similarities = similarities.append({"Number" : idx, "Similarity": overall_sim}, ignore_index=True)
    
    # sort the similarites descending
    similarities.sort_values(by=['Similarity'], ascending = [False], inplace=True)
    
    return(similarities)

In [23]:
def recommend_scp(scp, df_array, df_scp, num_recs = 5, open_first = True):
    '''recommends a certain number of scps based on a given scp, displays their info in table and opens first recommendation
    Args:
        scp (int): number of scp
        df_array (dataframe): dataframe with arrays representing the tags, subobjects, and other values from the original 
                                dataframe containing all scps
        df_scp (dataframe): the original dataframe containing all scps
        num_recs (int): the number of recommendations to return
        open_first (boolean): whether or not to automatically open a tab for the first recommendation
    Returns:
        df_recs (dataframe): a dataframe with info, including links and similarity score, for the top similar scps
    
    '''
    # possibly put part in for what to do if scp not in the array
    
    # get the similarities
    similarities = get_similarities(scp, df_array)
    
    # get top (num_recs) in the similarities
    recs = list(similarities.iloc[0:num_recs + 1]["Number"])
    
    # create links to the top (rec) similar ones
    links = []
    for i in recs:
        # make sure scp number is in proper format for link
        # turn float into int and string
        i = str(int(i))
        # if the number is less than 3 numbers long, must add zeros in front
        while len(i) < 3:
            i = '0' + i   
            
        # create link  
        link = f'https://scp-wiki.wikidot.com/scp-{i}'
        links.append(link)
        
    # create new dataframe with the top recommended scps
    df_recs = pd.DataFrame()
    for i in recs:
        df_recs = df_recs.append(df_scp.loc[df["Number"] == i])
    
    # add links as a row in the dataframe
    df_recs["Link"] = links
    
    # add the similarity score to the dataframe, set similarity score of original to say "original"
    sim_score = list(similarities.iloc[0:num_recs + 1]["Similarity"])
    sim_score[0] = "Original"
    df_recs["Similarity Score"] = sim_score
    
    
    # open the top recommendation if open_first = True
    if open_first == True:
        webbrowser.open_new_tab(links[1])
        
    # make Number the index, add link and similarity score to the front of the dataframe
    df_recs = df_recs.set_index("Number")
    cols_to_move = ["Similarity Score", "Link"]
    df_recs = df_recs[cols_to_move + [x for x in df_recs.columns if x not in cols_to_move]]
    
    # return the top recommendations
    return(df_recs)

In [24]:
recs = recommend_scp(1509, df_array, df, 10)
recs

Unnamed: 0_level_0,Similarity Score,Link,Tags,Sub-Objects,Pos Ratings,All Ratings,Word Count,Safe,Euclid,Keter,...,Neutral,D-Class,Containment Breach,Addendum,Task Force,O5 Council,Agent,[EXPUNGED],[REDACTED],Blacked Out
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1509.0,Original,https://scp-wiki.wikidot.com/scp-1509,ectoentropic featured humanoid insect safe scp...,1 2,197.0,233.0,1226.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3118.0,[[2.127712642558148]],https://scp-wiki.wikidot.com/scp-3118,ectoentropic predictive safe scp weapon,1,174.0,194.0,729.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
2428.0,[[2.081586059537653]],https://scp-wiki.wikidot.com/scp-2428,cognitohazard document dr-wondertainment hallu...,1 2,296.0,312.0,2025.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3876.0,[[2.077127900289232]],https://scp-wiki.wikidot.com/scp-3876,artifact ectoentropic safe scp,1 2,15.0,43.0,1852.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
680.0,[[2.047065401108113]],https://scp-wiki.wikidot.com/scp-680,autonomous safe scp skeletal thermal transmission,1 2,60.0,152.0,836.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2097.0,[[2.0313149257998897]],https://scp-wiki.wikidot.com/scp-2097,butterfly co-authored document insect safe sap...,1 2,50.0,90.0,1178.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2473.0,[[2.0183889130491335]],https://scp-wiki.wikidot.com/scp-2473,electronic extraterrestrial mechanical safe sc...,1 2,85.0,105.0,3089.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2382.0,[[2.006376646178845]],https://scp-wiki.wikidot.com/scp-2382,artistic extradimensional humanoid mind-affect...,1 2 3,19.0,97.0,2271.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1990.0,[[1.9997167639544096]],https://scp-wiki.wikidot.com/scp-1990,ectoentropic feline safe scp sleep telepathic toy,1 2,169.0,301.0,851.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3239.0,[[1.9959889464749443]],https://scp-wiki.wikidot.com/scp-3239,auditory neurological safe scp,1 2,21.0,67.0,1995.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


Two very good similar scps for project example:
SCP-1509 was put in to find a similar one
Returned SCP-3118.
1509 is a knife that, when someone is cut with, ants come out of the wound.
3118 is gun that, when fired at someone's head, a lunch that the person once ate spontaneously appears. 