In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import numpy as np

In [36]:
df = pd.read_csv("all_scps_final.csv")
df.head()

Unnamed: 0,Number,Tags,Sub-Objects,Pos Ratings,All Ratings,Word Count,Safe,Euclid,Keter,Thaumiel,...,Neutral,D-Class,Containment Breach,Addendum,Task Force,O5 Council,Agent,[EXPUNGED],[REDACTED],Blacked Out
0,2.0,alive euclid featured scp structure transfigur...,,1832.0,2206.0,857.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,3.0,alive biological computer director-moose eucli...,1 2,824.0,1056.0,1789.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,4.0,euclid mind-affecting portal scp spacetime str...,1 2 7 12 13 14,1154.0,1340.0,1452.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
3,5.0,adaptive safe scp,,710.0,1268.0,373.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.0,liquid location medical rewrite safe scp self-...,,686.0,1212.0,491.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [112]:
# create own tokenizer to keep single characters
import re
def my_tokenizer(text):
    # create a space between special characters 
    text=re.sub("(\\W)"," \\1 ",text)
    # split based on whitespace
    return re.split("\\s+",text)

# vectorize the Tags and Sub-Objects columns, save them as arrays
vectorizer = CountVectorizer(tokenizer = my_tokenizer)
objects_array = vectorizer.fit_transform(df["Sub-Objects"])
objects_array = objects_array.toarray()

vectorizer = CountVectorizer()
tags_array = vectorizer.fit_transform(df["Tags"])
tags_array = tags_array.toarray()

# make the arrays into lists and add them into their own columns in a new dataframe
df_array = pd.DataFrame()
df_array["Number"] = df["Number"]
df_array = df_array.set_index("Number")
df_array["Tags Array"] = list(tags_array)
df_array["Objects Array"] = list(objects_array)
df_array.head()

Unnamed: 0_level_0,Tags Array,Objects Array
Number,Unnamed: 1_level_1,Unnamed: 2_level_1
2.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [126]:
# number array matrix
# all x features to be counted
x_feat_list = ["Pos Ratings", "All Ratings", "Word Count", "Safe", "Euclid", "Keter", "Thaumiel", "Anomalous", "Neutral", 
             "D-Class", "Containment Breach", "Addendum", "Task Force", "O5 Council", "Agent", "[EXPUNGED]", "[REDACTED]", "Blacked Out"]

# normalize the x features in new dataframe
df_norm = pd.DataFrame()
for feat in x_feat_list:
    df_norm[feat] = df[feat] / df[feat].std()

    
# extract data into numpy format
num_array = df_norm.values

# add numpy array to df_array
df_array["Num Array"] = list(num_array)
df_array.head()

Unnamed: 0_level_0,Tags Array,Objects Array,Num Array
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6.989315534162876, 7.657336364724377, 0.47474..."
3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[3.143665938946621, 3.665524569877127, 0.99103..."
4.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[4.402658365951942, 4.651328526169839, 0.80435..."
5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2.7087412823447825, 4.4014063964054895, 0.206..."
6.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2.6171781967443954, 4.207022517699884, 0.2719..."


In [253]:
def get_similarities(scp, df_array):
    ''' # gets cosin similarities between given scp and all other scps, returns scps with closest similarities
    Args:
        scp (int): the number of the scp you want similar ones to
        df_array (dataframe): a dataframe with arrays representing the tags, subobjects, and other values from the original 
                                dataframe containing all scps
    Returns:
        similarities (dataframe): the similarity scores for each scp to the given scp
    '''
    # get vectors for the given scp
    tag_vec = df_array.loc[scp]["Tags Array"]
    obj_vec = df_array.loc[scp]["Objects Array"]
    num_vec = df_array.loc[scp]["Num Array"]
   
    # get similarity to the given scp for each scp in the dataframe, store in dataframe with scp number
    # create dataframe with a column as the scp number
    similarities = pd.DataFrame(columns = ["Number", "Similarity"])
    
    # for each scp, calculate the similarity score and put into Series
    for idx, row in df_array.iterrows():
       # get vectors for current scp
        tag2_vec = df_array.loc[idx]["Tags Array"]
        obj2_vec = df_array.loc[idx]["Objects Array"]
        num2_vec = df_array.loc[idx]["Num Array"]
        
        # calculate similarities for given scp and each scp
        tag_sim = cosine_similarity(tag_vec.reshape(1, -1), tag2_vec.reshape(1, -1))
        obj_sim = cosine_similarity(obj_vec.reshape(1, -1), obj2_vec.reshape(1, -1))
        num_sim = cosine_similarity(num_vec.reshape(1, -1), num2_vec.reshape(1, -1))
        
        # calculate overall similarity by adding
        # we could weight these if we wanted to
        overall_sim = tag_sim + obj_sim + num_sim
        
        # add similarites together and put into dataframe
        similarities = similarities.append({"Number" : idx, "Similarity": overall_sim}, ignore_index=True)
    
    # sort the similarites descending
    similarities.sort_values(by=['Similarity'], ascending = [False], inplace=True)
    
    return(similarities)

In [255]:
def recommend_scp(scp, df_array, num_recs = 5):
    '''recommends a certain number of scps based on a given scp
    Args:
        scp (int): number of scp
        df_array (dataframe): dataframe with arrays representing the tags, subobjects, and other values from the original 
                                dataframe containing all scps
        num_recs (int): the number of recommendations to return
    Returns:
        links (list): a list with links for all recommended scps
    
    '''
    # possibly put part in for what to do if scp not in the array
    
    # get the similarities
    similarities = get_similarities_better(scp, df_array)
    
    # get top (num_recs) in the similarities
    recs = list(similarities.iloc[1:num_recs + 1]["Number"])
    
    # create links to the top (rec) similar ones
    links = []
    for i in recs:
        # make sure scp number is in proper format for link
        # turn float into int and string
        i = str(int(i))
        # if the number is less than 3 numbers long, must add zeros in front
        while len(i) < 3:
            i = '0' + i   
            
        # create link  
        link = f'https://scp-wiki.wikidot.com/scp-{i}'
        links.append(link)
        
    
    # return the top links
    return(links)

In [256]:
recs = recommend_scp(680, df_array, 10)
print(recs)

['https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680', 'https://scp-wiki.wikidot.com/scp-680']


In [251]:
x = list(recs.iloc[1:20 + 1]["Number"])
print(x)
links = []
for i in x:
        # make sure scp number is in proper format for link
        # turn float into int and string
        i = str(int(i))
        # if the number is less than 3 numbers long, must add zeros in front
        while len(i) < 3:
            i = '0' + i   
            
        # create link  
        link = f'https://scp-wiki.wikidot.com/scp-{i}'
        links.append(link)
        
print(links)

[6899.0, 1981.0, 701.0, 87.0, 5287.0, 1171.0, 513.0, 22.0, 962.0, 2030.0, 2066.0, 186.0, 1782.0, 2000.0, 3799.0, 7700.0, 4748.0, 1360.0, 1230.0, 1719.0]
['https://scp-wiki.wikidot.com/scp-6899', 'https://scp-wiki.wikidot.com/scp-1981', 'https://scp-wiki.wikidot.com/scp-701', 'https://scp-wiki.wikidot.com/scp-087', 'https://scp-wiki.wikidot.com/scp-5287', 'https://scp-wiki.wikidot.com/scp-1171', 'https://scp-wiki.wikidot.com/scp-513', 'https://scp-wiki.wikidot.com/scp-022', 'https://scp-wiki.wikidot.com/scp-962', 'https://scp-wiki.wikidot.com/scp-2030', 'https://scp-wiki.wikidot.com/scp-2066', 'https://scp-wiki.wikidot.com/scp-186', 'https://scp-wiki.wikidot.com/scp-1782', 'https://scp-wiki.wikidot.com/scp-2000', 'https://scp-wiki.wikidot.com/scp-3799', 'https://scp-wiki.wikidot.com/scp-7700', 'https://scp-wiki.wikidot.com/scp-4748', 'https://scp-wiki.wikidot.com/scp-1360', 'https://scp-wiki.wikidot.com/scp-1230', 'https://scp-wiki.wikidot.com/scp-1719']
