In [1]:
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter, defaultdict
import pickle
import re
from tqdm.notebook import tqdm, trange

In [2]:
from dataloader import *

In [166]:
import unidecode

# Words that mark a tag as a free-form note rather than a character name.
# The comparison below is case-sensitive and works on whole space-separated words.
blacklist = {
    'relationship', 'mention', 'mentions', 'mentioned',
    'appeared', 'appear', 'appears', 'all',
    'everyone', 'i', 'lol', 'how', 'you',
}


def strip_a_character(char):
    """Normalise one raw character tag into a canonical name.

    Steps: keep the text before the first ' | ', apply the three clean-up
    regexes, trim surrounding spaces, then validate.

    Parameters
    ----------
    char : str
        Raw character tag.

    Returns
    -------
    str
        The cleaned name passed through ``unidecode.unidecode``, or the
        placeholder ``'unknown'`` when the cleaned name is shorter than 2 or
        longer than 30 characters, starts with '[', '?' or '(', or contains a
        blacklisted word.
    """
    # "Alias A | Alias B" -> keep only the first alias.
    name = char.split(' | ')[0]

    # NOTE(review): the two ' - character' patterns match the text *before*
    # the suffix as well, so such a tag is blanked (and later rejected), not
    # just stripped of its suffix -- confirm this is intended.
    cleanup_patterns = (
        r'(.*) - character',
        r'(.*) - Character',
        r'\s?\([^)]*\)',  # parenthesised disambiguation, e.g. " (Marvel)"
    )
    for pattern in cleanup_patterns:
        name = re.sub(pattern, '', name)

    name = name.strip(' ')

    # Length limits first, so indexing name[0] below is always safe.
    if len(name) < 2 or len(name) > 30:
        return 'unknown'
    if name[0] in {'[', '?', '('}:
        return 'unknown'
    if blacklist.intersection(name.split(' ')):
        return 'unknown'

    return unidecode.unidecode(name)

# Normalise every raw character tag, one list per work.
new_characters = []
for c_list in characters:
    new_characters.append([strip_a_character(char) for char in c_list])

# Occurrence count of each normalised name over all works.
characters_vocab = Counter(name for c_list in new_characters for name in c_list)

# Keep names seen more than 20 times; 'unknown' is the placeholder that
# strip_a_character returns for rejected tags, so it is never a character.
valid_characters = []
for char, count in characters_vocab.items():
    if char != 'unknown' and count > 20:
        valid_characters.append(char)

# Alphabetical vocabulary and the name -> row index lookup built from it.
characters_list = sorted(valid_characters)
char_to_ix = dict(zip(characters_list, range(len(characters_list))))

In [167]:
# Count every fandom tag over all works; the counter's keys are exactly the
# distinct fandom tags, so the sorted vocabulary is derived from them.
fandom_freq = Counter(chain.from_iterable(fandoms))
all_fandoms = sorted(fandom_freq)
fandom_to_ix = {name: position for position, name in enumerate(all_fandoms)}

# Ten most frequent fandoms, as (name, count) pairs.
fandom_freq.most_common(10)

[('僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia', 4444),
 ('Minecraft (Video Game)', 4045),
 ('Video Blogging RPF', 4037),
 ('Marvel Cinematic Universe', 3238),
 ('Haikyuu!!', 3199),
 ('Harry Potter - J. K. Rowling', 3189),
 ('방탄소년단 | Bangtan Boys | BTS', 2892),
 ('原神 | Genshin Impact (Video Game)', 2231),
 ('Star Wars - All Media Types', 1976),
 ('Supernatural', 1593)]

In [168]:
# Size of the fandom vocabulary (number of distinct fandom tags).
len(all_fandoms)

7446

In [170]:
# Character x fandom co-occurrence counts: rows follow char_to_ix, columns
# follow fandom_to_ix.
co_mat = np.zeros((len(valid_characters), len(all_fandoms)))

for work_ix, work_chars in enumerate(new_characters):
    # Rows of the vocabulary characters tagged on this work (others are skipped).
    rows = [char_to_ix[name] for name in work_chars if name in char_to_ix]
    if not rows:
        continue
    cols = [fandom_to_ix[tag] for tag in fandoms[work_ix]]
    for row in rows:
        for col in cols:
            co_mat[row, col] += 1

# Divide each row by that character's total occurrence count, so an entry is
# the share of the character's appearances that carry the fandom tag.
occurrences = np.array([characters_vocab[char] for char in characters_list])
co_mat = co_mat / occurrences[:, np.newaxis]

In [171]:
# Spot check: the ten fandoms with the highest share for one character.
poi = 'Quackity'
poi_row = co_mat[char_to_ix[poi]]

# argsort on the negated row ranks fandoms from highest to lowest share.
for f in np.argsort(-poi_row)[:10]:
    print(all_fandoms[f], poi_row[f])

Minecraft (Video Game) 0.6666666666666666
Dream SMP - Fandom 0.4166666666666667
Video Blogging RPF 0.2916666666666667
DreamSMP 0.16666666666666666
Dream SMP (Video Blogging RPF) 0.16666666666666666
Minecraft - Fandom 0.08333333333333333
Technoblade - Fandom 0.041666666666666664
Origins SMP - Fandom 0.041666666666666664
Dreamwastaken 0.041666666666666664
dream - Fandom 0.041666666666666664


In [172]:
# For each character, keep up to ten top-ranked fandoms whose share of the
# character's appearances is above 0.2, highest share first.
char_to_fandoms = {}
for char, row_ix in char_to_ix.items():
    shares = co_mat[row_ix]
    ranked = np.argsort(-shares)[:10]
    char_to_fandoms[char] = [all_fandoms[f_ix] for f_ix in ranked if shares[f_ix] > 0.2]

In [173]:
# Spot check: fandoms kept for a single character.
char_to_fandoms['Hisoka']

['Hunter X Hunter']

In [174]:
from itertools import combinations
def strip_a_cp(rls, char_to_ix):
    """Turn one raw relationship tag into a list of character pairs.

    The tag is split on the first separator (in priority order) that occurs
    in it, each part is normalised with ``strip_a_character``, and only names
    present in ``char_to_ix`` are kept. Every unordered pair of the remaining
    *distinct* names is returned as an alphabetically sorted tuple.

    Parameters
    ----------
    rls : str or list
        Raw relationship tag, e.g. ``'A/B'`` or ``'A & B'``. A list of
        already-split names is accepted as well. Any other type is treated as
        an unparseable tag.
    char_to_ix : Mapping
        Vocabulary of accepted character names (only membership is used).

    Returns
    -------
    list of tuple
        Sorted ``(a, b)`` pairs, or ``[('unknown', 'unknown')]`` when fewer
        than two distinct known characters can be extracted.
    """
    unknown = [('unknown', 'unknown')]

    # NOTE(review): '  &; ' looks like a mangled HTML-escaped ampersand --
    # confirm against the raw tags before changing it.
    splitters = ['/', ' & ', ' x ', ' X ', ' and ', '  &; ', ' - ']

    if isinstance(rls, str):
        # Only the first matching separator is used to split the tag.
        parts = None
        for s in splitters:
            if s in rls:
                parts = rls.split(s)
                break
    elif isinstance(rls, list):
        parts = rls
    else:
        # e.g. None / NaN coming from missing data.
        parts = None

    if parts is None or len(parts) < 2:
        return unknown

    # Keep known names once each, in order of first appearance. Without the
    # de-duplication a tag such as 'A/A' would yield the self-pair ('A', 'A')
    # and 'A/B/A' would count ('A', 'B') twice.
    seen = set()
    output = []
    for part in parts:
        name = strip_a_character(part)
        if name in char_to_ix and name not in seen:
            seen.add(name)
            output.append(name)

    if len(output) < 2:
        return unknown

    return [tuple(sorted(pair)) for pair in combinations(output, 2)]

# For every work, flatten the pairs extracted from each of its relationship tags.
new_cps = []
for cp_list in cps:
    work_pairs = []
    for rls in cp_list:
        work_pairs.extend(strip_a_cp(rls, char_to_ix))
    new_cps.append(work_pairs)

# Corpus-wide count of each character pair.
cp_freq = Counter(pair for work_pairs in new_cps for pair in work_pairs)

In [175]:
# char_cp_freq[a][b]: how often a and b are paired; total_freq[a]: number of
# pairs a takes part in. Both directions are recorded so lookups are symmetric.
char_cp_freq = defaultdict(lambda: defaultdict(int))
total_freq = defaultdict(int)
for cp_list in new_cps:
    for a, b in cp_list:
        # ('unknown', 'unknown') is the placeholder for unparseable tags.
        if a != 'unknown' and b != 'unknown':
            char_cp_freq[a][b] += 1
            char_cp_freq[b][a] += 1
            total_freq[a] += 1
            total_freq[b] += 1

# Convert counts to shares of each character's total pairings.
# The loop variable is deliberately NOT called `cp_freq`: that name already
# holds the pair Counter built earlier and must not be overwritten.
for char, partner_counts in char_cp_freq.items():
    char_total = total_freq[char]
    for other_char in partner_counts:
        partner_counts[other_char] /= char_total

In [176]:
# For each character, list the partners that account for more than 5% of its
# pairings, most frequent first.
# The loop variable is deliberately NOT called `cp_freq`, so the pair Counter
# of that name is not overwritten. Building the list before storing it also
# avoids a KeyError when a character has no partners at all.
related_characters = {}

for char, partner_shares in char_cp_freq.items():
    close_partners = [other for other, share in partner_shares.items() if share > 0.05]
    close_partners.sort(key=lambda other: -partner_shares[other])
    related_characters[char] = close_partners

In [177]:
# Spot check: partners of one character, ordered by descending pairing share.
related_characters['Hermione Granger']

['Draco Malfoy', 'Ron Weasley', 'Harry Potter', 'Severus Snape']

In [178]:
# Number of characters that have an entry in related_characters.
len(related_characters)

2657

In [179]:
# One record per vocabulary character. List-valued fields are flattened to
# comma-separated strings; characters without any pairing get an empty string.
characters_data = [
    {
        'name': char,
        'popularity': characters_vocab[char],
        'fandoms': ','.join(char_to_fandoms[char]),
        'related_char': ','.join(related_characters.get(char, [])),
    }
    for char in characters_list
]

In [185]:
# Tabulate the per-character records: one row per character, one column per record key.
characters_df = pd.DataFrame(characters_data)

In [186]:
# Preview the first rows of the character table.
characters_df.head(20)

Unnamed: 0,name,popularity,fandoms,related_char
0,501st Legion,22,Star Wars: The Clone Wars (2008) - All Media T...,"Ahsoka Tano,CT-7567,Anakin Skywalker,Obi-Wan K..."
1,5up,25,"Video Blogging RPF,Minecraft (Video Game)","Floris,Toby Smith,Sam,Clay"
2,707,41,Mystic Messenger (Video Game),"Main Character,Reader,Choi Saeran"
3,A-Qing,45,"陈情令 | The Untamed (TV),魔道祖师 - 墨香铜臭 | Módào Zǔs...","Xiao Xingchen,Song Lan,Xue Yang,Ouyang Zizhen"
4,AJ Wilson,76,"The Falcon and the Winter Soldier (TV),Marvel ...","Cass Wilson,Sam Wilson,Sarah Wilson,James ""Buc..."
5,Aang,266,Avatar: The Last Airbender,"Katara,Zuko,Sokka,Toph Beifong"
6,Aaron Burr,21,Hamilton - Miranda,"Alexander Hamilton,John Laurens,Hercules Mulli..."
7,Aaron Hotchner,280,Criminal Minds (US TV),"Spencer Reid,Reader,Emily Prentiss,Derek Morga..."
8,Aaron Minyard,67,All For The Game - Nora Sakavic,"Andrew Minyard,Kevin Day,Nicky Hemmick,Neil Jo..."
9,Aayla Secura,31,Star Wars: The Clone Wars (2008) - All Media T...,"CC-5052,Anakin Skywalker,Quinlan Vos"


In [210]:
# Load the fandom table. The context manager closes the file handle, which the
# bare open() call left dangling.
# SECURITY: pickle.load can execute arbitrary code; only load files produced
# by this project.
with open('data/fandom_df.p', 'rb') as fandom_file:
    fandom_df = pickle.load(fandom_file)

In [207]:
# Move index level 0 back into a regular column. Rebinding the name instead of
# using inplace=True keeps the previous frame untouched.
fandom_df = fandom_df.reset_index(level=0)

In [199]:
# Remove the 'percentage' column. errors='ignore' makes the cell safe to
# re-run, or to run on a frame that was saved after the column was dropped.
fandom_df = fandom_df.drop(columns=['percentage'], errors='ignore')

In [201]:
# Index the fandom table by its parsed name.
# NOTE(review): other cells in this notebook filter on
# fandom_df['ao3_parsed_name'] as a column, which is unavailable while the
# name is the index -- confirm the intended cell execution order.
fandom_df = fandom_df.set_index('ao3_parsed_name')

In [211]:
# Display the fandom table (pandas truncates the rendering for long frames).
fandom_df

Unnamed: 0,ao3_parsed_name,ao3_name,count,imdb_name,media_type,genres,votes,start_year
0,My Hero Academia,僕のヒーローアカデミア | Boku no Hero Academia | My Hero ...,4468,My Hero Academia,tv,"[Action, Adventure, Animation]",40037,2016
1,Minecraft,Minecraft (Video Game),4082,Minecraft,video_game,"[Action, Adventure, Family]",8031,2009
2,Video Blogging RPF,Video Blogging RPF,4066,,,,,
3,Marvel Cinematic Universe,Marvel Cinematic Universe,3260,,,,,
4,Haikyuu!!,Haikyuu!!,3224,Haikyuu!!,tv,"[Animation, Comedy, Drama]",13258,2014
...,...,...,...,...,...,...,...,...
7441,A Mother's Nightmare,A Mother's Nightmare (2012),1,A Mother's Nightmare,tv,"[Crime, Mystery, Thriller]",1999,2012
7442,Sword Art Online: Integral Factor,Sword Art Online: Integral Factor (Video Game),1,,,,,
7443,Outriders,Outriders - Fandom,1,The Outriders,movie,[Western],551,1950
7444,General Hospital,General Hospital,1,General Hospital,tv,"[Crime, Drama, Romance]",4360,1963


In [245]:
# AO3 names of the rows at positions 0, 1 and 2, as a plain Python list.
first_positions = [0, 1, 2]
fandom_df['ao3_name'].iloc[first_positions].tolist()

['僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia',
 'Minecraft (Video Game)',
 'Video Blogging RPF']

In [225]:
# One sub-frame per requested parsed name; a name with no matching row gives
# an empty frame.
entries = []
for f in ['Haikyuu!!', 'SMPEarth']:
    name_matches = fandom_df['ao3_parsed_name'] == f
    entries.append(fandom_df.loc[name_matches])

In [259]:
# Check: looking up a parsed name with no matching row yields an empty list
# instead of raising.
fandom_df.loc[fandom_df['ao3_parsed_name'] == 'xdff']['ao3_name'].values.tolist()

[]

In [263]:
# Scratch check: adding a scalar to an array applies it element-wise.
# It does not use any variable from the rest of the notebook.
np.array([0, 0, 0]) + 0

array([0, 0, 0])

In [209]:
# Persist both tables. The context managers guarantee each file is flushed and
# closed, which the bare open() calls did not.
# NOTE(review): data/fandom_df.p is also the file fandom_df is loaded from, so
# this overwrites the notebook's own input -- confirm that is intended.
with open('data/character_df.p', 'wb') as character_file:
    pickle.dump(characters_df, character_file)
with open('data/fandom_df.p', 'wb') as fandom_file:
    pickle.dump(fandom_df, fandom_file)

In [213]:
# Reload the saved character table to verify the round trip. The context
# manager closes the file handle that the bare open() call left dangling.
with open('data/character_df.p', 'rb') as character_file:
    c_df = pickle.load(character_file)

In [214]:
# Display the reloaded character table.
c_df

Unnamed: 0,name,popularity,fandoms,related_char
0,501st Legion,22,Star Wars: The Clone Wars (2008) - All Media T...,"Ahsoka Tano,CT-7567,Anakin Skywalker,Obi-Wan K..."
1,5up,25,"Video Blogging RPF,Minecraft (Video Game)","Floris,Toby Smith,Sam,Clay"
2,707,41,Mystic Messenger (Video Game),"Main Character,Reader,Choi Saeran"
3,A-Qing,45,"陈情令 | The Untamed (TV),魔道祖师 - 墨香铜臭 | Módào Zǔs...","Xiao Xingchen,Song Lan,Xue Yang,Ouyang Zizhen"
4,AJ Wilson,76,"The Falcon and the Winter Soldier (TV),Marvel ...","Cass Wilson,Sam Wilson,Sarah Wilson,James ""Buc..."
...,...,...,...,...
2701,Zim,31,Invader Zim,"Dib,Original Character,Reader"
2702,Ziva David,22,NCIS,"Anthony DiNozzo,Jethro Gibbs,Timothy McGee,Ell..."
2703,Zolf Smith,60,Rusty Quill Gaming (Podcast),"Oscar Wilde,Sasha Racket,Hamid Saleh Haroun al..."
2704,Zoya Nazyalensky,54,"The Grisha Trilogy - Leigh Bardugo,Nikolai Ser...","Nikolai Lantsov,Genya Safin,Alina Starkov"
