In [20]:
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter, defaultdict
import pickle
import re
from tqdm.notebook import tqdm, trange

In [101]:
from dataloader import *

In [3]:
imdb = pd.read_csv('data/imdb.tsv',  sep='\t', header=0)
rating = pd.read_csv('data/ratings.tsv', sep='\t', header=0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
imdb = imdb.merge(rating, how='left', on=['tconst'])
imdb = imdb[imdb['numVotes'].notna()]
imdb = imdb[~imdb['titleType'].isin(['tvEpisode', 'tvSpecial', 'video', 'short'])]
imdb = imdb.reset_index(drop=True)

In [5]:
imdb

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N,4.5,8.0
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography",6.1,632.0
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama,6.0,5.0
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,4.5,14.0
4,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama,3.8,12.0
...,...,...,...,...,...,...,...,...,...,...,...
412029,tt9916362,movie,Coven,Akelarre,0,2020,\N,92,"Adventure,Drama,History",6.3,3218.0
412030,tt9916380,tvSeries,Meie aasta Aafrikas,Meie aasta Aafrikas,0,2019,\N,43,"Adventure,Comedy,Family",9.0,104.0
412031,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019,\N,\N,"Adventure,History,War",3.8,12.0
412032,tt9916460,tvMovie,Pink Taxi,Pink Taxi,0,2019,\N,\N,Comedy,9.3,15.0


In [7]:
titles = imdb['primaryTitle'].tolist()

In [97]:
np.unique(imdb['titleType'])

array(['movie', 'tvMiniSeries', 'tvMovie', 'tvSeries', 'tvShort',
       'videoGame'], dtype=object)

In [98]:
def search_for_closest(query):
    ids = []
    for idx, t in enumerate(titles):
        if query in str(t):
            ids.append(idx)
    ranked = imdb.iloc[ids].sort_values(by='numVotes', ascending=False)
    if len(ranked) > 0:
        return ranked.iloc[0].to_dict()
    return {}

In [18]:
fandom_count = Counter(chain.from_iterable(fandoms))
fandom_total = len(fandom_count)

In [19]:
def parse_fandom(fandom):
    f = fandom.split(' | ')[-1]
    match_obj = re.match(r'([^\(\)\-]+)($|\(.+\)|- All Media Types)?', f)
    if not match_obj:
        return f
    else:
        return match_obj[1].strip(' '), match_obj[2].replace(')', '').replace('(', '') if match_obj[2] else ''

In [104]:
for fandom, count in tqdm(sorted(fandom_count.items(), key=lambda x:-x[1])):
    parsed = parse_fandom(fandom)
    res = search_for_closest(parsed[0])
    if res:
        res['numVotes'] = int(res['numVotes'])
        res['genres'] = res['genres'].split(',')
        if 'tv' in res['titleType']:
            res['titleType'] = 'tv'
        elif 'video' in res['titleType']:
            res['titleType'] = 'video_game'
    
    entry = {
        'ao3_name': fandom,
        'ao3_parsed_name': parsed[0],
        'count': count,
        'percentage': round(count / fandom_total, 4),
        'imdb_name': res.get('primaryTitle', ''),
        'media_type': res.get('titleType', ''),
        'genres': res.get('genres', ''),
        'votes': res.get('numVotes', ''),
        'start_year': res.get('startYear', ''),
    }
    data.append(entry)

In [22]:
fandom_df = pd.DataFrame(data)

In [115]:
fandom_df.head(10)

Unnamed: 0,ao3_name,count,percentage,imdb_name,media_type,genres,votes,start_year,parsed_ao3_name
0,僕のヒーローアカデミア | Boku no Hero Academia | My Hero ...,3753,0.535,My Hero Academia,tv,"[Action, Adventure, Animation]",40037.0,2016.0,My Hero Academia
1,Video Blogging RPF,3724,0.5309,,,,,,Video Blogging RPF
2,Minecraft (Video Game),3705,0.5282,Minecraft,video_game,"[Action, Adventure, Family]",8031.0,2009.0,Minecraft
3,Marvel Cinematic Universe,2737,0.3902,,,,,,Marvel Cinematic Universe
4,Harry Potter - J. K. Rowling,2704,0.3855,Harry Potter and the Deathly Hallows: Part 2,movie,"[Adventure, Drama, Fantasy]",780949.0,2011.0,Harry Potter
5,방탄소년단 | Bangtan Boys | BTS,2612,0.3723,BTS: Burn the Stage,tv,[Documentary],1411.0,2018.0,BTS
6,Haikyuu!!,2586,0.3686,Haikyuu!!,tv,"[Animation, Comedy, Drama]",13258.0,2014.0,Haikyuu!!
7,原神 | Genshin Impact (Video Game),2066,0.2945,Genshin Impact,video_game,"[Action, Adventure, Fantasy]",368.0,2020.0,Genshin Impact
8,Star Wars - All Media Types,1502,0.2141,Star Wars: Episode IV - A New Hope,movie,"[Action, Adventure, Fantasy]",1249514.0,1977.0,Star Wars
9,Shingeki no Kyojin | Attack on Titan,1358,0.1936,Attack on Titan,tv,"[Action, Adventure, Animation]",226886.0,2013.0,Attack on Titan


In [85]:
genres = list(zip(fandom_df['genres'].tolist(), fandom_df['media_type'].tolist()))
genres = [g1 + [g2] if g1 != '' else [] for g1, g2 in genres]

count = np.log2(np.array(fandom_df['count'].values)).reshape(-1, 1)
votes = np.log2(np.array([int(v) if v else 1e-6 for v in fandom_df['votes'].values])).reshape(-1, 1)

year = np.array([int(y) if y != '' else 2000 for y in fandom_df['start_year'].values]).reshape(-1, 1)

In [86]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
G = mlb.fit_transform(genres)

G_c = np.concatenate([G, count * 0.4, year * 0.3, votes * 0.3], axis=1)

In [87]:
mlb.classes_

array(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport',
       'Talk-Show', 'Thriller', 'War', 'Western', '\\N', 'movie', 'tv',
       'video_game'], dtype=object)

In [88]:
G_c[0]

array([  1.        ,   0.        ,   1.        ,   1.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   0.        ,
         4.74953143, 604.8       ,   4.58671388])

In [118]:
fandom_df.loc[fandom_df['ao3_name'] == 'The Terror (TV 2018)']

Unnamed: 0,ao3_name,count,percentage,imdb_name,media_type,genres,votes,start_year,parsed_ao3_name
149,The Terror (TV 2018),120,0.0171,The Terror,tv,"[Adventure, Drama, History]",39536,2018,The Terror


In [119]:
from sklearn.metrics.pairwise import cosine_similarity

cs = cosine_similarity(G_c, G_c[149].reshape(1, -1)).flatten()
fandom_df.iloc[np.argsort(-cs)[:10]]

Unnamed: 0,ao3_name,count,percentage,imdb_name,media_type,genres,votes,start_year,parsed_ao3_name
149,The Terror (TV 2018),120,0.0171,The Terror,tv,"[Adventure, Drama, History]",39536,2018,The Terror
524,The Terror - Dan Simmons,27,0.0038,The Terror,tv,"[Adventure, Drama, History]",39536,2018,The Terror
509,The Musketeers (2014),28,0.004,The Musketeers,tv,"[Adventure, Drama]",20943,2014,The Musketeers
519,Black Sails,27,0.0038,Black Sails,tv,"[Adventure, Drama]",95164,2014,Black Sails
132,Shadow and Bone (TV),138,0.0197,Shadow and Bone,tv,"[Action, Adventure, Drama]",38434,2021,Shadow and Bone
226,Star Trek: Deep Space Nine,80,0.0114,Star Trek: Deep Space Nine,tv,"[Action, Adventure, Drama]",54411,1993,Star Trek: Deep Space Nine
277,Fate: The Winx Saga (TV),63,0.009,Fate: The Winx Saga,tv,"[Action, Adventure, Drama]",27619,2021,Fate: The Winx Saga
290,約束のネバーランド | Yakusoku no Neverland | The Promis...,59,0.0084,The Promised Neverland,tv,"[Adventure, Animation, Drama]",25994,2019,The Promised Neverland
297,Legacies (TV 2018),59,0.0084,Legacies,tv,"[Adventure, Drama, Fantasy]",20931,2018,Legacies
320,The West Wing,53,0.0076,The West Wing,tv,[Drama],66154,1999,The West Wing
