In [1]:
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter, defaultdict
import pickle
import re
from tqdm.notebook import tqdm, trange

In [11]:
from dataloader import *

In [12]:
imdb = pd.read_csv('data/imdb.tsv',  sep='\t', header=0)
rating = pd.read_csv('data/ratings.tsv', sep='\t', header=0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [15]:
imdb = imdb.merge(rating, how='left', on=['tconst'])
imdb = imdb[imdb['numVotes'].notna()]
imdb = imdb[~imdb['titleType'].isin(['tvEpisode', 'tvSpecial', 'video', 'short'])]
imdb = imdb.reset_index(drop=True)

In [16]:
imdb

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N,4.5,8.0
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography",6.1,632.0
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama,6.0,5.0
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,4.5,14.0
4,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama,3.8,12.0
...,...,...,...,...,...,...,...,...,...,...,...
412029,tt9916362,movie,Coven,Akelarre,0,2020,\N,92,"Adventure,Drama,History",6.3,3218.0
412030,tt9916380,tvSeries,Meie aasta Aafrikas,Meie aasta Aafrikas,0,2019,\N,43,"Adventure,Comedy,Family",9.0,104.0
412031,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019,\N,\N,"Adventure,History,War",3.8,12.0
412032,tt9916460,tvMovie,Pink Taxi,Pink Taxi,0,2019,\N,\N,Comedy,9.3,15.0


In [16]:
imdb_titles = imdb['primaryTitle'].tolist()

In [18]:
np.unique(imdb['titleType'])

array(['movie', 'tvMiniSeries', 'tvMovie', 'tvSeries', 'tvShort',
       'videoGame'], dtype=object)

In [17]:
def search_for_closest(query):
    ids = []
    for idx, t in enumerate(imdb_titles):
        if query in str(t):
            ids.append(idx)
    ranked = imdb.iloc[ids].sort_values(by='numVotes', ascending=False)
    if len(ranked) > 0:
        return ranked.iloc[0].to_dict()
    return {}

In [18]:
fandom_count = Counter(chain.from_iterable(fandoms))
fandom_total = len(fandom_count)

In [19]:
fandom_total

7446

In [20]:
def parse_fandom(fandom):
    f = fandom.split(' | ')[-1]
    match_obj = re.match(r'([^\(\)\-]+)($|\(.+\)|- All Media Types)?', f)
    if not match_obj:
        return f
    else:
        return match_obj[1].strip(' '), match_obj[2].replace(')', '').replace('(', '') if match_obj[2] else ''

In [21]:
data = []
for fandom, count in tqdm(sorted(fandom_count.items(), key=lambda x:-x[1])):
    parsed = parse_fandom(fandom)
    res = search_for_closest(parsed[0])
    if res:
        res['numVotes'] = int(res['numVotes'])
        res['genres'] = res['genres'].split(',')
        if 'tv' in res['titleType']:
            res['titleType'] = 'tv'
        elif 'video' in res['titleType']:
            res['titleType'] = 'video_game'
    
    entry = {
        'ao3_name': fandom,
        'ao3_parsed_name': parsed[0],
        'count': count,
        'percentage': round(count / fandom_total, 4),
        'imdb_name': res.get('primaryTitle', ''),
        'media_type': res.get('titleType', ''),
        'genres': res.get('genres', ''),
        'votes': res.get('numVotes', ''),
        'start_year': res.get('startYear', ''),
    }
    data.append(entry)

  0%|          | 0/7446 [00:00<?, ?it/s]

In [24]:
fandom_df = pickle.load(open('data/fandom_df.p', 'rb'))

In [25]:
fandom_df

Unnamed: 0,ao3_name,ao3_parsed_name,count,percentage,imdb_name,media_type,genres,votes,start_year
0,僕のヒーローアカデミア | Boku no Hero Academia | My Hero ...,My Hero Academia,4468,0.6001,My Hero Academia,tv,"[Action, Adventure, Animation]",40037,2016
1,Minecraft (Video Game),Minecraft,4082,0.5482,Minecraft,video_game,"[Action, Adventure, Family]",8031,2009
2,Video Blogging RPF,Video Blogging RPF,4066,0.5461,,,,,
3,Marvel Cinematic Universe,Marvel Cinematic Universe,3260,0.4378,,,,,
4,Haikyuu!!,Haikyuu!!,3224,0.4330,Haikyuu!!,tv,"[Animation, Comedy, Drama]",13258,2014
...,...,...,...,...,...,...,...,...,...
7441,A Mother's Nightmare (2012),A Mother's Nightmare,1,0.0001,A Mother's Nightmare,tv,"[Crime, Mystery, Thriller]",1999,2012
7442,Sword Art Online: Integral Factor (Video Game),Sword Art Online: Integral Factor,1,0.0001,,,,,
7443,Outriders - Fandom,Outriders,1,0.0001,The Outriders,movie,[Western],551,1950
7444,General Hospital,General Hospital,1,0.0001,General Hospital,tv,"[Crime, Drama, Romance]",4360,1963


In [110]:
np.sum(fandom_df['imdb_name'] != '') / len(fandom_df)

0.5515713134568896

In [4]:
genres = list(zip(fandom_df['genres'].tolist(), fandom_df['media_type'].tolist()))
genres = [g1 + [g2] if g1 != '' else [] for g1, g2 in genres]

count = np.log2(np.array(fandom_df['count'].values)).reshape(-1, 1)
votes = np.log2(np.array([int(v) if v else 1e-6 for v in fandom_df['votes'].values])).reshape(-1, 1)

year = np.array([int(y) if y != '' else 2000 for y in fandom_df['start_year'].values]).reshape(-1, 1)

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
G = mlb.fit_transform(genres)

G_c = np.concatenate([G, count * 0.4, year * 0.3, votes * 0.3], axis=1)

In [6]:
mlb.classes_

array(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport',
       'Talk-Show', 'Thriller', 'War', 'Western', '\\N', 'movie', 'tv',
       'video_game'], dtype=object)

In [7]:
G_c.shape

(7015, 35)

In [8]:
fandom_df.loc[fandom_df['ao3_name'] == 'Shadow and Bone (TV)']

Unnamed: 0,ao3_name,count,percentage,imdb_name,media_type,genres,votes,start_year,parsed_ao3_name
132,Shadow and Bone (TV),138,0.0197,Shadow and Bone,tv,"[Action, Adventure, Drama]",38434,2021,Shadow and Bone


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

cs = cosine_similarity(G_c, G_c[14].reshape(1, -1)).flatten()
fandom_df.iloc[np.argsort(-cs)[:10]]

Unnamed: 0,ao3_name,count,percentage,imdb_name,media_type,genres,votes,start_year,parsed_ao3_name
14,呪術廻戦 | Jujutsu Kaisen (Manga),1013,0.1444,Jujutsu Kaisen,tv,"[Action, Adventure, Animation]",16598,2020,Jujutsu Kaisen
16,呪術廻戦 | Jujutsu Kaisen (Anime),951,0.1356,Jujutsu Kaisen,tv,"[Action, Adventure, Animation]",16598,2020,Jujutsu Kaisen
30,ジョジョの奇妙な冒険 | JoJo no Kimyou na Bouken | JoJo's...,598,0.0852,JoJo's Bizarre Adventure,tv,"[Action, Adventure, Animation]",11469,2012,JoJo's Bizarre Adventure
33,RWBY,498,0.071,RWBY,tv,"[Action, Adventure, Animation]",5229,2012,RWBY
46,Voltron: Legendary Defender,389,0.0555,Voltron: Legendary Defender,tv,"[Action, Adventure, Animation]",7844,2016,Voltron: Legendary Defender
27,Star Wars: The Clone Wars (2008) - All Media T...,634,0.0904,Star Wars: The Clone Wars,tv,"[Action, Adventure, Animation]",67940,2008,Star Wars: The Clone Wars
22,Naruto,839,0.1196,Naruto: Shippûden,tv,"[Action, Adventure, Animation]",89552,2007,Naruto
0,僕のヒーローアカデミア | Boku no Hero Academia | My Hero ...,3753,0.535,My Hero Academia,tv,"[Action, Adventure, Animation]",40037,2016,My Hero Academia
54,One Piece,331,0.0472,One Piece,tv,"[Action, Adventure, Animation]",75495,1999,One Piece
9,Shingeki no Kyojin | Attack on Titan,1358,0.1936,Attack on Titan,tv,"[Action, Adventure, Animation]",226886,2013,Attack on Titan


In [102]:
pickle.dump(fandom_df, open('fandom_df.p', 'wb'))

In [103]:
cd

C:\Users\Ariel


In [77]:
df = pickle.load(open('data/meta.df', 'rb'))

In [72]:
E = np.load('matrix/E.npy')

In [73]:
E.shape

(97278, 385)

In [5]:
mat_str = []
for i in trange(E.shape[0]):
    mat_str.append(
        np.array2string(E[i], precision=7, max_line_width=10000000000, threshold=100000, separator=',', suppress_small=True)
    )

  0%|          | 0/97878 [00:00<?, ?it/s]

In [6]:
import sys
sys.getsizeof(mat_str) * 1e-6

0.824456

In [57]:
df['embed'] = mat_str

In [60]:
df = df.drop_duplicates(subset=['id'], ignore_index=True)

In [61]:
sys.getsizeof(df) * 1e-6

574.186293

In [81]:
E = df['embed'].values.tolist()

In [82]:
E = np.array([np.array([float(e) for e in arr_str.lstrip('[').rstrip(']').split(',')], dtype=np.float32) \
              for arr_str in tqdm(E)], dtype=np.float32)

  0%|          | 0/97278 [00:00<?, ?it/s]

In [83]:
sys.getsizeof(E) * 1e-6

149.41912

In [79]:
titles = df['title'].tolist()
tags = df['tags'].tolist()
fandom = df['fandoms'].tolist()

In [91]:
from sklearn.metrics.pairwise import cosine_similarity

cs = cosine_similarity(E[:,1:], E[1234,1:].reshape(1,-1)).flatten()

for doc_id in np.argsort(-cs)[:10]:
    print(doc_id, titles[doc_id])
    print(fandom[doc_id])
    print(tags[doc_id][:200])
    print()

1234 road trip with bakugou ♡
['僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia']
['Car Sex', 'Sex in a Car', 'Semi-Public Sex', 'Situational Humiliation', 'Bakugou Katsuki Swears A Lot', 'Protective Bakugou Katsuki', 'Bakusquad (My Hero Academia)', 'Vaginal Sex', 'Swearing', 'Light Masochism', 'Smut', 'Fluff and Smut', 'Light Angst', 'Explicit Sexual Content', 'Rough Sex', 'Moaning']

49994 Just Speak up, Asshole!
['僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia']
['Established Relationship', 'Threesome - F/M/M', 'Dry Humping', 'Vaginal Sex', 'Penis In Vagina Sex', 'Voyeurism', 'Exhibitionism', 'Teasing', 'Lingerie', 'Rimming', 'Rim job', 'Peeping', 'Bakugou Katsuki is Bad at Feelings', 'Pro Hero Bakugou Katsuki', 'Pro Hero Kirishima Eijirou', 'Hand Jobs', 'degradation kink', 'Face Slapping', 'Soft Bakugou Katsuki', 'But only a little']

34680 Where the Sunflowers Grow
['僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia']
['Fluff', 'Smut', 'Fluff and Smut', 'Marria

In [113]:
E.shape

(97278, 385)

In [6]:
from sqlalchemy import create_engine

engine = create_engine('postgresql+psycopg2://postgres:123456@localhost:5432/testdb')

In [2]:
import pickle

meta_df = pickle.load(open('data/meta.df', 'rb'))

In [4]:
meta_df.to_sql('ao3', if_exists='replace', con=engine)

In [3]:
vector_df = pickle.load(open('data/vector_df.p', 'rb'))

In [4]:
vector_df.head()

Unnamed: 0,embed,tags,characters,sentiment
0,"[-0.1220156, 0.5163098, 0.0168392,-0.2102881,-...","[-0.0808807, 0.4546168, 0.1934488,-0.1056849, ...","[ 0.7173135,-0.3503059, 1.6001508, 2.3220794,-...","[0.3794565,0.9205806,0.0942467]"
1,"[-0.0254342, 0.2541474, 0.0999457,-0.0829101,-...","[-0.1039117, 0.0335811, 0.3140417,-0.1341535, ...","[ 0.0411195, 0.1124795, 1.2972314, 0.2748613, ...","[0.0925136,0.6709683,0.6846397]"
2,"[ 0.0761577, 0.2086556, 0.1808579, 0.179745 ,-...","[ 0.1637748,-0.2932914, 0.0707718,-0.008202 , ...","[-0.7703701,-1.1007768,-0.3634353, 2.7148223,-...","[0.0587411,0.6919576,0.7030949]"
3,"[ 0.2387345,-0.0915566, 0.2715377,-0.0863949,-...","[-0.2427005, 0.111023 , 0.2474043,-0.1890667,-...","[ 3.0266976, 0.3480441, 2.6597486, 4.0506787,-...","[0.1476961,0.5292994,0.9346194]"
4,"[ 0.0317789, 0.3286527, 0.1339101, 0.0759615,-...","[-0.2787858, 0.4334798, 0.4876408, 0.2829095, ...","[ 1.0961045, 1.4200339, 3.0561199, 1.8264418, ...","[0.0429098,0.7382633,0.8966387]"


In [7]:
vector_df.to_sql('matrix', if_exists='replace', con=engine)

In [15]:
embed_df = pickle.load(open('data/embed_df_15p.p', 'rb'))

In [20]:
embed_df.to_sql('word2vec', if_exists='replace', con=engine)

In [11]:
import nltk
nltk.data.path

['C:\\Users\\Ariel/nltk_data',
 'C:\\Users\\Ariel\\anaconda3\\nltk_data',
 'C:\\Users\\Ariel\\anaconda3\\share\\nltk_data',
 'C:\\Users\\Ariel\\anaconda3\\lib\\nltk_data',
 'C:\\Users\\Ariel\\AppData\\Roaming\\nltk_data',
 'C:\\nltk_data',
 'D:\\nltk_data',
 'E:\\nltk_data']