In [1]:
import os
import re
import json
import random
import traceback
import pickle as pkl
from datetime import datetime

import pytz
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt


from download_submissions import pprint_tree



In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names. Prefer the new name and fall back to
# the legacy one so the notebook runs on both old and new installations.
colorblind_style = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-colorblind", "seaborn-colorblind")
        if style_name in plt.style.available
    ),
    "seaborn-colorblind",  # neither listed: let plt.style.use raise as it did before
)
plt.style.use(colorblind_style)

In [4]:
from common import load_submission, load_submissions, load_rows
from common import Row

In [6]:
# Folder passed to load_rows (imported from common); presumably holds the
# 2018-2019 data dump -- confirm against common.load_rows.
folder = "./2018-2019/"
# Slow step: the recorded run processed 211,078 items in roughly 3m50s.
rows = load_rows(folder)

100%|██████████| 211078/211078 [03:50<00:00, 915.02it/s] 


In [7]:
# One record per row: each row object's attribute dict becomes a DataFrame row.
df = pd.DataFrame(list(map(vars, rows)))

In [8]:
# Frequency of every distinct raw status string, most common first.
df["status"].value_counts()

Solved!                           70565
removed                           57557
unknown                           43778
Open                              26374
Open.                              5397
Solved                             3712
Solved.                            3538
Pending                             147
Announcement                          4
Lazy music                            2
Belongs on different sub              1
Lacking info                          1
Solved: "Found Footage" (2016)        1
deleted by OP                         1
Name: status, dtype: int64

In [9]:
# Build the map from raw status string -> canonical status.
# Helper layout: canonical status -> every raw status string that belongs to it.
status_dict_helper = {
    "solved": ("Solved!", "Solved", "Solved."),
    "removed": ("removed", "deleted by OP"),
    "open": ("Open", "Open."),
    # NOTE(review): 'Solved: "Found Footage" (2016)' is filed under "other"
    # rather than "solved" -- confirm this is intentional.
    "other": ("Pending", "Announcement", "Lazy music",
              "Belongs on different sub", 'Solved: "Found Footage" (2016)',
              "Lacking info"),
    "unknown": ("unknown", )
}
# Invert the helper for ease of use: raw status -> canonical status.
status_dict = {
    raw_status: canonical
    for canonical, raw_statuses in status_dict_helper.items()
    for raw_status in raw_statuses
}

# Fail loudly (and list *all* offenders) if the data contains a status that the
# map does not cover, instead of relying on an incidental KeyError from a bare
# lookup that only reports the first missing value.
unmapped_statuses = sorted(set(df.status.unique()) - set(status_dict), key=str)
if unmapped_statuses:
    raise KeyError(f"Statuses missing from status_dict: {unmapped_statuses}")

status_dict

{'Solved!': 'solved',
 'Solved': 'solved',
 'Solved.': 'solved',
 'removed': 'removed',
 'deleted by OP': 'removed',
 'Open': 'open',
 'Open.': 'open',
 'Pending': 'other',
 'Announcement': 'other',
 'Lazy music': 'other',
 'Belongs on different sub': 'other',
 'Solved: "Found Footage" (2016)': 'other',
 'Lacking info': 'other',
 'unknown': 'unknown'}

In [10]:
# save status_dict
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when ./meta_data does not exist yet.
os.makedirs("./meta_data", exist_ok=True)
with open("./meta_data/status_dict.json", "w") as writer:
    json.dump(status_dict, writer)

In [11]:
# Count the distinct raw (case-sensitive) category strings.
n_raw_categories = np.unique(df["category"].values).size
print(f"Total Categories: {n_raw_categories}")

Total Categories: 13461


In [12]:
# too many! so pick the 100 most frequent
from collections import Counter

# How many of the most common categories are exported for manual labelling.
N_TOP_CATEGORIES = 100

# Lower-case once (vectorised) so case variants collapse into one category;
# the original ran the same per-row lambda twice over the whole column.
categories_lower = df.category.str.lower()
category_counts = Counter(categories_lower.values)
# Set of all distinct lower-cased categories (same contents as the Counter's keys).
categories = set(category_counts)

# focus on only the most commonly occurring categories
# - explicit UTF-8 so non-ASCII category names are written identically on
#   every platform;
# - a generator expression keeps the loop variables out of the notebook
#   namespace (a top-level `for k, _ in ...` rebinds IPython's `_`).
with open("most_freq_categories.csv", "w", encoding="utf-8") as writer:
    writer.writelines(
        f"{category}\n"
        for category, _count in category_counts.most_common(N_TOP_CATEGORIES)
    )

In [13]:
# The most frequent categories were manually grouped and saved in
# TOMT_Dataset_Categorization.tsv (tab-separated).
manual_categorization = pd.read_csv(
    "./TOMT_Dataset_Categorization.tsv",
    sep="\t",
)

In [14]:
# Lower-cased column header -> list of the non-null values in that column of
# the manual categorisation sheet.
category_dict_helper = {
    header.lower(): list(manual_categorization[header].dropna().values)
    for header in manual_categorization.columns
}
# Invert the helper: raw category -> canonical category (the column header).
# If a raw category appears under several columns, the last column wins.
# NOTE(review): the displayed result maps "children's book" to 'music', which
# looks like a misfiled entry in the TSV -- confirm and fix the sheet if so.
category_dict = {
    raw_category: canonical
    for canonical, raw_categories in category_dict_helper.items()
    for raw_category in raw_categories
}
category_dict

{'music': 'music',
 'song': 'music',
 'album': 'music',
 'band': 'music',
 "children's book": 'music',
 'song/music video': 'music',
 'musicvideo': 'music',
 'book': 'book',
 'novel': 'book',
 'short story': 'book',
 'story': 'book',
 'webcomic': 'book',
 'manga': 'book',
 'article': 'book',
 'book series': 'book',
 'children’s book': 'book',
 'books': 'book',
 'movie': 'movie/tv',
 'tv show': 'movie/tv',
 'cartoon': 'movie/tv',
 'tv': 'movie/tv',
 'show': 'movie/tv',
 'anime': 'movie/tv',
 'film': 'movie/tv',
 'movies': 'movie/tv',
 'movie/tv': 'movie/tv',
 'animation': 'movie/tv',
 'tv series': 'movie/tv',
 'documentary': 'movie/tv',
 'movie/tv show': 'movie/tv',
 'short film': 'movie/tv',
 'tvshow': 'movie/tv',
 'movie or tv show': 'movie/tv',
 'movie?': 'movie/tv',
 'movie/show': 'movie/tv',
 'horror movie': 'movie/tv',
 'animated movie': 'movie/tv',
 'tv episode': 'movie/tv',
 'tv/movie': 'movie/tv',
 'television': 'movie/tv',
 'kids show': 'movie/tv',
 'kids tv show': 'movie/tv',

In [15]:
# save category_dict
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when ./meta_data does not exist yet.
os.makedirs("./meta_data", exist_ok=True)
with open("./meta_data/category_dict.json", "w") as writer:
    json.dump(category_dict, writer)