In [23]:
"""
Takes the CSV export of the `mal-user-animelists` table and processes it into the same format
as the original CSV ratings from Kaggle
"""

import json
import csv
import sys
from datetime import datetime

# Each input row carries a user's whole animelist as one JSON-encoded CSV field, which
# can exceed the csv module's default per-field size limit; raise the limit to the
# largest value available so those rows can be parsed.
csv.field_size_limit(sys.maxsize)

def parse_datetime(raw: str) -> datetime:
    """Convert a timestamp string into a timezone-aware ``datetime``.

    The input must look like ``YYYY-MM-DDTHH:MM:SS`` immediately followed by a
    UTC offset (for example ``2020-08-10T00:18:32+00:00``).  ``ValueError`` is
    raised by ``strptime`` when the text does not match that layout.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S%z"
    parsed_timestamp = datetime.strptime(raw, timestamp_format)
    return parsed_timestamp

# By default, we only take ratings from users that have rated an anime relatively recently
min_latest_updated = parse_datetime("2020-08-10T00:18:32+00:00")

# Every anime ID that ends up in the output file
all_anime_ids = set()
retained_user_count = 0
skipped_user_count = 0
# Number of rating rows actually written to the output file
total_retained_rating_count = 0
# Number of malformed entries printed so far.  Entries are only printed while this is
# below 10, so starting at 100 keeps the debug dump switched off; set it to 0 to see
# the first 10 malformed entries.
debug_count = 100

# Data exported from MySQL.  Both files are opened with newline='' as the csv module
# recommends, so that the csv reader/writer handle line endings themselves.
with open('./work/data/mal-user-animelists.csv', newline='') as rf, \
        open('./work/data/collected_animelists.csv', 'wt', newline='') as wf:
    reader = csv.reader(rf, delimiter=',', quotechar='"', escapechar='\\')
    writer = csv.writer(wf, delimiter=',', quotechar='"')
    writer.writerow(['username', 'anime_id', 'my_score', 'status', 'start_date', 'end_date'])

    # NOTE: no header row is skipped -- every input row is treated as a user profile
    # (column 0: username, column 1: JSON-encoded animelist).
    for i, row in enumerate(reader):
        if i % 10000 == 0:
            print(f"Processed {i} user profiles")

        username = row[0]
        parsed = json.loads(row[1])

        # Keep the user only if at least one list entry was updated after the cutoff;
        # any() stops at the first match instead of scanning the whole list.
        any_recently_updated = any(
            parse_datetime(entry['list_status']['updated_at']) > min_latest_updated
            for entry in parsed
        )
        if not any_recently_updated:
            skipped_user_count += 1
            continue

        for entry in parsed:
            list_status = entry['list_status']

            # A score of 0 (or a missing score) means the user has not rated the anime
            my_score = list_status.get('score', 0)
            if my_score == 0:
                continue

            anime_id = entry['node'].get('id')
            status = list_status.get('status')
            if status is None or anime_id is None:
                # Malformed entry: optionally dump it for inspection, then drop it
                if debug_count < 10:
                    print(entry)
                    debug_count += 1
                continue

            all_anime_ids.add(anime_id)
            start_date = list_status.get('start_date', '')
            end_date = list_status.get('end_date', '')
            writer.writerow([username, anime_id, my_score, status, start_date, end_date])
            # Count only rows that were really written, so the summary below reports
            # ratings rather than all list entries (unrated/malformed ones included).
            total_retained_rating_count += 1

        retained_user_count += 1

print(f"Retained {retained_user_count} user profiles with {total_retained_rating_count} ratings")
print(f"Skipped {skipped_user_count} user profiles")

Processed 0 user profiles
Processed 10000 user profiles
Processed 20000 user profiles
Processed 30000 user profiles
Processed 40000 user profiles
Processed 50000 user profiles
Processed 60000 user profiles
Processed 70000 user profiles
Processed 80000 user profiles
Processed 90000 user profiles
Processed 100000 user profiles
Processed 110000 user profiles
Processed 120000 user profiles
Processed 130000 user profiles
Processed 140000 user profiles
Processed 150000 user profiles
Processed 160000 user profiles
Processed 170000 user profiles
Processed 180000 user profiles
Processed 190000 user profiles
Processed 200000 user profiles
Processed 210000 user profiles
Processed 220000 user profiles
Processed 230000 user profiles
Processed 240000 user profiles
Processed 250000 user profiles
Processed 260000 user profiles
Processed 270000 user profiles
Processed 280000 user profiles
Processed 290000 user profiles
Processed 300000 user profiles
Processed 310000 user profiles
Processed 320000 user 

In [13]:
# Persist every anime ID gathered in `all_anime_ids` as a single JSON array
with open('./work/data/all-anime-ids.json', 'wt') as ids_file:
    serialized_ids = json.dumps(list(all_anime_ids))
    ids_file.write(serialized_ids)

In [22]:
# Re-export the raw animelists with a header row and with fields this cell does not
# keep removed from every entry.  Maps each sub-object of an entry to the keys that
# are dropped from it.
dropped_fields = {
    'node': ('main_picture', 'title'),
    'list_status': ('num_episodes_watched', 'is_rewatching'),
}

with open('./work/data/mal-user-animelists.csv') as rf, \
        open('./work/data/converted-mal-user-animelists.csv', 'wt', newline='') as wf:
    reader = csv.reader(rf, delimiter=',', quotechar='"', escapechar='\\')
    writer = csv.writer(wf, delimiter=',', quotechar='"')
    writer.writerow(['username', 'animelist_json'])

    for record in reader:
        username, animelist = record[0], json.loads(record[1])

        for entry in animelist:
            for section, field_names in dropped_fields.items():
                section_data = entry[section]
                for field_name in field_names:
                    # Only fields holding a non-null value are removed
                    if section_data.get(field_name) is not None:
                        del section_data[field_name]

        writer.writerow([username, json.dumps(animelist)])