# Generating a list of Anime-Planet usernames
* Collects the usernames of users who love or hate each character

In [None]:
import random
import re

In [None]:
# Job name and data-source label for this notebook.
# NOTE(review): presumably consumed by WebEndpointBase.ipynb (run in a later cell) — confirm.
name = "get_character_usernames"
source = "animeplanet"

In [None]:
# NOTE(review): PROXY_PARTITION is presumably read by WebEndpointBase.ipynb to pick proxies — confirm.
PROXY_PARTITION = "1,2"
# Run the shared notebook in this namespace. Later cells use os, time, logger, call_api and
# atomic_to_csv without importing or defining them here, so they presumably come from it.
%run WebEndpointBase.ipynb

In [None]:
# If the notebook is being rerun, resume from the checkpoint files left by the previous run.
def _load_checkpoint(path):
    """Return the non-blank, stripped lines of `path` as a set (empty if the file is missing)."""
    if not os.path.exists(path):
        return set()
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return {entry for entry in stripped if entry}


usernames = _load_checkpoint("character_usernames.txt")
visited = _load_checkpoint("character_visited.txt")

logger.info(
    f"Starting with {len(usernames)} stored usernames and {len(visited)} visited characters"
)

In [None]:
def get_characters():
    """Collect character slugs from the Anime-Planet character listing pages.

    Every listing page is fetched with `call_api`. A page that comes back as
    404, or whose response is otherwise not OK, is logged and skipped.

    Returns:
        set[str]: each path segment found directly after "/characters/" in
        the text of the pages that were fetched successfully.
    """
    listing_pages = [
        "https://www.anime-planet.com/characters/",
        "https://www.anime-planet.com/characters/top-loved",
        "https://www.anime-planet.com/characters/top-loved/today",
        "https://www.anime-planet.com/characters/top-loved/week",
        "https://www.anime-planet.com/characters/top-hated",
        "https://www.anime-planet.com/characters/top-hated/today",
        "https://www.anime-planet.com/characters/top-hated/week",
    ]
    prefix = "/characters/"
    # The slug ends at the first quote, slash, or URL/extension delimiter.
    slug_pattern = re.compile(prefix + """[^"/#%?.']+""")

    found = set()
    for page_url in listing_pages:
        response = call_api(page_url)
        if response.status_code == 404 or not response.ok:
            logger.warning(f"Error {response} received when handling {page_url}")
            continue
        for link in slug_pattern.findall(response.text):
            found.add(link[len(prefix) :])
    return found

In [None]:
def order_chars(characters, visited):
    """Return the characters that have not been visited yet, in random order.

    Args:
        characters: set of all known character slugs.
        visited: set of character slugs that were already processed.

    Returns:
        list: the elements of `characters - visited`, shuffled in place with
        `random.shuffle`.
    """
    pending = characters - visited
    shuffled = [*pending]
    random.shuffle(shuffled)
    return shuffled

In [None]:
def get_users_on_url(url):
    """Fetch `url` and return the usernames linked from it.

    Args:
        url: address of the page to fetch with `call_api`.

    Returns:
        set[str]: each path segment found directly after "/users/" in the
        response text, or an empty set when the response is a 404 or is
        otherwise not OK (a warning is logged in that case).
    """
    response = call_api(url)
    if response.status_code == 404 or not response.ok:
        logger.warning(f"Error {response} received when handling {url}")
        return set()

    prefix_length = len("/users/")
    links = re.findall("""/users/[^"/#%?.']+""", response.text)
    return {link[prefix_length:] for link in links}

In [None]:
def get_users_on_base_url(base_url):
    """Walk the numbered pages of `base_url` and gather every username seen.

    Pages are requested as "<base_url>?page=N" starting at N = 1. The walk
    stops at the first page that contributes no username that was not already
    collected (this includes pages that fail to load, which yield no users).

    Args:
        base_url: listing URL without the "?page=" query string.

    Returns:
        set[str]: the usernames gathered across all pages.
    """
    collected = set()
    page = 1
    while True:
        page_users = get_users_on_url(base_url + f"?page={page}")
        # Nothing new on this page: we have run past the end of the listing.
        if page_users <= collected:
            return collected
        collected.update(page_users)
        page += 1

In [None]:
def get_users(character):
    """Return the usernames listed on a character's "loves" and "hates" pages.

    Args:
        character: character slug as it appears in the Anime-Planet URL.

    Returns:
        set[str]: the union of the users found on both listings.
    """
    fans = set()
    for reaction in ("loves", "hates"):
        fans |= get_users_on_base_url(
            f"https://www.anime-planet.com/characters/{character}/{reaction}"
        )
    return fans

In [None]:
def get_recent_users():
    """Collect usernames from community, review, recommendation and forum pages.

    Each entry pairs a page URL with the link prefix that precedes a username
    on that page ("/users/" on the main site, "members/" on the forum). Pages
    that come back as 404 or otherwise not OK are logged and skipped.

    Returns:
        set[str]: each path segment found directly after the page's prefix.
    """
    sources = [
        ("https://www.anime-planet.com/community/", "/users/"),
        ("https://www.anime-planet.com/users/recent_user_reviews.php?mode=anime", "/users/"),
        ("https://www.anime-planet.com/users/recent_user_reviews.php?mode=manga", "/users/"),
        ("https://www.anime-planet.com/users/recent_recommendations.php?filter=anime&page=1", "/users/"),
        ("https://www.anime-planet.com/users/recent_recommendations.php?filter=manga&page=1", "/users/"),
        ("https://www.anime-planet.com/forum/", "members/"),
        ("https://www.anime-planet.com/forum/recent-activity/", "members/"),
        ("https://www.anime-planet.com/forum/members/", "members/"),
        ("https://www.anime-planet.com/forum/find-new/1/posts", "members/"),
        ("https://www.anime-planet.com/forum/find-new/1/profile-posts", "members/"),
        ("https://www.anime-planet.com/forum/online/?type=registered", "members/"),
    ]

    recent = set()
    for page_url, prefix in sources:
        response = call_api(page_url)
        if response.status_code == 404 or not response.ok:
            logger.warning(f"Error {response} received when handling {page_url}")
            continue
        links = re.findall(prefix + """[^"/#%?.']+""", response.text)
        recent.update(link[len(prefix) :] for link in links)
    return recent

In [None]:
# Crawl forever: refresh the character list, then visit every character not yet
# processed, checkpointing to disk whenever the set of usernames has grown.
while True:
    characters = get_characters()
    order = order_chars(characters, visited)
    last_refresh = time.time()
    for char in order:
        count_before = len(usernames)
        usernames.update(get_users(char))
        visited.add(char)

        # At most once an hour, also pull usernames from the recent-activity pages.
        if time.time() - last_refresh > 3600:
            usernames.update(get_recent_users())
            last_refresh = time.time()

        # Nothing new was found, so skip rewriting the checkpoint files.
        if len(usernames) <= count_before:
            continue
        atomic_to_csv(sorted(usernames), "character_usernames.txt")
        atomic_to_csv(sorted(visited), "character_visited.txt")
        logger.info(
            f"Successfully written {len(usernames)} users and visited {len(visited)} characters"
        )