# Дополнение датасета аниме дополнительной информацией

Исходный датасет: [Kaggle](https://www.kaggle.com/datasets/a9ece97f83e99ab5955ddf7ab9c3f3a9047ba5cdbb08189b11e7243630d969d8).

Дополнительную информацию будем брать с сайта [Shikimori](https://shikimori.one/) ([пример](https://shikimori.one/animes/1575-code-geass-hangyaku-no-lelouch) аниме). В нее входит:
- название аниме на русском языке;
- описание аниме на русском языке;
- ссылка на обложку-изображение аниме;
- ссылка на само аниме на этом же сайте;
- тег серии аниме (для связанных аниме);
- рейтинг с этого сайта.

In [1]:
import pandas as pd
import numpy as np

def extend_raw_anime(path: str) -> pd.DataFrame:
    """Load the anime dataset and add empty columns for Shikimori data.

    The CSV at ``path`` is read and indexed by its ``Rank`` column. Each of
    the extra columns is then added, filled with NaN, unless the file already
    contains a column with that name (existing values are left untouched, so
    the function can be re-run on an already extended file).

    Args:
          path (str): path to the CSV file to extend in the local file system.

    Returns:
          pd.DataFrame: the dataframe with the additional columns present.
    """

    # Read from the caller-supplied path rather than a hard-coded file name.
    df = pd.read_csv(path).set_index("Rank")

    # Placeholder columns to be filled in later.
    new_columns = (
        "Russian_name",
        "Russian_description",
        "Image_url",
        "Shikimori_url",
        "Franchise",
        "Shikimori_rating",
    )

    for column in new_columns:
        if column not in df.columns:
            df[column] = np.nan

    return df

In [11]:
import pandas as pd

# Load the extended dataset and use the Rank column as the row index.
df = pd.read_csv("data/extended_anime.csv")
df = df.set_index("Rank")

# Show the first two rows as the cell output.
df.head(2)

Unnamed: 0_level_0,Name,Japanese_name,Type,Episodes,Studio,Release_season,Tags,Rating,Release_year,End_year,...,Related_Mange,Related_anime,Voice_actors,staff,Russian_name,Russian_description,Image_url,Shikimori_url,Franchise,Shikimori_rating
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Demon Slayer: Kimetsu no Yaiba - Entertainment...,Kimetsu no Yaiba: Yuukaku-hen,TV,,ufotable,Fall,"Action, Adventure, Fantasy, Shounen, Demons, H...",4.6,2021.0,,...,Demon Slayer: Kimetsu no Yaiba,"Demon Slayer: Kimetsu no Yaiba, Demon Slayer: ...","Inosuke Hashibira : Yoshitsugu Matsuoka, Nezuk...","Koyoharu Gotouge : Original Creator, Haruo Sot...","Клинок, рассекающий демонов: Квартал красных ф...","<div class=""b-text_with_paragraphs"">Продолжени...",https://shikimori.one/system/animes/original/4...,https://shikimori.one/animes/47778-kimetsu-no-...,demon_slayer,8.82
2,Fruits Basket the Final Season,Fruits Basket the Final,TV,13.0,TMS Entertainment,Spring,"Drama, Fantasy, Romance, Shoujo, Animal Transf...",4.6,2021.0,,...,"Fruits Basket, Fruits Basket Another","Fruits Basket 1st Season, Fruits Basket 2nd Se...","Akito Sohma : Maaya Sakamoto, Kyo Sohma : Yuum...","Natsuki Takaya : Original Creator, Yoshihide I...",Корзинка фруктов: Финал,"<div class=""b-text_with_paragraphs"">Сотни лет ...",https://shikimori.one/system/animes/original/4...,https://shikimori.one/animes/42938-fruits-bask...,fruits_basket,9.01


Для получения всех необходимых данных с сайта будем использовать библиотеку [shiki4py](https://github.com/ren3104/Shiki4py).

In [3]:
import asyncio
import logging
import asyncio
from typing import List


import tqdm
from shiki4py import Shikimori
from shiki4py.types import Anime


# Write log records of level DEBUG and above to a file.
logging.basicConfig(level=logging.DEBUG, filename="logs/anime.log")


async def search(anime_name: str) -> List[Anime]:
    """Query Shikimori for anime matching a name.

    A new client is opened for the call and closed when it finishes.

    Args:
        anime_name (str): text passed as the ``search`` filter of the request.

    Returns:
        List[Anime]: the result of ``api.animes.show_part`` for this query.
    """
    # Client without authorization
    async with Shikimori("APP_NAME") as client:
        return await client.animes.show_part(search=anime_name)


async def get_anime(anime_id: int) -> Anime:
    """Fetch a single anime from Shikimori by its identifier.

    A new client is opened for the call and closed when it finishes.

    Args:
        anime_id (int): identifier passed to ``api.animes.show_one``.

    Returns:
        Anime: the result of ``api.animes.show_one`` for this identifier.
    """
    async with Shikimori("APP_NAME") as client:
        return await client.animes.show_one(anime_id)


# Columns filled from Shikimori, each mapped to a function that builds the
# value from the API response. The value is only computed for a field that is
# actually empty, so an attribute missing from the response cannot break a row
# whose corresponding field is already filled.
_SHIKIMORI_FIELDS = {
    "Russian_name": lambda a: a.russian,
    "Russian_description": lambda a: a.description_html,
    "Image_url": lambda a: "https://shikimori.one{}".format(a.image.original),
    "Shikimori_url": lambda a: "https://shikimori.one{}".format(a.url),
    "Franchise": lambda a: a.franchise,
    "Shikimori_rating": lambda a: a.score,
}

# Columns that receive string values.
_SHIKIMORI_TEXT_COLUMNS = (
    "Russian_name",
    "Russian_description",
    "Image_url",
    "Shikimori_url",
    "Franchise",
)


async def _fill_from_shikimori(index, anime) -> None:
    """Fill the empty Shikimori fields of one row of the global ``df``.

    Rows with no empty field are left alone. Otherwise the anime is searched
    by its ``Name``; the row is only updated when the search yields exactly
    one result. Any error raised while requesting or storing the data is
    logged with its traceback and the row is skipped, so that one failing
    title does not stop the whole run. Fields written before the error stay
    written.

    Args:
        index: index label of the row in ``df``.
        anime: the row as yielded by ``df.iterrows()``.
    """
    missing = [column for column in _SHIKIMORI_FIELDS if pd.isna(anime[column])]
    if not missing:
        return

    anime_name = anime.Name

    try:
        animes = await search(anime_name)

        # Zero or several candidates: there is no single entry to take data from.
        if len(animes) != 1:
            logging.info(
                "Skipping %s: expected exactly one search result, got %d",
                anime_name,
                len(animes),
            )
            return

        api_anime = await get_anime(animes[0].id)

        for column in missing:
            df.at[index, column] = _SHIKIMORI_FIELDS[column](api_anime)
    except Exception:
        logging.exception("Error of processing %s", anime_name)
        return

    logging.info("A processing of anime %s is successful", anime_name)


async def main():
    """Fill the Shikimori columns of the global ``df``, one row at a time.

    Progress is shown with a tqdm bar that advances once per row, whether the
    row was filled, skipped or failed.
    """

    # A column that is entirely NaN is read from CSV as float64; recent pandas
    # versions warn about (or refuse) storing strings in such a column, so the
    # text columns are switched to object dtype up front.
    for column in _SHIKIMORI_TEXT_COLUMNS:
        df[column] = df[column].astype(object)

    with tqdm.tqdm(total=len(df)) as pbar:
        for i, anime in df.iterrows():
            await _fill_from_shikimori(i, anime)
            pbar.update(1)


# Schedule main() on the current event loop as a background task. The task is
# kept in a variable so that it is not lost and can be inspected later:
# `await task` waits for the download to finish and re-raises any error that
# stopped it, which is worth doing before the dataframe is saved.
loop = asyncio.get_event_loop()
task = loop.create_task(main())

# Show the task as the cell output.
task

<Task pending name='Task-4' coro=<main() running at /var/folders/lm/0j0v2m453ks57mmwdnjfvz700000gn/T/ipykernel_53697/2559358617.py:37>>

 75%|███████▍  | 13834/18495 [2:36:59<3:07:33,  2.41s/it]

In [None]:
# Preview the first three rows of the dataframe.
df.head(3)

In [None]:
# Save the dataframe to the same CSV file it was loaded from.
df.to_csv("data/extended_anime.csv")