In [1]:
%load_ext lineapy

  from pandas import MultiIndex, Int64Index


# Recommendation system

DB:
- ID;
- Name;
- Rating;
- Episodes;
- Studio;
- Tags;
- Description;
- Related_Mange;
- Recommendations.

RS (recommendation system):
Tags -> Recommendations (a list of similar anime).

In [2]:
import pandas as pd

In [4]:
# Location of the raw anime table, relative to this notebook.
path_to_csv = "../../data/01_raw/anime.csv"

# Read the raw CSV, then key the rows by the "Rank" column.
df = pd.read_csv(filepath_or_buffer=path_to_csv)
indexed_df = df.set_index(keys="Rank")

In [5]:
# Preview the first three rows of the indexed table.
indexed_df.iloc[:3]

Unnamed: 0_level_0,Name,Japanese_name,Type,Episodes,Studio,Release_season,Tags,Rating,Release_year,End_year,Description,Content_Warning,Related_Mange,Related_anime,Voice_actors,staff
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,Demon Slayer: Kimetsu no Yaiba - Entertainment...,Kimetsu no Yaiba: Yuukaku-hen,TV,,ufotable,Fall,"Action, Adventure, Fantasy, Shounen, Demons, H...",4.6,2021.0,,'Tanjiro and his friends accompany the Hashira...,Explicit Violence,Demon Slayer: Kimetsu no Yaiba,"Demon Slayer: Kimetsu no Yaiba, Demon Slayer: ...","Inosuke Hashibira : Yoshitsugu Matsuoka, Nezuk...","Koyoharu Gotouge : Original Creator, Haruo Sot..."
2,Fruits Basket the Final Season,Fruits Basket the Final,TV,13.0,TMS Entertainment,Spring,"Drama, Fantasy, Romance, Shoujo, Animal Transf...",4.6,2021.0,,'The final arc of Fruits Basket.',"Emotional Abuse,, Mature Themes,, Physical Abu...","Fruits Basket, Fruits Basket Another","Fruits Basket 1st Season, Fruits Basket 2nd Se...","Akito Sohma : Maaya Sakamoto, Kyo Sohma : Yuum...","Natsuki Takaya : Original Creator, Yoshihide I..."
3,Mo Dao Zu Shi 3,The Founder of Diabolism 3,Web,12.0,B.C MAY PICTURES,,"Fantasy, Ancient China, Chinese Animation, Cul...",4.58,2021.0,,'The third season of Mo Dao Zu Shi.',,Grandmaster of Demonic Cultivation: Mo Dao Zu ...,"Mo Dao Zu Shi 2, Mo Dao Zu Shi Q","Lan Wangji, Wei Wuxian, Jiang Cheng, Jin Guang...","Mo Xiang Tong Xiu : Original Creator, Xiong Ke..."


In [6]:
# Number of rows in the indexed table.
indexed_df.shape[0]

18495

In [7]:
# Count of missing values in each column.
indexed_df.isna().sum()

Name                   0
Japanese_name      10557
Type                   0
Episodes            8994
Studio              6477
Release_season     14379
Tags                 400
Rating              3131
Release_year         383
End_year           15641
Description            4
Related_Mange      10868
Related_anime       8432
Voice_actors        3186
staff               5490
dtype: int64

In [8]:
from typing import List

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors


def build_recommendations(
        rs_df: pd.DataFrame,
        column: str,
        rec_column_name: str = "recommendations",
        n_recommendations: int = 10) -> pd.DataFrame:
    """Build recommendations for every row of a DataFrame from one text column.

    The text of ``column`` is vectorized with a bag-of-words model, and for each
    row the index labels of the rows with the nearest vectors are stored as
    that row's recommendations.

    Args:
        rs_df (pd.DataFrame): source table; it is not modified.
        column (str): name of the text column the recommendations are built on.
        rec_column_name (str, optional): name of the column that receives the
            recommendations. Defaults to "recommendations".
        n_recommendations (int, optional): maximum number of recommendations
            stored per row. Defaults to 10.

    Returns:
        pd.DataFrame: ``rs_df`` joined with a new ``rec_column_name`` column
        holding, per row, a list of index labels of the recommended rows.
        Rows whose ``column`` value is missing get no recommendations (NaN).
    """

    # Only the target column is needed, and dropna() already returns a new
    # Series, so the source frame does not have to be copied.
    target_column = rs_df[column].dropna()

    # Native Python labels, so the stored lists do not contain numpy scalars.
    labels = target_column.index.tolist()

    # A token is a word character followed by one or more word characters or
    # spaces, so a multi-word entry between separators stays a single token.
    vectorizer = CountVectorizer(token_pattern=r"(?u)(\w[\w ]+)")
    vectors = vectorizer.fit_transform(target_column.tolist())

    # Ask for one extra neighbour because a row is normally returned as its own
    # neighbour; never ask for more neighbours than there are rows.
    n_neighbors = min(n_recommendations + 1, len(labels))
    nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(vectors)
    _, indices = nbrs.kneighbors(vectors)

    def to_labels(position: int, neighbours: List[int]) -> List[int]:
        """Map neighbour positions to index labels, leaving out the row itself.

        The row's own position is filtered out explicitly instead of dropping
        the first neighbour: when several rows have identical vectors the row
        itself is not guaranteed to be listed first.

        Args:
            position (int): positional index of the row the list belongs to.
            neighbours (List[int]): positional indices of its nearest rows.

        Returns:
            List[int]: index labels of at most ``n_recommendations`` rows.
        """

        return [labels[p] for p in neighbours if p != position][:n_recommendations]

    recommendations = pd.Series(
        [to_labels(position, row) for position, row in enumerate(indices.tolist())],
        index=target_column.index,
        name=rec_column_name,
    )

    return rs_df.join(recommendations)


In [9]:
# Text column the recommendations are derived from.
column = 'Tags'

# Attach the recommendations, then label the index as "_id".
final_df = build_recommendations(rs_df=indexed_df, column=column)
final_df.index.name = "_id"

In [10]:
# Preview the first three rows, including the new recommendations column.
final_df.iloc[:3]

Unnamed: 0_level_0,Name,Japanese_name,Type,Episodes,Studio,Release_season,Tags,Rating,Release_year,End_year,Description,Content_Warning,Related_Mange,Related_anime,Voice_actors,staff,recommendations
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Demon Slayer: Kimetsu no Yaiba - Entertainment...,Kimetsu no Yaiba: Yuukaku-hen,TV,,ufotable,Fall,"Action, Adventure, Fantasy, Shounen, Demons, H...",4.6,2021.0,,'Tanjiro and his friends accompany the Hashira...,Explicit Violence,Demon Slayer: Kimetsu no Yaiba,"Demon Slayer: Kimetsu no Yaiba, Demon Slayer: ...","Inosuke Hashibira : Yoshitsugu Matsuoka, Nezuk...","Koyoharu Gotouge : Original Creator, Haruo Sot...","[13, 15011, 3062, 6269, 4422, 7592, 8325, 1013..."
2,Fruits Basket the Final Season,Fruits Basket the Final,TV,13.0,TMS Entertainment,Spring,"Drama, Fantasy, Romance, Shoujo, Animal Transf...",4.6,2021.0,,'The final arc of Fruits Basket.',"Emotional Abuse,, Mature Themes,, Physical Abu...","Fruits Basket, Fruits Basket Another","Fruits Basket 1st Season, Fruits Basket 2nd Se...","Akito Sohma : Maaya Sakamoto, Kyo Sohma : Yuum...","Natsuki Takaya : Original Creator, Yoshihide I...","[27, 108, 911, 11948, 5726, 2540, 14694, 4238,..."
3,Mo Dao Zu Shi 3,The Founder of Diabolism 3,Web,12.0,B.C MAY PICTURES,,"Fantasy, Ancient China, Chinese Animation, Cul...",4.58,2021.0,,'The third season of Mo Dao Zu Shi.',,Grandmaster of Demonic Cultivation: Mo Dao Zu ...,"Mo Dao Zu Shi 2, Mo Dao Zu Shi Q","Lan Wangji, Wei Wuxian, Jiang Cheng, Jin Guang...","Mo Xiang Tong Xiu : Original Creator, Xiong Ke...","[3205, 2181, 2235, 3722, 2105, 2379, 2678, 682..."


In [24]:
# Write the table with recommendations next to the notebook.
out_path = "df_with_recommendations.csv"
# NOTE(review): to_csv returns None when it is given a path, so `s` holds None.
s = final_df.to_csv(path_or_buf=out_path)

In [26]:
# Store both frames as LineaPy artifacts under the names used below.
lineapy.save(indexed_df, "indexed_df")
lineapy.save(final_df, "recommendations")

# "recommendations" is declared as depending on "indexed_df".
pipeline_dependencies = {"recommendations": {"indexed_df"}}

# Export both artifacts as a script pipeline into the "scripts" directory,
# with the CSV path and the text column exposed as input parameters.
lineapy.to_pipeline(
    artifacts=["recommendations", "indexed_df"],
    framework="SCRIPT",
    pipeline_name="anime_pipeline",
    dependencies=pipeline_dependencies,
    output_dir="scripts",
    input_parameters=["path_to_csv", "column"],
)

Generated module file: scripts/anime_pipeline_module.py                         
Generated requirements file: scripts/anime_pipeline_requirements.txt            
Generated Docker file: scripts/anime_pipeline_Dockerfile                        


PosixPath('scripts')