Clean the anime data from the MyAnimeList raw dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the anime table from the cleaned-data folder into `anime_df`.
anime_df = pd.read_csv("my_anime_data_cleaned/anime_data.csv", sep=",")

In [3]:
# Write the frame back to the same CSV path it was loaded from, storing the
# DataFrame index as an extra column named "anime_index".
# NOTE(review): the input file is overwritten, so loading and saving again would
# presumably add a second index column — confirm this cell is meant to run once.
anime_df.to_csv("my_anime_data_cleaned/anime_data.csv", sep=",", index=True, index_label="anime_index")

In [45]:
# Show which columns the frame currently has.
anime_df.columns

Index(['anime_id', 'type', 'source_type', 'num_episodes', 'status',
       'start_date', 'end_date', 'season', 'genres', 'score', 'score_count',
       'score_rank', 'popularity_rank', 'members_count', 'favorites_count',
       'watching_count', 'completed_count', 'on_hold_count', 'dropped_count',
       'plan_to_watch_count', 'total_count'],
      dtype='object')

In [25]:
# Remove the columns listed below from `anime_df`, in place.
# NOTE(review): `drop` raises KeyError when a listed column is already absent,
# so this cell fails if it is run a second time on the same frame.
unnecessary_columns = ["anime_url", "title", "synopsis", "main_pic", "studios", "pics", "clubs"]
anime_df.drop(columns=unnecessary_columns, inplace=True)

In [44]:
# Drop the "score_10_count" column from `anime_df`, in place.
anime_df.drop(columns=["score_10_count"], inplace=True)

In [6]:
# Collect every distinct genre name. Each "genres" cell holds several genres
# joined with "|". Cells with no value are skipped so that the missing-value
# marker does not end up registered as a genre called "nan".
all_genres = {
    genre
    for genres_str in anime_df["genres"].dropna()
    for genre in str(genres_str).split("|")
}

In [8]:
# Turn the genre set into a list. Sorting keeps the order (and therefore the
# order of the g_* columns built from it) identical from one run to the next,
# whereas plain set iteration order is arbitrary.
all_genres = sorted(all_genres)

In [None]:
# Display the collected genre names.
all_genres

In [13]:
# Distinct values found in the "status" column (`statuts` = statuses).
statuts = anime_df["status"].unique()

In [14]:
# Display the distinct status values.
statuts

array(['Finished Airing', 'Currently Airing', 'Not yet aired'],
      dtype=object)

In [21]:
# Season names recognised when encoding the "season" column.
seasons = ["Winter", "Summer", "Fall", "Spring"]

In [28]:
# Distinct values found in the "type" column.
types = anime_df["type"].unique()

array(['TV', 'ONA', 'OVA', 'Movie', 'Special'], dtype=object)

In [30]:
# Distinct values found in the "source_type" column.
source_types = anime_df["source_type"].unique()

In [63]:
# Categorical columns to one-hot encode, each mapped to its known categories.
one_hot_params = {
    "status" : statuts,
    "season" : seasons,
    "type" : types,
    "source_type" : source_types,
    "genres" : all_genres,
}

# Replace every category list with an all-zero template dict keyed by the
# output column name. Genre columns get a "g_" prefix; the other columns use
# the category value itself as the name.
for key, categories in list(one_hot_params.items()) :
    is_genre = key == "genres"
    one_hot_params[key] = {
        ("g_" + str(category) if is_genre else category) : 0
        for category in categories
    }

In [42]:
# Preview the id and score of the first ten rows that have a score.
anime_df.loc[anime_df["score"].notna(), ["anime_id", "score"]].head(10)

Unnamed: 0,anime_id,score
686,41392,6.5
687,30524,6.2
688,10611,6.39
689,24,7.82
690,42203,8.46
691,23273,8.62
692,36425,7.24
693,1892,7.25
694,10346,6.78
695,6067,6.76


In [46]:
# Categorical columns that the one-hot encoding is meant to replace.
columns_to_remove = ["type", "source_type", "status", "season", "genres"]

# Snapshot of the current column names as a plain list.
# NOTE(review): the list still contains the raw categorical columns; nothing
# visible in this notebook removes `columns_to_remove` from it.
new_colums = anime_df.columns.tolist()
new_colums

['anime_id',
 'type',
 'source_type',
 'num_episodes',
 'status',
 'start_date',
 'end_date',
 'season',
 'genres',
 'score',
 'score_count',
 'score_rank',
 'popularity_rank',
 'members_count',
 'favorites_count',
 'watching_count',
 'completed_count',
 'on_hold_count',
 'dropped_count',
 'plan_to_watch_count',
 'total_count']

In [89]:
def handle_genres(genres_str:str) -> dict :
    """Build the one-hot genre flags for one anime.

    Args:
        genres_str: Genre names joined with "|" (e.g. "Action|Comedy"). Any
            non-string value, such as the float NaN pandas uses for a missing
            cell, is treated as "no genre".

    Returns:
        A fresh copy of ``one_hot_params["genres"]`` in which ``"g_<genre>"``
        is set to 1 for every listed genre that is a known key. Unknown genres
        are ignored and the shared template is never modified.
    """
    values = dict(one_hot_params["genres"])

    # A missing cell is not a string and has no .split(); return the untouched
    # all-zero template instead of raising AttributeError.
    if not isinstance(genres_str, str) :
        return values

    for genre in genres_str.split("|") :
        column = "g_" + genre

        if column in values :
            values[column] = 1

    return values

def handle_saison(saison_str:str) -> dict :
    """Build the one-hot season flags for one anime.

    Args:
        saison_str: Season label. It is converted with ``str()`` and only the
            text before the first space is looked up, so a value shaped like
            "Spring 2022" would select "Spring".

    Returns:
        A fresh copy of ``one_hot_params["season"]`` with the matching season
        set to 1, or the unchanged copy when the first word is not a known key.
    """
    first_word, _, _ = str(saison_str).partition(" ")
    flags = dict(one_hot_params["season"])

    if first_word in flags :
        flags[first_word] = 1

    return flags

def handle_one_hot(col_name, valeur) -> dict:
    """Build the one-hot flags of a single-valued categorical column.

    Args:
        col_name: Key of ``one_hot_params`` naming the column being encoded.
        valeur: The cell value to encode.

    Returns:
        A fresh copy of ``one_hot_params[col_name]`` with ``valeur`` set to 1
        when it is one of the known keys; otherwise the unchanged copy.
    """
    template = one_hot_params[col_name]
    encoded = dict(template)

    if valeur in encoded :
        encoded[valeur] = 1

    return encoded

In [90]:
def parse_row(row) -> dict :
    """Flatten one row into a dict in which categorical columns are one-hot encoded.

    Walks the columns of the global ``anime_df`` in order. A column that is a
    key of ``one_hot_params`` is replaced by the flags returned by its handler
    (``handle_genres`` for "genres", ``handle_saison`` for "season",
    ``handle_one_hot`` for anything else); every other column is copied as is.

    Args:
        row: Mapping-like row indexable by column name.

    Returns:
        Dict of output column name to value, in column order.
    """
    parsed = {}
    for col in anime_df.columns :
        cell = row[col]

        # Plain column: keep the value untouched.
        if col not in one_hot_params :
            parsed[col] = cell
            continue

        if col == "genres" :
            flags = handle_genres(cell)
        elif col == "season" :
            flags = handle_saison(cell)
        else :
            flags = handle_one_hot(col, cell)

        parsed.update(flags)

    return parsed

In [91]:
import csv

# NOTE(review): this is the same path the notebook loads `anime_df` from, so
# the input file is replaced by the encoded one.
file_path = "my_anime_data_cleaned/anime_data.csv"

# Header of the encoded file, in the order parse_row emits its keys: every
# column that is one-hot encoded is replaced by its one-hot column names, all
# other columns are kept. DictWriter rejects rows whose keys are not in the
# header, so the header must list the encoded names rather than the raw
# columns. dict.fromkeys removes duplicates while preserving order.
fieldnames = list(dict.fromkeys(
    name
    for col in anime_df.columns
    for name in (one_hot_params[col] if col in one_hot_params else [col])
))

# newline="" is what the csv module requires of the file object; without it,
# platforms that translate line endings write an extra "\r" per row.
with open(file_path, "w", encoding="utf-8", newline="") as f :
    writer = csv.DictWriter(f, fieldnames=fieldnames, dialect="excel")
    writer.writeheader()

    for _, row in anime_df.iterrows() :
        writer.writerow(parse_row(row))

In [86]:
# Inspect the row whose anime_id is 47164.
anime_df.loc[anime_df["anime_id"].eq(47164)]

Unnamed: 0,anime_id,type,source_type,num_episodes,status,start_date,end_date,season,genres,score,...,score_rank,popularity_rank,members_count,favorites_count,watching_count,completed_count,on_hold_count,dropped_count,plan_to_watch_count,total_count
12,47164,TV,Light novel,,Not yet aired,2022-03-20 00:00:00,,,Action|Adventure|Comedy|Fantasy|Romance,,...,,1710,99913,383,0,0,0,0,99909,99909
