In [1]:
# imports
import pandas as pd
import numpy as np
import re

In [2]:
# Read the original anime dataset (a single CSV, addressed by a path relative
# to the notebook's working directory) into the `anime` DataFrame.

anime = pd.read_csv('../archive/anime-dataset-2023.csv')

In [3]:
# creating necessary functions

def extract_first_year(aired_string):
    """Return the first standalone four-digit number in ``aired_string``.

    Parameters
    ----------
    aired_string : str or missing value
        Text to search, e.g. ``'Apr 3, 1998 to Apr 24, 1999'``.

    Returns
    -------
    int
        The first run of exactly four digits delimited by word boundaries,
        or ``0`` when the value is missing, is exactly ``'UNKNOWN'``, or
        contains no such run.
    """
    is_missing = pd.isna(aired_string) or aired_string == 'UNKNOWN'
    # Only run the regex on usable text; a missing value short-circuits to 0.
    year_match = None if is_missing else re.search(r'\b(\d{4})\b', aired_string)
    return int(year_match.group(1)) if year_match else 0


def parse_duration_to_minutes(duration_str):
    """Convert a textual duration such as ``'1 hr 55 min'`` to whole minutes.

    Parameters
    ----------
    duration_str : str or missing value
        Duration text; matching is case-insensitive.

    Returns
    -------
    int or float
        ``hours * 60 + minutes`` as an int when that total is positive.
        ``np.nan`` when the value is missing, mentions ``'unknown'``,
        mentions ``'sec'`` anywhere (even alongside minutes or hours), or
        yields no positive hour/minute total.
    """
    if pd.isna(duration_str):
        return np.nan

    text = str(duration_str).lower().strip()
    # Unknown durations and anything expressed with seconds count as missing.
    if 'unknown' in text or 'sec' in text:
        return np.nan

    # (pattern, minutes per unit): the first match of each unit is summed.
    unit_patterns = ((r'(\d+)\s*hr', 60), (r'(\d+)\s*min', 1))
    minutes = 0
    for pattern, minutes_per_unit in unit_patterns:
        found = re.search(pattern, text)
        if found:
            minutes += int(found.group(1)) * minutes_per_unit

    return minutes if minutes > 0 else np.nan


def bin_series(series: pd.Series) -> pd.Series:
    """Discretise ``series`` into equal-width bins and return the bin codes.

    The number of bins is ``ceil(sqrt(len(series)))``; ``len`` counts every
    row, including missing ones.

    Parameters
    ----------
    series : pd.Series
        Numeric values to bin.

    Returns
    -------
    pd.Series
        Integer bin indicators produced by ``pd.cut(..., labels=False)``;
        missing inputs remain missing.
    """
    bin_count = int(np.ceil(np.sqrt(len(series))))
    return pd.cut(series, bin_count, labels=False, include_lowest=True)

In [4]:
# Select the attribute columns used as features and index them by anime_id.
anime_attribute_matrix = anime[['anime_id', 'Genres', 'Type', 'Episodes', 'Aired', 'Status',
    'Producers', 'Licensors', 'Studios', 'Duration', 'Rating']].set_index('anime_id').copy()


# Create 'first_aired' column
# extract_first_year always returns an int and uses 0 as its "unknown" sentinel.
anime_attribute_matrix['first_aired'] = anime_attribute_matrix['Aired'].map(extract_first_year).astype(int)

# Create 'watchtime' column: minutes per episode * number of episodes, then binned.
# Nullable Int64 keeps rows with an unknown duration or episode count as <NA>.
minutes_per_episode = anime_attribute_matrix['Duration'].map(parse_duration_to_minutes).astype('Int64')
episode_count = anime_attribute_matrix['Episodes'].replace('UNKNOWN', np.nan).astype('float')
anime_attribute_matrix['watchtime'] = bin_series((minutes_per_episode * episode_count).astype('Int64'))

# ONE HOT ENCODING
# A single call yields the same columns, in the same order, as encoding each
# column separately: the default prefix is the column name.
anime_attribute_matrix = pd.get_dummies(
    anime_attribute_matrix,
    columns=['Type', 'Status', 'Rating', 'first_aired', 'watchtime'],
    dtype=int,
)


# MULTI HOT ENCODING
# Whitespace around the commas is removed before splitting. Splitting a list such
# as "Action, Sci-Fi" on ',' alone would keep the leading space and create a
# separate ' Sci-Fi' column next to 'Sci-Fi' for the very same value.
multi_hot_prefixes = {
    'Genres': 'Genre_',
    'Producers': 'Producers_',
    'Licensors': 'Licensors_',
    'Studios': 'Studios_',
}
multi_hot_frames = [
    anime_attribute_matrix[column]
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.strip()
    .str.get_dummies(sep=',')
    .astype(int)
    .add_prefix(prefix)
    for column, prefix in multi_hot_prefixes.items()
]
anime_attribute_matrix = pd.concat([anime_attribute_matrix, *multi_hot_frames], axis=1)

# CLEANUP
anime_attribute_matrix = anime_attribute_matrix.drop(
    columns=['Genres', 'Producers', 'Licensors', 'Studios', 'Aired', 'Episodes', 'Duration']
)
# Drop every "unknown" indicator column. 'first_aired_0' is the indicator for the
# unknown-year sentinel and does not match the regex, so it is listed explicitly.
unknown_columns = anime_attribute_matrix.filter(regex='(?i)unknown$').columns.to_list()
anime_attribute_matrix = anime_attribute_matrix.drop(
    columns=unknown_columns + ['first_aired_0'], errors='ignore'
)

In [5]:
# Write the processed attribute matrix to CSV. to_csv is called without
# index=False, so the frame's index is written alongside the feature columns.
anime_attribute_matrix.to_csv('../data/Anime_processed.csv')