In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the selected-features anime dataset (v2) that the encoding steps below operate on
df = pd.read_parquet(path="anime_data_selected_featuresv2.parquet")

In [3]:
# Preview the first five rows to check that the columns loaded as expected
df.head(n=5)

Unnamed: 0,type,num_episodes,status,start_date,end_date,season,studios,genres,score,release_year,studio_rank_score,duration
0,TV,12,Finished Airing,2021-01-04,2021-03-22,Winter 2021,"[LIDENFILMS, Felix Film]","[Adventure, Fantasy, Girls Love, Mystery, Sci-Fi]",6.5,2021,72,77.0
1,TV,13,Finished Airing,2015-10-02,2015-12-25,Fall 2015,[Tomovies],"[Horror, Mystery, Supernatural, Suspense]",6.2,2015,4,84.0
2,TV,12,Finished Airing,2011-07-10,2011-09-25,Summer 2011,[AIC],"[Comedy, Romance, Ecchi, Harem, School]",6.39,2011,144,77.0
3,TV,26,Finished Airing,2004-10-05,2005-03-29,Fall 2004,[Studio Comet],"[Comedy, Romance, School, Shounen]",7.82,2004,35,175.0
4,TV,12,Finished Airing,2021-01-06,2021-03-24,Winter 2021,[White Fox],"[Drama, Fantasy, Suspense, Psychological]",8.46,2021,43,77.0


In [None]:
# df['genres'].unique() cannot be called directly: every row holds a list-like
# of genres, which is unhashable, so pandas fails with
# "TypeError: unhashable type". Converting each row to a tuple makes it
# hashable, but the result then counts distinct genre *combinations* (each
# row's whole list is treated as one value), not individual genres. To list the
# individual genres the column has to be exploded first.
df['genres'].map(tuple).nunique()

In [5]:
# Build a long-format frame with one row per (anime, genre) pair.
# DataFrame.explode returns a new frame, so the original `df` is left untouched.
df_genres = df.explode('genres').assign(
    # Trim leading/trailing whitespace from each genre label, just in case
    genres=lambda frame: frame['genres'].str.strip()
)

# Distinct genre labels across the whole dataset
unique_genres = df_genres['genres'].unique()

# Show the labels alphabetically, followed by how many there are
print(sorted(unique_genres))
print(f"\nTotal unique genres: {len(unique_genres)}")


['Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love', 'Cars', 'Comedy', 'Demons', 'Drama', 'Ecchi', 'Erotica', 'Fantasy', 'Game', 'Girls Love', 'Gourmet', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Suspense', 'Vampire', 'Work Life']

Total unique genres: 44


**one-hot encode categorical values in 'type'**

In [4]:
# One-hot encode the 'type' column: one indicator column per distinct value,
# named 'type_<value>'.
type_encoded = pd.get_dummies(df['type'], prefix='type')

# Append the indicator columns to the main DataFrame. Columns with the same
# names left over from a previous run of this cell are dropped first, so
# re-running the cell replaces them instead of creating duplicate column names.
df = pd.concat(
    [df.drop(columns=type_encoded.columns, errors='ignore'), type_encoded],
    axis=1,
)

**one-hot encode categorical values in 'status'**

In [5]:
# One-hot encode the 'status' column: one indicator column per distinct value,
# named 'status_<value>'.
status_encoded = pd.get_dummies(df['status'], prefix='status')

# Append the indicator columns to the main DataFrame. Columns with the same
# names left over from a previous run of this cell are dropped first, so
# re-running the cell replaces them instead of creating duplicate column names.
df = pd.concat(
    [df.drop(columns=status_encoded.columns, errors='ignore'), status_encoded],
    axis=1,
)


**one-hot encode categorical values in 'season' after stripping the year, because it's redundant (the year is already present in release_year)**

In [6]:
# 'season' values look like "<season name> <year>"; split on whitespace and keep
# only the first token (the season name), dropping the year.
season_tokens = df['season'].str.split()
df['season_clean'] = season_tokens.str[0]

In [7]:
# Build one indicator column per season name, named 'season_<name>'
season_encoded = pd.get_dummies(
    data=df['season_clean'],
    prefix='season',
)


In [8]:
# Add the one-hot encoded season columns to the main DataFrame. Columns with
# the same names left over from a previous run of this cell are dropped first,
# so re-running the cell replaces them instead of creating duplicate column names.
df = pd.concat(
    [df.drop(columns=season_encoded.columns, errors='ignore'), season_encoded],
    axis=1,
)


**'studios':**
✅ What we already have (studio_rank_score)
- It's numeric and ready to use.
- It represents the popularity or commonness of the studio(s) involved in each anime.
- If an anime has multiple studios, their frequencies are summed, so a higher score means the anime's studios appear more often across the whole dataset.
- This is a kind of frequency-based encoding (like count encoding).
So in this sense, we have already encoded studios in a meaningful way.
However, this score does not show the presence of any particular studio. One-hot encoding would capture that, but the number of unique studios is far too large: it would add hundreds of additional columns.
So we skip that for now.

**multi-hot encode 'genres' (one binary column per genre)**

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode 'genres': each row holds several genres, so every genre
# becomes its own 0/1 column named 'genre_<name>'.
mlb = MultiLabelBinarizer()

# fit_transform returns a plain array with no row labels. Reuse df's index so
# the concat below matches rows by label; with the default 0..n-1 index the
# rows would be misaligned (and NaN-padded) whenever df's index is not a clean
# RangeIndex.
genres_encoded = pd.DataFrame(
    mlb.fit_transform(df['genres']),
    columns=[f"genre_{g}" for g in mlb.classes_],
    index=df.index,
)

# Append the genre columns to the main DataFrame. Columns with the same names
# left over from a previous run of this cell are dropped first, so re-running
# the cell replaces them instead of creating duplicate column names.
df = pd.concat(
    [df.drop(columns=genres_encoded.columns, errors='ignore'), genres_encoded],
    axis=1,
)


**create new dataset with encoded columns**

In [10]:
# Persist the encoded dataset in two formats, without writing the index:
# Parquet for typed, compact storage ...
df.to_parquet(path="anime_data_selected_features_encoded_v3.parquet", index=False)

# ... and CSV for easy inspection in other tools.
df.to_csv(path_or_buf="anime_data_selected_features_encoded_v3.csv", index=False)


In [11]:
# Reload the encoded dataset from disk to check that the export can be read back
df1 = pd.read_parquet(path="anime_data_selected_features_encoded_v3.parquet")

In [14]:
# Sanity check: (rows, columns) of the frame reloaded from the encoded Parquet file
df1.shape

(10649, 70)