<h3 style="color: #7aa2f7; font-weight: bold;">Import Libraries</h3>

In [1]:
from library import pd_read_csv, pd_dataframe
from library import CsvSniffer, pd_series


from core import rr_unique, rr_display, rr_str_series, rr_clean_alt_list, rr_to_series, rr_remove_none

<h3 style="color: #7aa2f7; font-weight: bold;">About Dataset</h3>



In [2]:
# Detect the delimiter of the CSV file by sniffing its header row.

# Open the file in a context manager so the handle is always closed, and read
# the complete first line: `readline(1)` stops after a single character, which
# would leave the sniffer only one character of sample text to work with.
with open("dataset/mal_top2000_anime.csv", 'r') as dataset_mal_top2000_anime:
    dataset_mal_top2000_anime_first_row = dataset_mal_top2000_anime.readline()

# NOTE(review): CsvSniffer comes from the project's `library` module; it is
# presumably a thin wrapper around `csv.Sniffer` - confirm.
sniffer = CsvSniffer()
dataset_first_row = sniffer.sniff(dataset_mal_top2000_anime_first_row)
dataset_mal_top2000_anime_delimiter = dataset_first_row.delimiter
print(f"`dataset/mal_top2000_anime.csv` delimiter: {dataset_mal_top2000_anime_delimiter}")

`dataset/mal_top2000_anime.csv` delimiter: ,


<div style="
  color: #9aa5ce;
  font-family: Fira Code;
  font-weight: 500;
">
<ul>
<li>Dataset Info
  <ul>
  <li>A column value is <code style="color: #89ddff">None</code> when no value was assigned</li>
  <li>Structured Data</li>
  <li>File type: <code style="color: #89ddff">*.csv</code></li>
  <li>Delimiter: <code style="color: #89ddff">,</code></li>
  </ul>
</li>
</ul>
</div>

<h3 style="color: #7aa2f7; font-weight: bold;">Dataset</h3>

In [3]:
# Load the CSV using the delimiter detected in the previous cell.
# The long variable name carries the dataset credit.
brunobacelardc_myanimelist_top2000_anime = pd_read_csv(
    "dataset/mal_top2000_anime.csv",
    delimiter=dataset_mal_top2000_anime_delimiter,
)

# Short working alias for the credited dataset (same object, not a copy).
anime_dataset = brunobacelardc_myanimelist_top2000_anime

<h2 style="color: #7aa2f7; font-weight: bold;" align="center"><u>Preprocessing Dataset</u></h2>

<h3 style="color: #7aa2f7; font-weight: bold;">Data Cleansing</h3>


In [4]:
# Preview the first five rows of the freshly loaded dataset.
rr_display(anime_dataset.head(n=5))

Unnamed: 0.1,Unnamed: 0,Name,Type,Score,Score Rank,Popularity Rank,Air Date,Studio,Num. of episodes,Genres,Theme(s),Demographic
0,0,Fullmetal Alchemist: Brotherhood,TV,9.14,1,3,"Apr 5, 2009 to Jul 4, 2010",['Bones'],64,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],Shounen
1,1,Spy x Family,TV,9.09,2,350,"Apr 9, 2022 to ?","['Wit Studio', ' CloverWorks']",12,"['Action', 'Comedy']",['Childcare'],Shounen
2,2,Shingeki no Kyojin Season 3 Part 2,TV,9.08,3,32,"Apr 29, 2019 to Jul 1, 2019",['Wit Studio'],10,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",Shounen
3,3,Steins;Gate,TV,9.08,4,13,"Apr 6, 2011 to Sep 14, 2011",['White Fox'],24,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",
4,4,Gintama°,TV,9.08,5,335,"Apr 8, 2015 to Mar 30, 2016",['Bandai Namco Pictures'],51,"['Action', 'Comedy', 'Sci-Fi']","['Gag Humor', 'Historical', 'Parody', 'Samurai']",Shounen


<h4 style="color: #7aa2f7; font-weight: bold;">Data Extraction</h4>

<div style="
  color: #9aa5ce;
  font-family: Fira Code;
  font-weight: 500;
">

<p>Extracting only the required data.</p>


<p style="font-size: 20px;">Required Dataset Column</p>

- <code style="color: #7aa2f7;">Type</code>: The anime's media type (e.g. a television anime, a movie, a music video, etc.).
- <code style="color: #7aa2f7;">Score</code>: The anime's score, calculated using MAL's weighted scoring.
- <code style="color: #7aa2f7;">Score Rank</code>: The anime's rank, based on its score.
- <code style="color: #7aa2f7;">Studio</code>: The studio(s) that made the anime.
- <code style="color: #7aa2f7;">Num. of episodes</code>: How many episodes the anime has.
- <code style="color: #7aa2f7;">Genres</code>: The anime's genres, formatted as a list. An entry with no genres registered in the website will have a single list with the string 'None' as its value.
- <code style="color: #7aa2f7;">Popularity Rank</code> : The anime's rank, based on its popularity.

</div>

In [5]:
# Columns that the rest of the notebook works with.
required_dataset_column = [
    "Name",  # needed only for the CSV file that is created later
    "Genres", "Studio", "Type",
    "Score", "Score Rank",
    "Num. of episodes", "Popularity Rank",
]

# Narrow the working frame down to the required columns and preview it.
anime_dataset = anime_dataset[required_dataset_column]
rr_display(anime_dataset.head(n=5))


Unnamed: 0,Name,Genres,Studio,Type,Score,Score Rank,Num. of episodes,Popularity Rank
0,Fullmetal Alchemist: Brotherhood,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Bones'],TV,9.14,1,64,3
1,Spy x Family,"['Action', 'Comedy']","['Wit Studio', ' CloverWorks']",TV,9.09,2,12,350
2,Shingeki no Kyojin Season 3 Part 2,"['Action', 'Drama']",['Wit Studio'],TV,9.08,3,10,32
3,Steins;Gate,"['Drama', 'Sci-Fi', 'Suspense']",['White Fox'],TV,9.08,4,24,13
4,Gintama°,"['Action', 'Comedy', 'Sci-Fi']",['Bandai Namco Pictures'],TV,9.08,5,51,335


In [6]:
# Collect the distinct values found in anime_dataset["Studio"].
# NOTE(review): the meaning of the `True` and "," arguments is defined by
# core.rr_unique, which is not visible here - confirm before changing them.

anime_dataset_studio = anime_dataset["Studio"].tolist()
anime_dataset_studio_unique = rr_unique(anime_dataset_studio, True, ",")

# Report how many distinct studios were found, then list them.
print(f"Number of Unique Studio: {len(anime_dataset_studio_unique)}\n\n| Unique Studio(s) |\n--------------------")
print(anime_dataset_studio_unique)

Number of Unique Studio: 224

| Unique Studio(s) |
--------------------
['bones', 'wit studio', 'cloverworks', 'white fox', 'bandai namco pictures', 'sunrise', 'madhouse', 'tms entertainment', 'k-factory', 'kitty film mitaka studio', 'a-1 pictures', 'shaft', 'kyoto animation', 'mappa', 'ufotable', 'comix wave films', 'studio lan', 'studio ghibli', 'production i.g', 'pierrot plus', 'studio signpost', 'studio bind', 'olm', 'p.i.c.s.', 'studio deen', 'artland', 'tokyo movie shinsha', 'b.cmay pictures', 'studio pierrot', 'none', 'kinema citrus', 'khara', '"brains base"', 'gainax', 'shuka', 'toei animation', 'tatsunoko production', 'studio chizu', 'david production', 'studio voln', 'j.c.staff', 'c-station', 'lerche', 'shin-ei animation', 'manglobe', 'haoliners animation league', 'hal film maker', 'geno studio', 'egg firm', 'zero-g', 'platinum vision', 'synergysp', 'p.a. works', 'orange', 'nippon animation', 'silver link.', '8bit', 'feel.', 'lidenfilms', 'hololive production', 'topcraft', 's

In [7]:
# Collect the distinct values found in anime_dataset["Genres"].
# NOTE(review): the meaning of the `True` and "," arguments is defined by
# core.rr_unique, which is not visible here - confirm before changing them.

anime_dataset_genres = anime_dataset["Genres"].tolist()
anime_dataset_genres_unique = rr_unique(anime_dataset_genres, True, ",")

# Report how many distinct genres were found, then list them.
print(f"Number of Unique Genres: {len(anime_dataset_genres_unique)}\n\n| Unique Genres |\n--------------------")
print(anime_dataset_genres_unique)

Number of Unique Genres: 20

| Unique Genres |
--------------------
['action', 'adventure', 'drama', 'fantasy', 'comedy', 'sci-fi', 'suspense', 'romance', 'slice of life', 'supernatural', 'mystery', 'award winning', 'sports', 'ecchi', 'none', 'avant garde', 'horror', 'boys love', 'gourmet', 'girls love']


In [8]:
# Collect the distinct values found in anime_dataset["Type"].
# NOTE(review): the meaning of the `True` and "," arguments is defined by
# core.rr_unique, which is not visible here - confirm before changing them.

anime_dataset_types = anime_dataset["Type"].tolist()
anime_dataset_types_unique = rr_unique(anime_dataset_types, True, ",")

# Report how many distinct types were found.
print(f"Number of Unique Types: {len(anime_dataset_types_unique)}\n\n| Unique Types |\n--------------------")
# The bare last expression is rendered by the notebook as the cell's output.
anime_dataset_types_unique

Number of Unique Types: 6

| Unique Types |
--------------------


['tv', 'movie', 'ova', 'ona', 'special', 'music']

In [9]:

# Show the first five rows of the column-reduced dataset again for reference.
rr_display(anime_dataset.head(n=5))

Unnamed: 0,Name,Genres,Studio,Type,Score,Score Rank,Num. of episodes,Popularity Rank
0,Fullmetal Alchemist: Brotherhood,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Bones'],TV,9.14,1,64,3
1,Spy x Family,"['Action', 'Comedy']","['Wit Studio', ' CloverWorks']",TV,9.09,2,12,350
2,Shingeki no Kyojin Season 3 Part 2,"['Action', 'Drama']",['Wit Studio'],TV,9.08,3,10,32
3,Steins;Gate,"['Drama', 'Sci-Fi', 'Suspense']",['White Fox'],TV,9.08,4,24,13
4,Gintama°,"['Action', 'Comedy', 'Sci-Fi']",['Bandai Namco Pictures'],TV,9.08,5,51,335


In [10]:
# TODO(@Adam-Al-Rahaman): Remove this cell and replace it with per-column
# assignments of the form
#     anime_dataset["column_name"] = anime_dataset["column_name"].apply(eval)
# once the `SettingWithCopyWarning` has been overcome.
# @ref: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# Normalized text columns: surrounding white space stripped and lower-cased.
genres_normalized = rr_str_series(anime_dataset["Genres"], strip_white_space=True, to_lower=True)
studio_normalized = rr_str_series(anime_dataset["Studio"], strip_white_space=True, to_lower=True)
type_normalized = anime_dataset["Type"].str.lower()

# Assemble the columns in their final order; untouched columns pass through.
updated_required_data = {
    # "Name" is required because the updated dataset is exported to a new
    # CSV file, `dataset/rr_anime_info.csv`.
    "Name": anime_dataset["Name"],
    "Genres": genres_normalized,
    "Studio": studio_normalized,
    "Type": type_normalized,
    "Score": anime_dataset["Score"],
    "Score Rank": anime_dataset["Score Rank"],
    "Num. of episodes": anime_dataset["Num. of episodes"],
    "Popularity Rank": anime_dataset["Popularity Rank"],
}

updated_required_dataset = pd_dataframe(updated_required_data)
rr_display(updated_required_dataset.head(n=5))


Unnamed: 0,Name,Genres,Studio,Type,Score,Score Rank,Num. of episodes,Popularity Rank
0,Fullmetal Alchemist: Brotherhood,"['action', 'adventure', 'drama', 'fantasy']",['bones'],tv,9.14,1,64,3
1,Spy x Family,"['action', 'comedy']","['wit studio', 'cloverworks']",tv,9.09,2,12,350
2,Shingeki no Kyojin Season 3 Part 2,"['action', 'drama']",['wit studio'],tv,9.08,3,10,32
3,Steins;Gate,"['drama', 'sci-fi', 'suspense']",['white fox'],tv,9.08,4,24,13
4,Gintama°,"['action', 'comedy', 'sci-fi']",['bandai namco pictures'],tv,9.08,5,51,335


In [11]:
# Boolean mask marking the rows whose Genres value contains 'none',
# followed by a display of exactly those rows.
dataset_genres_none_val = updated_required_dataset["Genres"].map(
    lambda genres: 'none' in genres
)
rr_display(updated_required_dataset.loc[dataset_genres_none_val])

Unnamed: 0,Name,Genres,Studio,Type,Score,Score Rank,Num. of episodes,Popularity Rank
81,Shoujo☆Kageki Revue Starlight Movie,['none'],['kinema citrus'],movie,8.58,82,1,5749
151,Yoru Ni Kakeru,['none'],['none'],music,8.42,152,1,5163
180,Kizumonogatari I: Tekketsu-hen,['none'],['shaft'],movie,8.38,181,1,384
197,Paripi Koumei,['none'],['p.a. works'],tv,8.35,198,12,2074
224,Kawaki wo Ameku,['none'],['none'],music,8.32,225,1,3831
270,Initial D Final Stage,['none'],['synergysp'],tv,8.26,271,4,1554
352,Girls & Panzer: Saishuushou Part 3,['none'],['actas'],movie,8.18,353,1,4494
479,Girls & Panzer: Saishuushou Part 2,['none'],['actas'],movie,8.07,480,1,3625
497,Hololive Alternative,['none'],['hololive production'],ona,8.05,498,1,6423
509,BanG Dream! 3rd Season,['none'],['sanzigen'],tv,8.04,510,13,3476


<h3 style="color: #7aa2f7; font-weight: bold;">Data Enrichment</h3>

<div style="
  color: #9aa5ce;
  font-family: Fira Code;
  font-weight: 500;
">
<ul>
<li>Remove the rows that contain a null or <code>none</code> value in the <code>Genres</code> or <code>Studio</code> column, as such values cannot be replaced with other values.</li>
<li>Convert the values of the <code>Genres</code> and <code>Studio</code> columns to lower case.</li>
</ul>

</div>

In [12]:
# Boolean mask marking the rows whose Studio value contains 'none',
# followed by a display of exactly those rows.
dataset_studio_none_val = updated_required_dataset["Studio"].map(
    lambda studios: 'none' in studios
)
rr_display(updated_required_dataset.loc[dataset_studio_none_val])

Unnamed: 0,Name,Genres,Studio,Type,Score,Score Rank,Num. of episodes,Popularity Rank
51,Kingdom 4th Season,['action'],['none'],tv,8.69,52,0,4180
141,Wu Liuqi Zhi Xuanwu Guo Pian,"['action', 'adventure', 'comedy', 'drama', 'mystery']",['none'],ona,8.44,142,10,3295
151,Yoru Ni Kakeru,['none'],['none'],music,8.42,152,1,5163
224,Kawaki wo Ameku,['none'],['none'],music,8.32,225,1,3831
268,Wu Liuqi Zhi Zui Qiang Fa Xing Shi,"['action', 'comedy', 'drama', 'mystery']",['none'],ona,8.26,269,10,2743
292,Luo Xiao Hei Zhan Ji (Movie),"['adventure', 'drama', 'fantasy']",['none'],movie,8.23,293,1,4663
431,Non Non Biyori Nonstop OVA,"['comedy', 'slice of life']",['none'],ova,8.11,432,1,5570
558,Mushoku Tensei: Isekai Ittara Honki Dasu Special,"['drama', 'fantasy', 'ecchi']",['none'],special,8.01,559,1,1921
591,Wan Sheng Jie 2,"['comedy', 'slice of life', 'supernatural']",['none'],ona,7.98,592,12,6420
611,Wan Sheng Jie,"['comedy', 'slice of life', 'supernatural']",['none'],ona,7.97,612,12,4756


In [13]:
# Index labels of the rows flagged by each 'none' mask.
studio_none_index = updated_required_dataset[dataset_studio_none_val].index
genres_none_index = updated_required_dataset[dataset_genres_none_val].index

# Concatenate both label sets and de-duplicate them; a row flagged by both
# masks appears only once in the resulting list.
none_val_studio_genres = list(set(studio_none_index.append(genres_none_index)))
print(none_val_studio_genres)

[1546, 1041, 1044, 1559, 1565, 552, 1065, 558, 51, 1591, 1607, 1608, 1097, 591, 81, 1107, 596, 1621, 1627, 611, 1635, 1638, 1126, 621, 623, 1139, 1148, 1661, 1662, 1669, 141, 1677, 1683, 151, 1691, 1183, 677, 1195, 180, 1216, 197, 1734, 1224, 1738, 1759, 224, 1248, 748, 756, 763, 1799, 1290, 268, 270, 1295, 1810, 788, 1309, 292, 1329, 1844, 823, 831, 1862, 847, 1875, 1886, 352, 1380, 1383, 1385, 1387, 1403, 1404, 1927, 903, 907, 1931, 912, 431, 1970, 1460, 954, 1487, 479, 1513, 497, 1522, 1523, 1532, 509]


In [14]:
# Drop every row flagged as having a 'none' Studio or Genres value.
# errors="ignore" keeps this cell re-runnable: the result is assigned back to
# the same name, so a second run would otherwise raise KeyError for labels
# that were already removed by the first run.
updated_required_dataset = updated_required_dataset.drop(
    none_val_studio_genres,
    errors="ignore",
)
rr_display(updated_required_dataset.head())

Unnamed: 0,Name,Genres,Studio,Type,Score,Score Rank,Num. of episodes,Popularity Rank
0,Fullmetal Alchemist: Brotherhood,"['action', 'adventure', 'drama', 'fantasy']",['bones'],tv,9.14,1,64,3
1,Spy x Family,"['action', 'comedy']","['wit studio', 'cloverworks']",tv,9.09,2,12,350
2,Shingeki no Kyojin Season 3 Part 2,"['action', 'drama']",['wit studio'],tv,9.08,3,10,32
3,Steins;Gate,"['drama', 'sci-fi', 'suspense']",['white fox'],tv,9.08,4,24,13
4,Gintama°,"['action', 'comedy', 'sci-fi']",['bandai namco pictures'],tv,9.08,5,51,335


In [15]:
# Count the missing values remaining in each column after the clean-up.
updated_required_dataset.isna().sum()

Name                0
Genres              0
Studio              0
Type                0
Score               0
Score Rank          0
Num. of episodes    0
Popularity Rank     0
dtype: int64

<h3 style="color: #7aa2f7; font-weight: bold;">Data Export</h3>


<div style="
  color: #9aa5ce;
  font-family: Fira Code;
  font-weight: 500;
">
Export the updated dataset to a new file, <code style="color: #7aa2f7;">dataset/rr_anime_info.csv</code>, for further use in data validation.
</div>

In [16]:
# Write the cleaned dataset to a new CSV file; index=False leaves the
# row-index labels out of the file.
updated_required_dataset.to_csv(
    "dataset/rr_anime_info.csv",
    index=False,
)