## DATA CLEANING PROCESS 

### Import Libraries

In [1]:
using CSV 
using Plots
backend(:plotlyjs)
using Impute
using MLBase
using PlotlyJS
using Statistics
using StatsPlots
using DataFrames
using MLDataUtils
using ScikitLearn
using DecisionTree

### Import Dataset

In [2]:
# Location of the dataset. Defaults to the original author's local path; set the
# ANIME_CSV environment variable to run this notebook on another machine.
anime_csv_path = get(
    ENV, "ANIME_CSV", "C:/Users/Akunna Anyamkpa/Downloads/Anime/top250_anime.csv"
)

# Fail early with an actionable message when the file is not where we expect it.
isfile(anime_csv_path) || throw(
    ArgumentError(
        "Dataset not found at \"$anime_csv_path\"; set ENV[\"ANIME_CSV\"] to the CSV path.",
    ),
)

# Parse the CSV into a DataFrame; column types are inferred by CSV.jl.
anime = CSV.read(anime_csv_path, DataFrame)

Row,Rank,Title,Popularity,Genre,Studio,Type,Episodes,Duration,Start_date,End_date,Members,Score
Unnamed: 0_level_1,Int64,String,Int64,String?,String?,String7,Int64?,Int64,String7,String7?,String15?,Float64
1,1,Shingeki no Kyojin: The Final Season - Kanketsu-hen,609,"Action, Drama, Suspense",MAPPA,Special,2,61,Mar-23,2023,347875,9.17
2,2,Fullmetal Alchemist: Brotherhood,3,"Action, Adventure, Drama, Fantasy",Bones,TV,64,24,Apr-09,Jul-10,3109951,9.11
3,3,Bleach: Sennen Kessen-hen,508,"Action, Adventure, Fantasy",Pierrot,TV,13,24,Oct-22,Dec-22,404036,9.09
4,4,Steins;Gate,13,"Drama, Sci-Fi, Suspense",White Fox,TV,24,24,Apr-11,Sep-11,2393767,9.08
5,5,Gintama°,335,"Action, Comedy, Sci-Fi",Bandai Namco Pictures,TV,51,24,Apr-15,Mar-16,581994,9.07
6,6,Kaguya-sama wa Kokurasetai: Ultra Romantic,221,"Comedy, Romance",A-1 Pictures,TV,13,23,Apr-22,Jun-22,772019,9.06
7,7,Shingeki no Kyojin Season 3 Part 2,26,"Action, Drama",Wit Studio,TV,10,23,Apr-19,Jul-19,2045115,9.06
8,8,Gintama: The Final,1581,"Action, Comedy, Drama, Sci-Fi",Bandai Namco Pictures,Movie,1,104,Jan-21,Jan-21,126411,9.05
9,9,Gintama',385,"Action, Comedy, Sci-Fi",Sunrise,TV,51,24,Apr-11,Mar-12,513321,9.04
10,10,Hunter x Hunter (2011),10,"Action, Adventure, Fantasy",Madhouse,TV,148,23,Oct-11,Sep-14,2596435,9.04


### Check total number of rows and columns

In [3]:
# Dimensions of the raw dataset as a (rows, columns) tuple
(nrow(anime), ncol(anime))

(250, 12)

### Get a list of all the column names

In [4]:
# Column names as strings, in the order they appear in the DataFrame
names(anime, :)

12-element Vector{String}:
 "Rank"
 "Title"
 "Popularity"
 "Genre"
 "Studio"
 "Type"
 "Episodes"
 "Duration"
 "Start_date"
 "End_date"
 "Members"
 "Score"

### Profile the data (summary statistics per column)

In [5]:
# Per-column summary; the statistics are listed explicitly (these are the defaults)
describe(anime, :mean, :min, :median, :max, :nmissing, :eltype)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,Rank,125.5,1,125.5,250,0,Int64
2,Title,,3-gatsu no Lion,,Zoku Owarimonogatari,0,String
3,Popularity,855.46,1,490.0,5794,0,Int64
4,Genre,,"Action, Award Winning, Drama, Sci-Fi",,"Supernatural, Suspense",1,"Union{Missing, String}"
5,Studio,,8bit,,ufotable,2,"Union{Missing, String}"
6,Type,,Movie,,TV,0,String7
7,Episodes,17.7912,1,12.0,201,1,"Union{Missing, Int64}"
8,Duration,41.72,3,24.0,161,0,Int64
9,Start_date,,#VALUE!,,Sep-22,0,String7
10,End_date,,#VALUE!,,Sep-99,2,"Union{Missing, String7}"


### Sum the total number of missing values

In [6]:
# Tally the `missing` entries of each column (one count per column, in column order)
n_missing = map(col -> sum(ismissing, col), eachcol(anime))

# Report the grand total across all columns
println("Total number of missing values: $(sum(n_missing))")

Total number of missing values: 8


### Drop rows with missing values

In [7]:
# Remove, in place, every row that has a `missing` in any column, then show the
# first remaining row as a quick sanity check.
first(dropmissing!(anime))

Row,Rank,Title,Popularity,Genre,Studio,Type,Episodes,Duration,Start_date,End_date,Members,Score
Unnamed: 0_level_1,Int64,String,Int64,String,String,String7,Int64,Int64,String7,String7,String15,Float64
1,1,Shingeki no Kyojin: The Final Season - Kanketsu-hen,609,"Action, Drama, Suspense",MAPPA,Special,2,61,Mar-23,2023,347875,9.17


In [8]:
# Dimensions after dropping incomplete rows, as a (rows, columns) tuple
(size(anime, 1), size(anime, 2))

(244, 12)

### Recount the total number of missing values

In [9]:
# Recount the `missing` entries column by column to confirm the cleanup worked
n_missing = [sum(ismissing, column) for column in eachcol(anime)]

# Report the grand total across all columns
println(string("Total number of missing values: ", sum(n_missing)))

Total number of missing values: 0


## FINDING TRENDS AND ANSWERING QUESTIONS

### What is the most popular anime in the market?

In [17]:
# `Popularity` is a rank, not a count: its minimum is 1 and the titles with the most
# members carry the smallest values (e.g. Popularity 3 for ~3.1M members). The most
# popular anime are therefore the rows with the LOWEST Popularity, so sort ascending.
# `sort` (not `sort!`) returns a sorted copy and leaves `anime` itself untouched.
chart_1 = sort(anime, :Popularity)

# Keep the 5 most popular titles
top_5_popular = first(chart_1, 5)

# One scatter trace per question: which titles, and which genres, are most popular.
# Symbol arguments refer to columns of the DataFrame passed as first argument.
one = PlotlyJS.plot(PlotlyJS.scatter(top_5_popular, x=:Popularity, y=:Title))
two = PlotlyJS.plot(PlotlyJS.scatter(top_5_popular, x=:Popularity, y=:Genre))

# Stack the two plots vertically and give the combined figure a title
popularity_charts = [one; two]
relayout!(popularity_charts, title_text="Title and Genre by Popularity")
popularity_charts

### Which anime are the most time-consuming, by episode count and by duration?

In [18]:
# Build two independently sorted copies. `sort!` would reorder `anime` in place and
# return that same object, so both names would alias one DataFrame holding only the
# last ordering (Duration) and the "episodes" chart would silently show the
# duration ranking instead.
chart_2 = sort(anime, :Episodes, rev=true)
chart_3 = sort(anime, :Duration, rev=true)

# Keep the 5 longest-running titles under each criterion
top_5_episodes = first(chart_2, 5)
top_5_duration = first(chart_3, 5)

# Horizontal bars: title on the y axis, episode count / duration on the x axis.
# Symbol arguments refer to columns of the DataFrame passed as first argument.
three = PlotlyJS.plot(
    PlotlyJS.bar(top_5_episodes, x=:Episodes, y=:Title, orientation="h")
)
four = PlotlyJS.plot(
    PlotlyJS.bar(top_5_duration, x=:Duration, y=:Title, orientation="h")
)

# Stack the two plots vertically and give the combined figure a title
duration_episodes_charts = [three; four]
relayout!(duration_episodes_charts, title_text="Play Time by Episodes and Duration")
duration_episodes_charts

### Months with the highest viewership

In [19]:
# `Members` is parsed from the CSV as a string column (String15), so sorting it
# directly compares text ("99999" > "3109951"). Sort by its integer value instead.
# NOTE(review): assumes non-numeric entries, if any, only differ by surrounding
# whitespace or "," thousands separators; anything else is ranked last (-1).
member_count(m::Integer) = m
member_count(m::AbstractString) =
    something(tryparse(Int, replace(strip(m), "," => "")), -1)

# Sorted copy, largest membership first; `anime` itself is left untouched
chart_4 = sort(anime, order(:Members, by=member_count, rev=true))

# Keep the 7 titles with the most members (the variable name is historical)
top_5_members = first(chart_4, 7)

# Bar chart of member counts against each title's start month
five = PlotlyJS.plot(PlotlyJS.bar(top_5_members, x=:Members, y=:Start_date))

### Anime with the best Rating Scores

In [21]:
# Order the dataset from the highest to the lowest score (reorders `anime` in place)
chart_5 = sort!(anime, :Score; rev=true)

# Keep the 5 best-rated titles
top_5_score = first(chart_5, 5)

# Scatter trace of score (x) against title (y); symbols name DataFrame columns
six = PlotlyJS.plot(
    PlotlyJS.scatter(top_5_score; x=:Score, y=:Title, orientation="h")
)

# Render the figure
display(six)

### Members by Title, Genre, Studio and Type

In [31]:
# `Members` is parsed from the CSV as a string column (String15), so sorting it
# directly compares text ("99999" > "3109951"). Sort by its integer value instead.
# NOTE(review): assumes non-numeric entries, if any, only differ by surrounding
# whitespace or "," thousands separators; anything else is ranked last (-1).
member_count(m::Integer) = m
member_count(m::AbstractString) =
    something(tryparse(Int, replace(strip(m), "," => "")), -1)

# Sorted copy, largest membership first; `anime` itself is left untouched
chart_6 = sort(anime, order(:Members, by=member_count, rev=true))

# Top 5 rows by membership (same rows, plotted against two different columns)
top_members_title = first(chart_6, 5)
top_members_genre = first(chart_6, 5)

# Horizontal bar of members per title, and scatter of members per genre.
# Symbol arguments refer to columns of the DataFrame passed as first argument.
seven = PlotlyJS.plot(
    PlotlyJS.bar(top_members_title, x=:Members, y=:Title, orientation="h")
)
eight = PlotlyJS.plot(PlotlyJS.scatter(top_members_genre, x=:Members, y=:Genre))

# Stack the two plots vertically and give the combined figure a title
p = [seven; eight]
relayout!(p, title_text="Multiple Subplots with Titles")
p

In [32]:
# Take the first 5 rows of `chart_6` (the Members-sorted table from the previous
# cell) once for each chart
top_5_members_studio = first(chart_6, 5)
top_5_members_type = first(chart_6, 5)

# Scatter traces of members against studio and against type; symbols name columns
# of the DataFrame passed as first argument
nine = PlotlyJS.plot(
    PlotlyJS.scatter(top_5_members_studio; x=:Members, y=:Studio)
)
ten = PlotlyJS.plot(
    PlotlyJS.scatter(top_5_members_type; x=:Members, y=:Type)
)

# Stack the two plots vertically and give the combined figure a title
p = [nine; ten]
relayout!(p, title_text="Multiple Subplots with Titles")
p