<a href="https://colab.research.google.com/github/ArushiG11/Anime-Data-Analysis/blob/main/Anime_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create a Comprehensive Dataset

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [11]:
# Location of the MyAnimeList CSV on raw.githubusercontent.com.
mal_url = 'https://raw.githubusercontent.com/ArushiG11/Anime-Data-Analysis/main/MyAnimeList.csv'
# Read the CSV and discard its "Unnamed: 0" column in a single chained step.
df_mal = pd.read_csv(mal_url).drop(columns="Unnamed: 0")

In [None]:
def clean_mal_data(df_mal):
    """Clean and preprocess MyAnimeList data.

    Parameters
    ----------
    df_mal : pandas.DataFrame
        Frame containing the 'Number of Episodes', 'Aired' and 'Members'
        columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_mal`` (the input is left untouched) in which:

        * 'Number of Episodes' is numeric; the literal 'Unknown' becomes 0
          and any other unparseable value becomes NaN.
        * 'Year' is a new column holding the first four-digit number found
          in 'Aired', or NaN when there is none.
        * 'Members' is float with thousands separators removed; unparseable
          values become NaN.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    df_mal_clean = df_mal.copy()

    # Clean episode numbers. Note that 'Unknown' is mapped to 0 rather than
    # NaN, so a value of 0 means "unknown" downstream.
    df_mal_clean['Number of Episodes'] = df_mal_clean['Number of Episodes'].replace('Unknown', '0')
    df_mal_clean['Number of Episodes'] = pd.to_numeric(df_mal_clean['Number of Episodes'], errors='coerce')

    # Extract the first four-digit year from the Aired column. Taking the
    # last token before the first comma would return the day of the month
    # for values such as "Apr 5, 2009 to Jul 4, 2010", so match on four
    # consecutive digits instead. Missing values and 'N/A' simply fail to
    # match and come out as NaN.
    first_year = df_mal_clean['Aired'].astype(str).str.extract(r'\b(\d{4})\b', expand=False)
    df_mal_clean['Year'] = pd.to_numeric(first_year, errors='coerce')

    # Clean Members column. Going through str first keeps this working when
    # the column was already parsed as numeric; coercion mirrors the handling
    # of the episode column instead of raising on a stray placeholder.
    members_text = df_mal_clean['Members'].astype(str).str.replace(',', '', regex=False)
    df_mal_clean['Members'] = pd.to_numeric(members_text, errors='coerce').astype(float)

    return df_mal_clean

In [None]:
# Location of the aniSearch CSV on raw.githubusercontent.com.
as_url = 'https://raw.githubusercontent.com/ArushiG11/Anime-Data-Analysis/main/aniSearch.csv'
# Load the CSV into a DataFrame and discard its "Unnamed: 0" column.
df_as = pd.read_csv(as_url).drop(columns="Unnamed: 0")

In [8]:
# Preview the first rows of the aniSearch frame.
df_as.head()

Unnamed: 0.1,Unnamed: 0,Rank,Title,Year,Number of Episodes,Genre,Anime Type,Production House,Ratings,Comments
0,0,1,Frieren: Beyond Journey’s End,2023,28,Adventure,TV-Series,MADHOUSE Inc.,4.54,Masterpiece
1,1,2,Fullmetal Alchemist: Brotherhood,2009,64,Adventure,TV-Series,BONES Inc.,4.53,Masterpiece
2,2,3,Your Name.,2016,1,Sentimental Drama,Movie,CoMix Wave Films Inc.,4.51,Masterpiece
3,3,4,The Apothecary Diaries,2023,24,Drama,TV-Series,"OLM, Inc., TOHO animation STUDIO Co., Ltd.",4.5,Masterpiece
4,4,5,Code Geass: Lelouch of the Rebellion R2,2008,25,Action Drama,TV-Series,SUNRISE Inc.,4.5,Masterpiece


In [9]:
# Show the dtype pandas assigned to each aniSearch column.
df_as.dtypes

Unnamed: 0,0
Unnamed: 0,int64
Rank,int64
Title,object
Year,object
Number of Episodes,object
Genre,object
Anime Type,object
Production House,object
Ratings,float64
Comments,object


In [None]:
def clean_anisearch_data(df_as):
    """Return a cleaned copy of the AniSearch frame.

    'Number of Episodes' and 'Year' are converted to numeric (values that
    cannot be parsed become NaN) and 'Anime Type' is upper-cased. The frame
    passed in is not modified.
    """
    cleaned = df_as.copy()

    # Episodes first, then year: both get the same coercing conversion.
    for numeric_column in ('Number of Episodes', 'Year'):
        raw_values = cleaned[numeric_column]
        cleaned[numeric_column] = pd.to_numeric(raw_values, errors='coerce')

    # Standardize the type labels to upper case.
    type_labels = cleaned['Anime Type']
    cleaned['Anime Type'] = type_labels.str.upper()

    return cleaned

In [18]:
# Location of the raw box-office CSV on raw.githubusercontent.com.
boxof_url = 'https://raw.githubusercontent.com/ArushiG11/Anime-Data-Analysis/main/raw_boxof.csv'
# Load the CSV into a DataFrame and discard its "Unnamed: 0" column.
df_boxof = pd.read_csv(boxof_url).drop(columns="Unnamed: 0")

In [19]:
# Preview the first rows of the box-office frame.
df_boxof.head()

Unnamed: 0,Rank,Title,Lifetime Gross,Max Theaters,Opening,Open Th,Release Date,Distributor
0,1,Pokémon: The First Movie - Mewtwo Strikes Back,"$85,744,662",3043,"$31,036,678",3043,"Nov 10, 1999",Warner Bros.
1,2,Demon Slayer: Kimetsu no Yaiba - The Movie: Mu...,"$49,505,008",2087,"$21,234,994",1614,"Apr 23, 2021",FUNimation Entertainment
2,3,Pokémon the Movie 2000,"$43,758,684",2752,"$19,575,608",2752,"Jul 21, 2000",Warner Bros.
3,4,Dragon Ball Super: Super Hero,"$38,112,140",3018,"$21,126,919",3018,"Aug 19, 2022",Crunchyroll
4,5,Jujutsu Kaisen 0,"$34,542,754",2418,"$18,009,921",2286,"Mar 18, 2022",Crunchyroll


In [20]:
# Show the dtype pandas assigned to each box-office column.
df_boxof.dtypes

Unnamed: 0,0
Rank,int64
Title,object
Lifetime Gross,object
Max Theaters,object
Opening,object
Open Th,object
Release Date,object
Distributor,object


In [None]:
def clean_bofoffice_data(df_boxof):
    """Clean and preprocess BoxOffice data.

    Parameters
    ----------
    df_boxof : pandas.DataFrame
        Frame containing the 'Max Theaters', 'Release Date',
        'Lifetime Gross' and 'Opening' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_boxof`` (the input is left untouched) in which:

        * 'Max Theaters' is numeric; unparseable values become NaN.
        * 'Release Date' is datetime; unparseable values become NaT.
        * 'Lifetime Gross' and 'Opening' are float with the '$' sign and
          thousands separators removed; unparseable values become NaN.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    df_boxof_clean = df_boxof.copy()

    df_boxof_clean['Max Theaters'] = pd.to_numeric(df_boxof_clean['Max Theaters'], errors='coerce')

    # The column is named 'Release Date'; looking up 'Release Data' raised a
    # KeyError before any cleaning could happen.
    df_boxof_clean['Release Date'] = pd.to_datetime(df_boxof_clean['Release Date'], errors='coerce')

    for col in ['Lifetime Gross', 'Opening']:
        # regex=False makes '$' a literal character on every pandas version
        # (as a regular expression it is the end-of-string anchor).
        amount_text = (
            df_boxof_clean[col]
            .astype(str)
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
        )
        # Coerce like the 'Max Theaters' conversion so a placeholder value
        # yields NaN instead of aborting; the result is always float.
        df_boxof_clean[col] = pd.to_numeric(amount_text, errors='coerce').astype(float)

    return df_boxof_clean