# Anime Analysis

- Anime Industry growth forcasting
- Ranking using members, genres
- Rank Licensors, Producers and Studios
- A avg Broadcast time graph
- Number of animes of each type and the ranking according to it.
- The premiere seasonality

In [14]:
import re
import datetime as dt
def clean_aired(data):
    if re.fullmatch(r"[a-zA-Z]{3} [0-9]{4}", data): # Mar 2021
        aired = str(dt.datetime.strptime(data, "%b %Y").date())
    
    elif re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{4}-[0-9]{2}-[0-9]{2}", data): # 1989-06-05 1989-12-05
        aired = None

    elif re.fullmatch(r"([0-9]{4})", data): # 2021
        aired = str(dt.datetime.strptime(data, "%Y").date())

    elif re.fullmatch(r"[0-9]{4} to [0-9]{4}", data):#2006 to 2009
        aired = data.split(' to ')
        vals = []
        for val in aired:
            vals.append(str(dt.datetime.strptime(val, "%Y").date()))
        aired = ','.join(vals)

    elif re.match(r"[a-zA-Z]{3} [0-9]{1,2}, [0-9]{4} to \?", data): # Dec 30, 2016 to ?
        aired = str(dt.datetime.strptime(data, "%b %d, %Y to ?").date())

    elif re.match(r"[a-z]", data): # b
        aired = 'none'
    
    elif re.fullmatch(r"[a-zA-Z]{3} [0-9]{1,2}, [0-9]{4} to [a-zA-Z]{3} [0-9]{4}", data): # Jan 26, 1990 to Oct 1998
        aired = data.split(' to ')
        start_date = str(dt.datetime.strptime(aired[0], "%b %d, %Y").date())
        end_date = str(dt.datetime.strptime(aired[1], "%b %Y").date())
        aired = ','.join([start_date, end_date])
    
    elif re.match(r"[a-zA-Z]{3} [0-9]{1,2}, [0-9]{4} to [0-9]{4}", data):#Feb 11, 2001 to 2001
        aired = data.split(' to ')
        start_date = str(dt.datetime.strptime(aired[0], "%b %d, %Y").date())
        end_date = str(dt.datetime.strptime(aired[1], "%Y").date())
        aired = ','.join([start_date, end_date])
    else:
        return data

    return aired

In [15]:
def clean_broadcast(data):
    days = {
            "Mondays": 'mon', 
            "Tuesdays": 'tue', 
            "Wednesdays": 'wed',
            "Thursdays": 'thur', 
            "Fridays": 'fri', 
            "Saturdays": 'sat', 
            "Sundays": 'sun'
    }
    if re.match(r"[a-zA-Z]* at [0-9]{2}:[0-9]{2} \(JST\)", data):
        broadcast = data.split(' ')
        weekday = days[broadcast[0]]
        time = dt.datetime.strptime(broadcast[2], '%H:%M')
        broadcast = str(weekday) + ',' + str(time.time())

    elif data == 'None' or data == 'Unknown' or data == 'Not scheduled once per week' or re.fullmatch(r"[a-z]",data):
        broadcast = None
    
    elif re.match(r"[0-9],? ?\d*:\d*:\d*", data):# 5 01:40:00
        broadcast = None
    
    elif re.match(r"[a-zA-Z]* at Unknown", data):# Saturdays at Unknown
        broadcast = data.split(' at ')
        broadcast = days[broadcast[0]]
    else:
        return data

    return broadcast

In [16]:
def clean_data(data):
    data = data.copy()
    data['Aired'] = clean_aired(data['Aired'])
    data['Broadcast'] = clean_broadcast(data['Broadcast'])
    data['Studios'] = data['Studios'].replace('None found,,add some', 'none')
    return data

In [17]:
import pandas as pd
data = pd.read_csv('animedb.csv')
clean_data = data.apply(clean_data, axis='columns')

In [18]:
clean_data = clean_data.dropna().reset_index(drop=True)
clean_data.head()

Unnamed: 0,Name,Aired,Broadcast,Duration,Status,Type,Premiered,Episodes,Rating,Genres,Licensors,Producers,Studios,Members,Favorites
0,Kiku-chan to Ookami,2008-08-13,none,00:45:00,Finished Airing,Special,none,1,PG,Adventure,none,none,Shin-Ei Animation,543,0
1,Cornelis,2008-01-01,none,00:04:00,Finished Airing,Movie,none,1,G,Comedy,none,none,none,492,0
2,Joleobban,"Dec 29, 2016",none,01:21:00,Finished Airing,Movie,none,1,R+,"Drama,Romance",none,none,Studio Dadashow,1638,1
3,Sugio: Mori de Koi wo Shite,"Sep 27, 2015",none,00:04:00,Finished Airing,ONA,none,3,G,Comedy,none,none,DLE,497,0
4,Chirico,2008-04-01,none,00:04:00,Finished Airing,Movie,none,1,PG 13,Avant Garde,none,none,none,743,0


In [19]:
def sort_splits(data,colnames):
    output = {
        colnames[0]: [],
        colnames[1]: [],
    }
    for val in data:
        if len(val) < 2:
            val.append('none')
        output[colnames[0]].append(val[0])
        output[colnames[1]].append(val[1])

    return output

def split_data(data, cols):
    data = data.copy()
    output = []
    for col in cols.keys():
        split = data[col].str.split(cols[col][0])
        sort = sort_splits(split, cols[col][1])
        output.append(sort)

    return output

In [20]:
split_cols = {
    'Aired': [',', ['aired_start', 'aired_end']],
    'Broadcast': [',', ['broadcast_day', 'broadcast_time']],
    'Premiered': [' ', ['premiered_season', 'premiered_year']]
}
splits = split_data(clean_data, split_cols)
splits = {key: value for dicts in splits for key, value in dicts.items()}
splits = pd.DataFrame(splits)
splits = clean_data.drop(columns=split_cols.keys()).join(splits)
splits.head()

Unnamed: 0,Name,Duration,Status,Type,Episodes,Rating,Genres,Licensors,Producers,Studios,Members,Favorites,aired_start,aired_end,broadcast_day,broadcast_time,premiered_season,premiered_year
0,Kiku-chan to Ookami,00:45:00,Finished Airing,Special,1,PG,Adventure,none,none,Shin-Ei Animation,543,0,2008-08-13,none,none,none,none,none
1,Cornelis,00:04:00,Finished Airing,Movie,1,G,Comedy,none,none,none,492,0,2008-01-01,none,none,none,none,none
2,Joleobban,01:21:00,Finished Airing,Movie,1,R+,"Drama,Romance",none,none,Studio Dadashow,1638,1,Dec 29,2016,none,none,none,none
3,Sugio: Mori de Koi wo Shite,00:04:00,Finished Airing,ONA,3,G,Comedy,none,none,DLE,497,0,Sep 27,2015,none,none,none,none
4,Chirico,00:04:00,Finished Airing,Movie,1,PG 13,Avant Garde,none,none,none,743,0,2008-04-01,none,none,none,none,none


In [21]:
def get_cols_for_oh(data):
    output = {}
    data_col = data.columns
    for col in data_col:
        oh_cols = list(set(data[col].sum()))
        # oh_cols_prefix = [col.lower()+'_'+c for c in oh_cols]
        output[col] = oh_cols

    return output

In [22]:
def one_hot_cols(data, cols):
    output = []
    for col in cols.keys():
        col_data = data[col]
        col_oh = pd.DataFrame(columns=cols[col])
        for value in col_data:
            encoding = []
            for colname_oh in col_oh.columns:
                if colname_oh in value:
                    encoding.append(1)
                else:
                    encoding.append(0)
            col_oh.loc[len(col_oh.index)] = encoding
        output.append(col_oh)
    return output

In [23]:
# oh --> One Hot Encode
oh = ['Genres']
to_oh = splits[oh].apply(lambda serie: serie.str.split(','), axis='index')
oh_cols = get_cols_for_oh(to_oh)
oh_encoded_data = one_hot_cols(to_oh, oh_cols)

In [None]:
def aggregate_oh(oh_data, mul):
    summation = oh_data.sum()
    mutiplication = oh_data.multiply(mul, axis=0).sum()

    output = pd.DataFrame(index=summation.index, columns=['Anime', 'Members'])
    output['Anime'] = summation
    output['Members'] = mutiplication

    return output

In [None]:
for index in range(len(oh_cols.keys())):
    agg = aggregate_oh(oh_encoded_data[index], splits['Members'])
    name = list(oh_cols.keys())[index]
    agg.to_csv(f"{name.lower()}.csv")
    

In [None]:
# cleaned_csv = splits.drop(columns=list(oh_cols.keys()))
splits.to_csv('cleaned.csv')