# Mineração de Dados
 Extração e tratamento de dados obtidos pela API do MyAnimeList jikan

Importação de Biliotecas


In [63]:
import numpy as np
import pandas as pd
from pandas import json_normalize
from pathlib import Path 
import requests


Extração de dados da API

In [64]:
url = "http://api.jikan.moe/v4/anime"
df =pd.DataFrame()

pages_qtd = 300

for page in range(1,pages_qtd+1):
    res = requests.get(url,{"page": page}).json()['data']
    df = pd.concat([df,pd.DataFrame(res)])


In [None]:
print(f"Daframe possui {df.shape[0]} linhas e {df.shape[1]} colunas")
print(f"Colunas: {df.columns}")

Daframe possui 25 linhas e 36 colunas
Colunas: Index(['mal_id', 'url', 'images', 'trailer', 'approved', 'titles', 'title',
       'title_english', 'title_japanese', 'title_synonyms', 'type', 'source',
       'episodes', 'status', 'airing', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'synopsis',
       'background', 'season', 'year', 'broadcast', 'producers', 'licensors',
       'studios', 'genres', 'explicit_genres', 'themes', 'demographics'],
      dtype='object')


Remoção de colunas não uteis

In [None]:
df = df.drop(columns=['url','images','trailer','approved','titles','title','title_english', 'title_japanese', 
                      'title_synonyms','type','source','aired','members','synopsis','background','season','broadcast',
                      'broadcast', 'producers', 'licensors', 'studios','explicit_genres','themes','status'])

In [None]:
df.isna().any()

mal_id          False
episodes         True
airing          False
duration        False
rating          False
score           False
scored_by       False
rank            False
popularity      False
favorites       False
year             True
genres          False
demographics    False
dtype: bool

Algumas colunas precisam de tratamento:
<ul>
  <li>Algumas linhas possuem Episodes, Rating, Score, Score_by, Rank ou Year assinalado como NaN</li>
  <li>Genres, Themes, Explicit Themes deveme ser convertidas para codificação One Hot</li>
  <li>Duration possui células com formato inconstante</li>
  <li>Rating possui strings que devem ser convertidas para formatos numéricos</li>
</ul>

### Tratamento de valores NaN

In [None]:
df.loc[df.isna().any(axis=1)]['popularity'].mean()

408.4

Como os animes com valores NaN são em média muito pouco poulares, vamos retirá-los do dataframe

In [None]:
df = df.dropna()
df.shape

(20, 13)

### Tratamento de coluna Duration


In [None]:
df['duration'].unique()

array(['24 min per ep', '25 min per ep', '23 min per ep', '27 min per ep',
       '22 min per ep'], dtype=object)

Basta excluir "min per ep" de cada célula

In [None]:
df['duration'] = pd.to_numeric(df['duration'].str.extract('(\d+)',expand=False))
df

Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,favorites,year,genres,demographics
0,1,26.0,False,24,R - 17+ (violence & profanity),8.75,941901,43,43,80709,1998.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio...",[]
2,6,26.0,False,24,PG-13 - Teens 13 or older,8.22,365783,333,248,15623,1998.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio...","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
3,7,26.0,False,25,PG-13 - Teens 13 or older,7.24,43377,2944,1828,632,2002.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio...",[]
4,8,52.0,False,23,PG - Children,6.93,6471,4438,5252,15,2004.0,"[{'mal_id': 2, 'type': 'anime', 'name': 'Adven...","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
5,15,145.0,False,23,PG-13 - Teens 13 or older,7.91,87868,741,1279,2031,2005.0,"[{'mal_id': 30, 'type': 'anime', 'name': 'Spor...","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
6,16,24.0,False,23,PG-13 - Teens 13 or older,8.0,82811,616,882,4162,2005.0,"[{'mal_id': 4, 'type': 'anime', 'name': 'Comed...","[{'mal_id': 43, 'type': 'anime', 'name': 'Jose..."
7,17,52.0,False,23,PG-13 - Teens 13 or older,7.55,13158,1636,4327,241,2002.0,"[{'mal_id': 4, 'type': 'anime', 'name': 'Comed...","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
8,18,24.0,False,27,PG-13 - Teens 13 or older,8.16,102532,410,1270,1286,2004.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio...","[{'mal_id': 42, 'type': 'anime', 'name': 'Sein..."
9,19,74.0,False,24,R+ - Mild Nudity,8.88,395859,24,136,50496,2004.0,"[{'mal_id': 8, 'type': 'anime', 'name': 'Drama...","[{'mal_id': 42, 'type': 'anime', 'name': 'Sein..."
10,20,220.0,False,23,PG-13 - Teens 13 or older,7.99,1932454,625,8,78429,2002.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio...","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."


Visualização de valores de "duration"

In [None]:
df['duration'].unique()

array([24, 25, 23, 27, 22], dtype=int64)

### Rating


In [None]:
df['rating'].unique()

array(['R - 17+ (violence & profanity)', 'PG-13 - Teens 13 or older',
       'PG - Children', 'R+ - Mild Nudity'], dtype=object)

Conversão de classificações indicativas para valores numéricos

In [None]:
replacement = {'R - 17+ (violence & profanity)': 18,  
                'R+ - Mild Nudity': 16,
                'PG-13 - Teens 13 or older': 14,
                'PG - Children':10,
                'G - All Ages': 0,
                None: 0
                }
df['rating']=df['rating'].replace(replacement)
df['rating'].unique()

array([18, 14, 10, 16], dtype=int64)

Pelo amor de Deus, vamos retirar hentais do dataset

In [None]:
drop_rows = df[ df['rating']=='Rx - Hentai' ].index
df.drop(drop_rows , inplace=True)

### One Hot Ecoding

Conversão de json's das células para listas

In [None]:
df['genres'] =  df['genres'].apply(lambda x: [entry['name'] for entry in x])

Conversão de listas das células para strings

In [None]:
df['genres'] =  df['genres'].apply(lambda x: ','.join(map(str, x)))


Visualização do dataset

In [None]:
df

Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,favorites,year,genres,demographics
0,1,26.0,False,24,18,8.75,941901,43,43,80709,1998.0,"Action,Award Winning,Sci-Fi",[]
2,6,26.0,False,24,14,8.22,365783,333,248,15623,1998.0,"Action,Adventure,Sci-Fi","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
3,7,26.0,False,25,14,7.24,43377,2944,1828,632,2002.0,"Action,Drama,Mystery,Supernatural",[]
4,8,52.0,False,23,10,6.93,6471,4438,5252,15,2004.0,"Adventure,Fantasy,Supernatural","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
5,15,145.0,False,23,14,7.91,87868,741,1279,2031,2005.0,Sports,"[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
6,16,24.0,False,23,14,8.0,82811,616,882,4162,2005.0,"Comedy,Drama,Romance","[{'mal_id': 43, 'type': 'anime', 'name': 'Jose..."
7,17,52.0,False,23,14,7.55,13158,1636,4327,241,2002.0,"Comedy,Slice of Life,Sports","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
8,18,24.0,False,27,14,8.16,102532,410,1270,1286,2004.0,"Action,Drama","[{'mal_id': 42, 'type': 'anime', 'name': 'Sein..."
9,19,74.0,False,24,16,8.88,395859,24,136,50496,2004.0,"Drama,Mystery,Suspense","[{'mal_id': 42, 'type': 'anime', 'name': 'Sein..."
10,20,220.0,False,23,14,7.99,1932454,625,8,78429,2002.0,"Action,Adventure,Fantasy","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."


One Hot Encoding com gêneros

In [None]:
one_hot_encoded = df['genres'].str.get_dummies(',')
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop('genres',axis=1)

Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,favorites,...,Fantasy,Gourmet,Horror,Mystery,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Suspense
0,1,26.0,False,24,18,8.75,941901,43,43,80709,...,0,0,0,0,0,1,0,0,0,0
2,6,26.0,False,24,14,8.22,365783,333,248,15623,...,0,0,0,0,0,1,0,0,0,0
3,7,26.0,False,25,14,7.24,43377,2944,1828,632,...,0,0,0,1,0,0,0,0,1,0
4,8,52.0,False,23,10,6.93,6471,4438,5252,15,...,1,0,0,0,0,0,0,0,1,0
5,15,145.0,False,23,14,7.91,87868,741,1279,2031,...,0,0,0,0,0,0,0,1,0,0
6,16,24.0,False,23,14,8.0,82811,616,882,4162,...,0,0,0,0,1,0,0,0,0,0
7,17,52.0,False,23,14,7.55,13158,1636,4327,241,...,0,0,0,0,0,0,1,1,0,0
8,18,24.0,False,27,14,8.16,102532,410,1270,1286,...,0,0,0,0,0,0,0,0,0,0
9,19,74.0,False,24,16,8.88,395859,24,136,50496,...,0,0,0,1,0,0,0,0,0,1
10,20,220.0,False,23,14,7.99,1932454,625,8,78429,...,1,0,0,0,0,0,0,0,0,0


One Hot Encoding com demographics

In [None]:
df['demographics'] =  df['demographics'].apply(lambda x: [entry['name'] for entry in x])
df['demographics'] =  df['demographics'].apply(lambda x: ','.join(map(str, x)))


In [None]:
one_hot_encoded = df['demographics'].str.get_dummies(',')
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop('demographics',axis=1)
one_hot_encoded.columns


Index(['Josei', 'Seinen', 'Shounen'], dtype='object')

## Exportação de dataframe

In [None]:
df

Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,favorites,...,Mystery,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Suspense,Josei,Seinen,Shounen
0,1,26.0,False,24,18,8.75,941901,43,43,80709,...,0,0,1,0,0,0,0,0,0,0
2,6,26.0,False,24,14,8.22,365783,333,248,15623,...,0,0,1,0,0,0,0,0,0,1
3,7,26.0,False,25,14,7.24,43377,2944,1828,632,...,1,0,0,0,0,1,0,0,0,0
4,8,52.0,False,23,10,6.93,6471,4438,5252,15,...,0,0,0,0,0,1,0,0,0,1
5,15,145.0,False,23,14,7.91,87868,741,1279,2031,...,0,0,0,0,1,0,0,0,0,1
6,16,24.0,False,23,14,8.0,82811,616,882,4162,...,0,1,0,0,0,0,0,1,0,0
7,17,52.0,False,23,14,7.55,13158,1636,4327,241,...,0,0,0,1,1,0,0,0,0,1
8,18,24.0,False,27,14,8.16,102532,410,1270,1286,...,0,0,0,0,0,0,0,0,1,0
9,19,74.0,False,24,16,8.88,395859,24,136,50496,...,1,0,0,0,0,0,1,0,1,0
10,20,220.0,False,23,14,7.99,1932454,625,8,78429,...,0,0,0,0,0,0,0,0,0,1


In [None]:
filepath = Path('./result')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df.to_csv(filepath)

In [None]:
df.columns

Index(['mal_id', 'episodes', 'airing', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'favorites', 'year', 'genres',
       'demographics', 'Action', 'Adventure', 'Avant Garde', 'Award Winning',
       'Comedy', 'Drama', 'Ecchi', 'Fantasy', 'Gourmet', 'Horror', 'Mystery',
       'Romance', 'Sci-Fi', 'Slice of Life', 'Sports', 'Supernatural',
       'Suspense', 'Josei', 'Seinen', 'Shounen'],
      dtype='object')

In [None]:
for col in df.columns:
    try:
        print(f"{col}: {df[col].median()}")
    except:
        pass
        

mal_id: 21.0
episodes: 26.0
airing: 0.0
duration: 23.5
rating: 14.0
score: 7.9
scored_by: 83006.0
rank: 754.5
popularity: 1267.5
favorites: 2536.0
year: 2004.0
Action: 1.0
Adventure: 0.0
Avant Garde: 0.0
Award Winning: 0.0
Comedy: 0.0
Drama: 0.0
Ecchi: 0.0
Fantasy: 0.0
Gourmet: 0.0
Horror: 0.0
Mystery: 0.0
Romance: 0.0
Sci-Fi: 0.0
Slice of Life: 0.0
Sports: 0.0
Supernatural: 0.0
Suspense: 0.0
Josei: 0.0
Seinen: 0.0
Shounen: 0.0


In [None]:
print(df.shape)

(20, 33)


In [None]:
lista = ['Action', 'Adventure', 'Avant Garde', 'Award Winning', 
       'Comedy', 'Drama', 'Ecchi', 'Fantasy', 'Gourmet',
       'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports',]

In [None]:
for  cat in lista:
    print(f'{cat}: {df[cat].sum()}')

Action: 12
Adventure: 5
Avant Garde: 1
Award Winning: 2
Comedy: 5
Drama: 8
Ecchi: 1
Fantasy: 3
Gourmet: 1
Horror: 1
Mystery: 2
Romance: 2
Sci-Fi: 6
Slice of Life: 1
Sports: 4


In [None]:
# df.columns
df = df.drop(columns=["favorites","Avant Garde","Ecchi",
              "Gourmet","genres"])

df.columns

Index(['mal_id', 'episodes', 'airing', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'year', 'demographics', 'Action',
       'Adventure', 'Award Winning', 'Comedy', 'Drama', 'Fantasy', 'Horror',
       'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports',
       'Supernatural', 'Suspense', 'Josei', 'Seinen', 'Shounen'],
      dtype='object')

In [None]:
filtro = df['year'] > 2011.0

df = df[filtro]

df

Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,year,...,Mystery,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Suspense,Josei,Seinen,Shounen


In [None]:
filepath = Path('./result.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df.to_csv(filepath)