In [1]:
import pandas as pd
from io import StringIO

### Carregando o dataset

In [2]:
def load_dataset_from_json(path: str, num_lines: int = 1000) -> pd.DataFrame:
    """
    Load the first records of a JSON Lines file into a pandas DataFrame.

    The file is read line by line, each non-blank line being treated as one
    JSON value. Blank lines are skipped and do not count towards
    ``num_lines``.

    Parameters:
    path: The file path of the JSON file to be read
    num_lines: The maximum number of records (non-blank lines) to read

    Returns:
    A pandas DataFrame containing the parsed records, re-indexed from 0
    """

    records = []
    # Pin the encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            if len(records) >= num_lines:
                break
            record = raw_line.strip()
            # A blank line would otherwise produce ",," and break the JSON array.
            if record:
                records.append(record)

    # Wrap the records into a single JSON array so pandas parses them in one call.
    payload = '[' + ','.join(records) + ']'
    return pd.read_json(StringIO(payload), lines=False).reset_index(drop=True)

In [None]:
# Upper bound on the number of records read from each raw file.
num_lines = 500_000

# Load the three raw Goodreads files with the same row limit.
df_authors, df_genres, df_books = (
    load_dataset_from_json(raw_path, num_lines)
    for raw_path in (
        '../data/raw_data/goodreads_book_authors.json',
        '../data/raw_data/goodreads_book_genres_initial.json',
        '../data/raw_data/goodreads_books.json',
    )
)

In [4]:
# Minimum number of ratings a book needs in order to be kept.
MIN_RATINGS_COUNT = 100

# Parse the counts first: missing or non-numeric values become <NA>.
ratings_count = pd.to_numeric(df_books['ratings_count'], errors='coerce').astype('Int64')

# Comparisons against <NA> are resolved to False explicitly, so the mask is a
# plain boolean Series and rows without a usable count are dropped.
has_enough_ratings = ratings_count.ge(MIN_RATINGS_COUNT).fillna(False).astype(bool)

# .assign returns a new frame, so later column assignments on df_books do not
# write into a slice of the unfiltered data.
df_books = (
    df_books
    .loc[has_enough_ratings]
    .assign(ratings_count=ratings_count[has_enough_ratings])
)

In [5]:
# Number of books left after the ratings-count filter
len(df_books)

107302

### Merge dos datasets carregados para criar um dataset mais completo

O primeiro passo para a criação de um dataset mais completo é remover as features irrelevantes e renomear as features que possivelmente serão utilizadas futuramente para a criação de um modelo preditivo

#### Autores

In [6]:
# Prefix the author-level columns so they stay distinguishable from the
# book-level columns that carry the same raw names (average_rating,
# text_reviews_count, ratings_count).
# Rebinding instead of inplace=True keeps the cell free of in-place mutation.
df_authors = df_authors.rename(columns={
    'name': 'author_name',
    'average_rating': 'author_rating',
    'text_reviews_count': 'author_reviews_count',
    'ratings_count': 'author_ratings_count',
})

In [7]:
# Preview the first rows of the authors dataframe
df_authors.head()

Unnamed: 0,author_rating,author_id,author_reviews_count,author_name,author_ratings_count
0,3.98,604031,7,Ronald J. Fields,49
1,4.08,626222,28716,Anita Diamant,546796
2,3.92,10333,5075,Barbara Hambly,122118
3,3.68,9212,36262,Jennifer Weiner,888522
4,3.82,149918,96,Nigel Pennick,1740


In [8]:
# Check the column dtypes of the authors dataframe
df_authors.dtypes

author_rating           float64
author_id                 int64
author_reviews_count      int64
author_name              object
author_ratings_count      int64
dtype: object

#### Gêneros

Para a simplificação do modelo, o gênero a ser considerado de cada livro vai ser apenas o gênero principal do livro, ou seja, o primeiro gênero presente na coluna

In [9]:
def _primary_genre(genres):
    """Return the first key of a non-empty genres dict, otherwise None."""
    if isinstance(genres, dict) and genres:
        return next(iter(genres))
    return None


# Keep only the first listed genre of each book.
df_genres['genres'] = df_genres['genres'].apply(_primary_genre)

In [10]:
# Rebinding instead of inplace=True keeps the cell free of in-place mutation.
df_genres = df_genres.rename(columns={'genres': 'book_genre'})

In [11]:
# Preview the first rows of the genres dataframe
df_genres.head()

Unnamed: 0,book_id,book_genre
0,5333265,"history, historical fiction, biography"
1,1333909,fiction
2,7327624,"fantasy, paranormal"
3,6066819,fiction
4,287140,non-fiction


In [12]:
# Check the column dtypes of the genres dataframe
df_genres.dtypes

book_id        int64
book_genre    object
dtype: object

#### Livros

Assim como nos gêneros de cada livro, o autor considerado vai ser apenas o autor principal, ou seja, o primeiro autor presente na coluna

In [13]:
# Columns that are not carried into the merged dataset.
unused_book_columns = [
    'isbn', 'series', 'popular_shelves', 'asin', 'kindle_asin', 'description', 'link', 'publisher', 'publication_day',
    'isbn13', 'publication_month', 'similar_books', 'edition_information', 'url', 'image_url', 'work_id',
    'title', 'country_code',
]

# df_books was produced by a boolean filter, so dropping in place and then
# assigning a column can raise SettingWithCopyWarning. Building a new frame
# with drop + assign avoids writing into a slice.
df_books = (
    df_books
    .drop(columns=unused_book_columns)
    .assign(
        # Keep only the first listed author's id; anything that is not a
        # non-empty list becomes None.
        authors=lambda frame: frame['authors'].apply(
            lambda x: x[0]['author_id'] if isinstance(x, list) and len(x) > 0 else None
        )
    )
)

In [14]:
# Give the book-level columns explicit names; 'authors' now holds a single
# author id, so it becomes the 'author_id' key.
# Rebinding instead of inplace=True keeps the cell free of in-place mutation.
df_books = df_books.rename(columns={
    'authors': 'author_id',
    'text_reviews_count': 'book_text_reviews_count',
    'average_rating': 'book_rating',
    'format': 'book_format',
    'ratings_count': 'book_ratings_count',
    'title_without_series': 'book_title',
})

In [15]:
# Preview the first rows of the books dataframe
df_books.head()

Unnamed: 0,book_text_reviews_count,language_code,is_ebook,book_rating,book_format,author_id,num_pages,publication_year,book_id,book_ratings_count,book_title
2,7,eng,False,4.03,Hardcover,10333,600,1987.0,7327624,140,"The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,3282,eng,False,3.49,Hardcover,9212,368,2009.0,6066819,51184,Best Friends Forever
9,39,,False,3.81,Paperback,2983296,147,2000.0,287149,986,The Devil's Notebook
10,15,,False,3.93,Hardcover,37778,400,2009.0,6066814,186,"Crowner Royal (Crowner John Mystery, #13)"
11,60,eng,True,4.33,,242185,318,,33394837,269,The House of Memory (Pluto's Snitch #2)


In [16]:
# Check the column dtypes of the books dataframe
df_books.dtypes

book_text_reviews_count    object
language_code              object
is_ebook                   object
book_rating                object
book_format                object
author_id                  object
num_pages                  object
publication_year           object
book_id                     int64
book_ratings_count          Int64
book_title                 object
dtype: object

Além disso, é interessante alterar o tipo das colunas que possuem valor numérico

In [17]:
# Columns holding whole numbers: cast to nullable Int64 so missing values survive.
nullable_int_columns = [
    'book_text_reviews_count',
    'author_id',
    'num_pages',
    'publication_year',
    'book_ratings_count',
]
for column_name in nullable_int_columns:
    parsed_values = pd.to_numeric(df_books[column_name], errors='coerce')
    df_books[column_name] = parsed_values.astype('Int64')

# The rating is fractional, so it is stored as a regular float.
parsed_rating = pd.to_numeric(df_books['book_rating'], errors='coerce')
df_books['book_rating'] = parsed_rating.astype('float64')

In [18]:
# Check the column dtypes of the books dataframe after the numeric casts
df_books.dtypes

book_text_reviews_count      Int64
language_code               object
is_ebook                    object
book_rating                float64
book_format                 object
author_id                    Int64
num_pages                    Int64
publication_year             Int64
book_id                      int64
book_ratings_count           Int64
book_title                  object
dtype: object

### Juntando os datasets

Os merges realizados estão no formato 'left' para garantir que todos os livros presentes em `df_books` sejam mantidos, mesmo que não haja correspondência deles nos demais DataFrames. Isso pode ser útil em etapas futuras, visto que, mesmo sem a devida correspondência, os dados dos livros podem conter outras informações úteis.

In [19]:
# Left join: every book is kept, with its genre attached when one exists.
df = df_books.merge(df_genres, how='left', on='book_id')

In [20]:
# Cast 'author_id' in df_authors to nullable Int64 so the merge on it is possible
author_ids = df_authors['author_id']
df_authors['author_id'] = author_ids.astype(pd.Int64Dtype())

In [21]:
# Left join: every row is kept, with the author's data attached when one exists.
df = df.merge(df_authors, how='left', on='author_id')

In [22]:
# Preview the first rows of the merged dataframe
df.head()

Unnamed: 0,book_text_reviews_count,language_code,is_ebook,book_rating,book_format,author_id,num_pages,publication_year,book_id,book_ratings_count,book_title,book_genre,author_rating,author_reviews_count,author_name,author_ratings_count
0,7,eng,False,4.03,Hardcover,10333,600,1987.0,7327624,140,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","fantasy, paranormal",3.92,5075,Barbara Hambly,122118
1,3282,eng,False,3.49,Hardcover,9212,368,2009.0,6066819,51184,Best Friends Forever,fiction,3.68,36262,Jennifer Weiner,888522
2,39,,False,3.81,Paperback,2983296,147,2000.0,287149,986,The Devil's Notebook,non-fiction,3.48,824,Anton Szandor LaVey,12628
3,15,,False,3.93,Hardcover,37778,400,2009.0,6066814,186,"Crowner Royal (Crowner John Mystery, #13)",fiction,3.86,616,Bernard Knight,7836
4,60,eng,True,4.33,,242185,318,,33394837,269,The House of Memory (Pluto's Snitch #2),"fantasy, paranormal",3.95,2906,Carolyn Haines,42549


In [23]:
# Check the number of rows of the dataframe
len(df)

107302

In [24]:
# Check the columns of the dataframe
df.columns

Index(['book_text_reviews_count', 'language_code', 'is_ebook', 'book_rating',
       'book_format', 'author_id', 'num_pages', 'publication_year', 'book_id',
       'book_ratings_count', 'book_title', 'book_genre', 'author_rating',
       'author_reviews_count', 'author_name', 'author_ratings_count'],
      dtype='object')

### Salvando o Dataframe

In [None]:
# Write the merged dataframe as a ';'-separated UTF-8 CSV, without the index column.
df.to_csv(
    '../data/raw_data/goodreads.csv',
    sep=';',
    encoding='utf-8',
    header=True,
    index=False,
)