In [1]:
import gzip
import json
import pandas as pd
from collections import Counter

In [2]:
# Location of the gzip-compressed dump; each line is parsed as one JSON document
file_path = "../data/raw/goodreads_books.json.gz"

# Parse at most the first five lines of the file.
# zip() stops as soon as range(5) is exhausted, so nothing past the fifth line is decoded.
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    books_sample = [json.loads(raw_line) for _, raw_line in zip(range(5), f)]

# Pretty-print the first record to inspect which fields a book carries
print("=== 첫 번째 책 데이터 ===")
print(json.dumps(books_sample[0], indent=2, ensure_ascii=False))

=== 첫 번째 책 데이터 ===
{
  "isbn": "0312853122",
  "text_reviews_count": "1",
  "series": [],
  "country_code": "US",
  "language_code": "",
  "popular_shelves": [
    {
      "count": "3",
      "name": "to-read"
    },
    {
      "count": "1",
      "name": "p"
    },
    {
      "count": "1",
      "name": "collection"
    },
    {
      "count": "1",
      "name": "w-c-fields"
    },
    {
      "count": "1",
      "name": "biography"
    }
  ],
  "asin": "",
  "is_ebook": "false",
  "average_rating": "4.00",
  "kindle_asin": "",
  "similar_books": [],
  "description": "",
  "format": "Paperback",
  "link": "https://www.goodreads.com/book/show/5333265-w-c-fields",
  "authors": [
    {
      "author_id": "604031",
      "role": ""
    }
  ],
  "publisher": "St. Martin's Press",
  "num_pages": "256",
  "publication_day": "1",
  "isbn13": "9780312853129",
  "publication_month": "9",
  "edition_information": "",
  "publication_year": "1984",
  "url": "https://www.goodreads.com/book/show/5

In [3]:
# Quality check on (at most) the first 10,000 records:
#   - how many records carry a non-blank description
#   - how the language_code values are distributed
description_count = 0
language_count = Counter()
total_checked = 0

with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 10000:
            break
        book = json.loads(line)
        total_checked += 1

        # `or ''` also covers an explicit JSON null: dict.get's default only
        # applies when the key is missing, so None.strip() would otherwise raise.
        if (book.get('description') or '').strip():
            description_count += 1

        # Missing, null and empty language codes are all bucketed as 'unknown'
        lang = book.get('language_code') or 'unknown'
        language_count[lang] += 1

# Avoid ZeroDivisionError when the file yields no records; every count is 0
# in that case, so the percentages simply print as 0.0%.
pct_denominator = max(total_checked, 1)

print("=== 1만 개 샘플 분석 결과 ===")
print(f"description 있는 책: {description_count:,} / {total_checked:,} ({description_count/pct_denominator*100:.1f}%)")
print("\n=== 언어 분포 (상위 10개) ===")
for lang, count in language_count.most_common(10):
    print(f"  {lang}: {count:,} ({count/pct_denominator*100:.1f}%)")

=== 1만 개 샘플 분석 결과 ===
description 있는 책: 8,250 / 10,000 (82.5%)

=== 언어 분포 (상위 10개) ===
  unknown: 4,457 (44.6%)
  eng: 3,050 (30.5%)
  en-US: 402 (4.0%)
  spa: 235 (2.4%)
  ita: 221 (2.2%)
  en-GB: 218 (2.2%)
  ara: 156 (1.6%)
  fre: 126 (1.3%)
  ger: 124 (1.2%)
  por: 121 (1.2%)


In [4]:
# language_code values treated as English.
# NOTE(review): these are the English variants visible in the top-10 language
# list of the 10k-record sample; the full file may contain other English codes
# (e.g. 'en-CA') that this set would exclude — confirm before relying on the count.
english_codes = {'eng', 'en-US', 'en-GB'}

# Full scan over every line of the file (slow)
total_count = 0
valid_count = 0

print("전체 데이터 스캔 중... (2-3분 걸릴 수 있어요)")

with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for line in f:
        total_count += 1
        book = json.loads(line)

        # `or ''` also covers an explicit JSON null: dict.get's default only
        # applies when the key is missing, so None.strip() would otherwise raise.
        has_description = bool((book.get('description') or '').strip())
        is_english = book.get('language_code', '') in english_codes

        if has_description and is_english:
            valid_count += 1

        # Progress indicator every 500,000 lines
        if total_count % 500000 == 0:
            print(f"  {total_count:,}개 확인...")

# Avoid ZeroDivisionError when the file is empty; report 0.0% instead.
valid_pct = valid_count / total_count * 100 if total_count else 0.0

print("\n=== 결과 ===")
print(f"전체 책: {total_count:,}")
print(f"조건 충족 (영어 + description): {valid_count:,} ({valid_pct:.1f}%)")

전체 데이터 스캔 중... (2-3분 걸릴 수 있어요)
  500,000개 확인...
  1,000,000개 확인...
  1,500,000개 확인...
  2,000,000개 확인...

=== 결과 ===
전체 책: 2,360,655
조건 충족 (영어 + description): 782,863 (33.2%)
