## Wiki aesthetics images

In [1]:
import os
import pandas as pd
import glob
import gzip
import json
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import numpy as np
import seaborn as sns

In [2]:

# Define the base path
base_path = "data/aesthetic_images/"

# Get list of all aesthetic folders
aesthetic_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]

# Create a list to store the counts
counts = []

# Count files in each folder and get additional statistics
for aesthetic in aesthetic_folders:
    folder_path = os.path.join(base_path, aesthetic)
    image_files = glob.glob(os.path.join(folder_path, "*"))
    
    # Calculate total size in MB
    total_size_bytes = sum(os.path.getsize(file) for file in image_files)
    total_size_mb = total_size_bytes / (1024 * 1024)
    
    counts.append({
        "aesthetic": aesthetic,
        "image_count": len(image_files),
        "total_size_mb": round(total_size_mb, 2),
        "avg_size_mb": round(total_size_mb / len(image_files), 2) if image_files else 0
    })

# Create a pandas DataFrame
df_image_counts = pd.DataFrame(counts)

# Sort by image count (descending)
df_image_counts = df_image_counts.sort_values("image_count", ascending=False)

# Display the DataFrame
df_image_counts

Unnamed: 0,aesthetic,image_count,total_size_mb,avg_size_mb
11,Frogcore,182,35.86,0.2
15,Kidcore,75,28.39,0.38
7,Dark_Academia,63,17.28,0.27
9,Fairy_Kei,60,7.77,0.13
20,Traumacore,59,19.03,0.32
5,Cottagecore,55,21.11,0.38
8,Ethereal,50,12.76,0.26
22,Vaporwave,47,44.19,0.94
3,Bloomcore,40,11.29,0.28
6,Cyberpunk,33,28.3,0.86


## Goodreads data

In [3]:
BOOKS_PATH = r'..\goodreads\goodreads_books'

# All book datasets
book_files = glob.glob(os.path.join(BOOKS_PATH, "*.gz"))

print(book_files)



['..\\goodreads\\goodreads_books\\goodreads_books.json.gz', '..\\goodreads\\goodreads_books\\goodreads_book_authors.json.gz', '..\\goodreads\\goodreads_books\\goodreads_book_genres_initial.json.gz', '..\\goodreads\\goodreads_books\\goodreads_book_series.json.gz', '..\\goodreads\\goodreads_books\\goodreads_book_works.json.gz']


In [None]:
def process_and_save_chunks(file_path, output_dir, chunk_number=0, chunk_size=50000, max_chunks=None):
    """
    Process large JSON.GZ file in chunks and save each chunk as a CSV
    
    Parameters:
    -----------
    file_path : str
        Path to the input JSON.GZ file
    output_dir : str
        Directory to save the CSV chunks
    chunk_size : int
        Number of records per chunk
    max_chunks : int, optional
        Maximum number of chunks to process
    """
    import os
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    records = []
    
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f)):
            if i <= chunk_number * chunk_size:
                continue
            try:
                # Parse JSON line
                book = json.loads(line.strip())
                records.append(book)
                
                # Process chunk when it reaches the specified size
                if len(records) >= chunk_size:
                    chunk_df = pd.DataFrame(records)
                    
                    # Save this chunk
                    chunk_file = os.path.join(output_dir, f"books_chunk_{chunk_number:03d}.csv")
                    chunk_df.to_csv(chunk_file, index=False)
                    print(f"Saved chunk {chunk_number} with {len(records):,} records to {chunk_file}")
                    
                    # Reset for next chunk
                    records = []
                    chunk_number += 1
                    
                    # Stop if we reached max_chunks
                    if max_chunks and chunk_number >= max_chunks:
                        print(f"Reached maximum number of chunks ({max_chunks})")
                        break
                        
            except json.JSONDecodeError:
                print(f"Error parsing JSON at line {i}")
    
    # Save any remaining records
    if records:
        chunk_df = pd.DataFrame(records)
        chunk_file = os.path.join(output_dir, f"books_chunk_{chunk_number:03d}.csv")
        chunk_df.to_csv(chunk_file, index=False)
        print(f"Saved final chunk {chunk_number} with {len(records):,} records to {chunk_file}")


# process_and_save_chunks(
#     '../goodreads/goodreads_books/goodreads_books.json.gz', 
#     '../goodreads/goodreads_chunks/',
#     chunk_size=100000,
#     chunk_number=16,
#     max_chunks=24 
# )

0it [00:00, ?it/s]

Saved chunk 16 with 100,000 records to ../goodreads/goodreads_chunks/books_chunk_016.csv
Saved chunk 17 with 100,000 records to ../goodreads/goodreads_chunks/books_chunk_017.csv
Saved chunk 18 with 100,000 records to ../goodreads/goodreads_chunks/books_chunk_018.csv
Saved chunk 19 with 100,000 records to ../goodreads/goodreads_chunks/books_chunk_019.csv
Saved chunk 20 with 100,000 records to ../goodreads/goodreads_chunks/books_chunk_020.csv
Saved chunk 21 with 100,000 records to ../goodreads/goodreads_chunks/books_chunk_021.csv
Saved chunk 22 with 100,000 records to ../goodreads/goodreads_chunks/books_chunk_022.csv
Saved final chunk 23 with 60,656 records to ../goodreads/goodreads_chunks/books_chunk_023.csv


In [2]:
import dask.dataframe as dd

# Read all CSV files into a Dask DataFrame
dask_df = dd.read_csv('../goodreads/goodreads_chunks/books_chunk_*.csv')

# Now you can work with this DataFrame much like pandas
# Basic operations are lazy and only computed when needed
print(f"Total rows: {len(dask_df):,}")

# Get basic statistics (triggers computation)
stats = dask_df.describe().compute()
display(stats)

# Example: Get most common publishers
top_publishers = dask_df['publisher'].value_counts().nlargest(20).compute()
display(top_publishers)

# Example: Average rating by year
avg_by_year = dask_df.groupby('publication_year')['average_rating'].mean().compute()

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [17]:
book_crossing = pd.read_csv(r'..\bookcrossing\Book reviews\Book reviews\BX_Books.csv', sep=';', encoding='latin-1')
display(book_crossing.head(), book_crossing.shape)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


(271379, 8)