In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# LOAD THE DATA SETS THAT WILL BE COMBINED:

# Relative paths of the source CSV files, in the order they are stacked later
csv_files = [
    'Database/all_subjects_works.csv',
    'Database/editions_by_decade.csv',
    'Database/googlebooks_100000_extended.csv'
]

# Parse every file with pandas' default CSV settings; df_list holds one
# DataFrame per entry of csv_files, in the same order
df_list = list(map(pd.read_csv, csv_files))


In [8]:
# Stack the loaded frames on top of each other (row-wise). ignore_index=True
# drops each frame's own row labels and renumbers the result from 0.
df_combined = pd.concat(objs=df_list, axis=0, ignore_index=True)

# Bare last expression so the notebook renders the combined frame
df_combined

Unnamed: 0,work_key,title,first_publish_year,edition_count,subjects,authors,decade,edition_key,subtitle,publish_date,...,previewLink,infoLink,canonicalVolumeLink,categories,industryIdentifiers,description,imageLinks_thumbnail,imageLinks_smallThumbnail,readingModes_text,readingModes_image
0,/works/OL66554W,Pride and Prejudice,1813.0,4036.0,"Fiction, Romance, Historical, Regency;British ...",Jane Austen,,,,,...,,,,,,,,,,
1,/works/OL21177W,Wuthering Heights,1846.0,2850.0,British and irish fiction (fictional works by ...,Emily Brontë,,,,,...,,,,,,,,,,
2,/works/OL53908W,Adventures of Huckleberry Finn,1876.0,2552.0,adventure and adventurers;Adventure stories;Am...,Mark Twain,,,,,...,,,,,,,,,,
3,/works/OL66513W,Emma,1815.0,2261.0,Social life and customs;Mate selection;Fiction...,Jane Austen,,,,,...,,,,,,,,,,
4,/works/OL8193478W,Oliver Twist,1822.0,2209.0,Bildungsromans;Boys;Brigands and robbers;Briti...,Charles Dickens,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110927,,The Muppets,,,,,,,,,...,http://books.google.es/books?id=FX3bDAEACAAJ&d...,http://books.google.es/books?id=FX3bDAEACAAJ&d...,https://books.google.com/books/about/The_Muppe...,,OTHER:OCLC:946618359,"While on vacation in Los Angeles, Walter and h...",,,False,False
110928,,Truckloads of Fun,,,,,,,,,...,http://books.google.es/books?id=orSvnQEACAAJ&d...,http://books.google.es/books?id=orSvnQEACAAJ&d...,https://books.google.com/books/about/Truckload...,Children's,OTHER:OCLC:845773396,,,,False,False
110929,,A Giant Adventure,,,,Cbn,,,David and Goliath,,...,http://books.google.es/books?id=N5MOswEACAAJ&d...,http://books.google.es/books?id=N5MOswEACAAJ&d...,https://books.google.com/books/about/A_Giant_A...,Religion,ISBN_10:1943541051;ISBN_13:9781943541058,When Chris plays guitar for a school band audi...,http://books.google.com/books/content?id=N5MOs...,http://books.google.com/books/content?id=N5MOs...,False,False
110930,,Tread Softly,,,,Nicola Reddy,,,Classic Irish Poems for Children,,...,http://books.google.es/books?id=xsjtzwEACAAJ&d...,http://books.google.es/books?id=xsjtzwEACAAJ&d...,https://books.google.com/books/about/Tread_Sof...,,ISBN_10:1788494113;ISBN_13:9781788494113,"With poems by W.B. Yeats, Oscar Wilde and Jame...",http://books.google.com/books/content?id=xsjtz...,http://books.google.com/books/content?id=xsjtz...,False,False


In [7]:
# Save the new dataset:

# Single source of truth for the destination, so the confirmation message
# always names the file that was actually written.
combined_csv_path = 'Database/all_libraries_combined.csv'

# index=False: the row labels are just the 0..n-1 counter created by the
# concatenation, so they are not written as a column.
df_combined.to_csv(combined_csv_path, index=False)
print(f'CSV saved to {combined_csv_path}')


CSV merged/all_libraris_combined.csv


In [5]:
# Sanity check: (number of rows, number of columns) of the combined frame
df_combined.shape

(110932, 34)

In [9]:
# Cast the year-like columns to the nullable 'Int64' dtype. Values that cannot
# be parsed as numbers are coerced to missing instead of raising.
for year_source in ('first_publish_year', 'publish_date'):
    parsed_years = pd.to_numeric(df_combined[year_source], errors='coerce')
    df_combined[year_source] = parsed_years.astype('Int64')

# 'decade' holds ranges such as '2020-2025'; the text before the first '-'
# is taken as the starting year (e.g. '2020-2025' -> 2020).
decade_prefix = df_combined['decade'].str.split('-', expand=True)[0]
df_combined['decade_start'] = pd.to_numeric(decade_prefix, errors='coerce').astype('Int64')


In [10]:
# Derive a single 'year' per row. publish_date has priority; rows where it is
# missing fall back to first_publish_year, and then to decade_start.
best_year = df_combined['publish_date']
for fallback_source in ('first_publish_year', 'decade_start'):
    best_year = best_year.fillna(df_combined[fallback_source])

# Rows with no value in any of the three columns stay missing
df_combined['year'] = best_year.astype('Int64')


In [11]:
# Keep only the rows whose year falls in 2020-2025, both bounds included
years = df_combined['year']
mask = (years >= 2020) & (years <= 2025)

# NOTE(review): rows with a missing year yield <NA> in the mask; the displayed
# result suggests they are excluded - confirm on the pandas version in use.
df_2020_2025 = df_combined.loc[mask]


In [12]:
# Display the rows selected by the 2020-2025 year filter
df_2020_2025

Unnamed: 0,work_key,title,first_publish_year,edition_count,subjects,authors,decade,edition_key,subtitle,publish_date,...,canonicalVolumeLink,categories,industryIdentifiers,description,imageLinks_thumbnail,imageLinks_smallThumbnail,readingModes_text,readingModes_image,decade_start,year
341,/works/OL20665410W,The Mirror and the Light,2020,26.0,"Fiction, historical;England, fiction;Historica...",Hilary Mantel,,,,,...,,,,,,,,,,2020
717,/works/OL20855885W,The Pull of the Stars,2020,11.0,LGBTQ novels;Stonewall Book Awards;nyt:hardcov...,Emma Donoghue,,,,,...,,,,,,,,,,2020
750,/works/OL21799623W,The Love Songs of W.E.B. Du Bois,2021,11.0,Contemporary Women;heritage;nyt:combined-print...,Honorée Fanonne Jeffers,,,,,...,,,,,,,,,,2021
888,/works/OL34059667W,The Heaven & Earth Grocery Store,2023,9.0,nyt:combined-print-and-e-book-fiction=2023-08-...,James McBride,,,,,...,,,,,,,,,,2023
975,/works/OL20658092W,The Mountains Sing,2020,8.0,"English literature;Fiction, sagas;Fiction, his...",Phan Quế Mai Nguyễn;Nguyễn Phan Quế Mai;Que Ma...,,,,,...,,,,,,,,,,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106799,/works/OL43377444W,Médias audiovisuels et tolérance administrativ...,,,Broadcasting policy;History;Broadcasting;Law a...,,2020-2025,/books/OL59080465M,enjeux communicationnels et logiques d'acteurs,2021,...,,,,,,,,,2020,2021
106812,/works/OL43405025W,L'Italia entra in guerra,,,"World War, 1914-1918;Campaigns;World War, 1939...",,2020-2025,/books/OL59121268M,10-25 giugno 1940 : 15 giorni che sconvolsero ...,2020,...,,,,,,,,,2020,2020
106820,/works/OL43413412W,ערך לחם,,,Cabala;History,;;,2020-2025,/books/OL59133605M,מערכות וזכרונות באגדה ובהלכה בדרך פרד״ס...כולל...,2020,...,,,,,,,,,2020,2020
106823,/works/OL43415335W,Гений общения,,,Russian poetry;History and criticism;Russian P...,;,2020-2025,/books/OL59136187M,I.I. Kozlov v dialoge s sovremennikami : monog...,2021,...,,,,,,,,,2020,2021


In [25]:
# Let's check the subject info:

# Count how often each distinct non-null 'subjects' value occurs.
# NOTE(review): the displayed data suggests a cell can hold several subjects
# joined by ';'. If so, these counts are per whole string, not per individual
# subject - confirm this is the intended granularity.
non_null_subjects = df_2020_2025['subjects'].dropna()
subject_info = non_null_subjects.value_counts()

# Lift pandas' display limits. set_option is global, so these settings stay
# in effect for every later cell, not only for the print below.
for display_option in ('display.max_rows', 'display.max_seq_items',
                       'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

print(subject_info.to_string())


subjects
Children's fiction                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [29]:
# Persist the subject frequency table as a real CSV file.
# The previous version wrote Series.to_string() output (fixed-width text whose
# layout follows the current pandas display options) into a '.csv' file, which
# CSV readers cannot parse. Series.to_csv writes proper, quoted CSV instead.
subject_counts_path = 'Database/subjects_counts_full.csv'

(
    subject_info
    .rename_axis('subjects')   # header of the label column
    .rename('count')           # header of the value column
    .to_csv(subject_counts_path, encoding='utf-8')
)

# Report the path that was actually written
print(f"Saving {subject_counts_path}")


Saving data/subjects_counts_full.csv
