# Experimenting with Titles & Genres Metadata

This notebook is for experimenting with title and genres metadata. The goal is to augment the existing banned books dataset with additional metadata. To run this notebook, you need  to download the banned books and hathifiles dataset, and also the following libraries:

- pandas
- tqdm
- htrc-feature-reader

## Load Python Libraries

In [1]:
import pandas as pd
import os 
from tqdm import tqdm
import gzip
import csv
from htrc_features import Volume

## Load HathiFiles

In [2]:
def decompress_gzip_to_file(compressed_file: str, uncompressed_file: str, chunk_size: int = 4 * 1024 * 1024) -> None:
	"""
	Decompress a gzipped file to a specified destination.
	
	Args:
		compressed_file (str): Path to the gzipped file.
		uncompressed_file (str): Path where the uncompressed data should be written.
		chunk_size (int, optional): Size of the chunks to be read from the compressed file. Defaults to 4MB.

	Returns:
		None
	"""
	if not os.path.exists(uncompressed_file):
		with gzip.open(compressed_file, 'rb') as f_in, open(uncompressed_file, 'wb') as f_out, tqdm(
			unit="MB", total=os.path.getsize(compressed_file) / (chunk_size)) as pbar:
			while True:
				chunk = f_in.read(chunk_size)
				if not chunk:
					break
				f_out.write(chunk)
				pbar.update(1)

# Example usage
compressed_file = 'hathi_full_20241201.txt.gz'
uncompressed_file = 'hathi_full_20241201.txt'
decompress_gzip_to_file(compressed_file, uncompressed_file)

In [3]:
# Load the column headers from the hathi_field_list.txt file
headers_file = 'hathi_field_list.txt'
with open(headers_file, 'r') as f:
    headers = f.read().strip().split('\t')

# Load the uncompressed file into a DataFrame using the headers
try:
    hathi_df = pd.read_csv(uncompressed_file, delimiter='\t', names=headers, quoting=csv.QUOTE_NONE, error_bad_lines=False)
except pd.errors.ParserError as e:
    print(f"Error reading the file: {e}")



  hathi_df = pd.read_csv(uncompressed_file, delimiter='\t', names=headers, quoting=csv.QUOTE_NONE, error_bad_lines=False)
  hathi_df = pd.read_csv(uncompressed_file, delimiter='\t', names=headers, quoting=csv.QUOTE_NONE, error_bad_lines=False)


In [4]:
hathi_df.head()

Unnamed: 0,htid,access,rights,ht_bib_key,description,source,source_bib_num,oclc_num,isbn,issn,...,rights_date_used,pub_place,lang,bib_fmt,collection_code,content_provider_code,responsible_entity_code,digitization_agent_code,access_profile_code,author
0,mdp.39015018415946,deny,ic,1,v.5,MIU,990000000010106381,2779601,80812815849788081281587,,...,9999,xo,slo,BK,MIU,umich,umich,google,google,"Bielik, František,"
1,mdp.39015066356547,deny,ic,1,v.1,MIU,990000000010106381,2779601,80812815849788081281587,,...,9999,xo,slo,BK,MIU,umich,umich,google,google,"Bielik, František,"
2,mdp.39015066356406,deny,ic,1,v.2,MIU,990000000010106381,2779601,80812815849788081281587,,...,9999,xo,slo,BK,MIU,umich,umich,google,google,"Bielik, František,"
3,mdp.39015066356695,deny,ic,1,v.3,MIU,990000000010106381,2779601,80812815849788081281587,,...,9999,xo,slo,BK,MIU,umich,umich,google,google,"Bielik, František,"
4,mdp.39015066356554,deny,ic,1,v.4,MIU,990000000010106381,2779601,80812815849788081281587,,...,9999,xo,slo,BK,MIU,umich,umich,google,google,"Bielik, František,"


## Load Banned Books

In [5]:
banned_books_df = pd.read_csv('merged.csv')

In [7]:
dedup_banned_books_df = banned_books_df[['Title', 'Author']].drop_duplicates()

## Filter HathiFiles By Banned Books

In [8]:
print(hathi_df.columns)

Index(['htid', 'access', 'rights', 'ht_bib_key', 'description', 'source',
       'source_bib_num', 'oclc_num', 'isbn', 'issn', 'lccn', 'title',
       'imprint', 'rights_reason_code', 'rights_timestamp', 'us_gov_doc_flag',
       'rights_date_used', 'pub_place', 'lang', 'bib_fmt', 'collection_code',
       'content_provider_code', 'responsible_entity_code',
       'digitization_agent_code', 'access_profile_code', 'author'],
      dtype='object')


In [12]:
hathi_df[(hathi_df.title.isin(dedup_banned_books_df.Title)) & (hathi_df.author.isin(dedup_banned_books_df.Author))][['title', 'author']]

Unnamed: 0,title,author
1469876,Glass,
1469877,Glass,
1469878,Glass,
1469879,Glass,
1469880,Glass,
...,...,...
13075686,Smoke,
13075687,Smoke,
13437670,Kaleidoscope,
13441335,The Butterfly,
