In [6]:
import pandas as pd
import os 
from tqdm import tqdm
import gzip
import csv
banned_books_df = pd.read_csv('merged.csv')

In [4]:
def decompress_gzip_to_file(compressed_file: str, uncompressed_file: str, chunk_size: int = 4 * 1024 * 1024) -> None:
	"""
	Decompress a gzipped file to a specified destination.
	
	Args:
		compressed_file (str): Path to the gzipped file.
		uncompressed_file (str): Path where the uncompressed data should be written.
		chunk_size (int, optional): Size of the chunks to be read from the compressed file. Defaults to 4MB.

	Returns:
		None
	"""
	if not os.path.exists(uncompressed_file):
		with gzip.open(compressed_file, 'rb') as f_in, open(uncompressed_file, 'wb') as f_out, tqdm(
			unit="MB", total=os.path.getsize(compressed_file) / (chunk_size)) as pbar:
			while True:
				chunk = f_in.read(chunk_size)
				if not chunk:
					break
				f_out.write(chunk)
				pbar.update(1)

# Example usage
compressed_file = 'hathi_full_20241201.txt.gz'
uncompressed_file = 'hathi_full_20241201.txt'
decompress_gzip_to_file(compressed_file, uncompressed_file)

1362MB [00:06, 203.40MB/s]                                       


In [10]:
# Load the column headers from the hathi_field_list.txt file
headers_file = 'hathi_field_list.txt'
with open(headers_file, 'r') as f:
    headers = f.read().strip().split('\t')

# Load the uncompressed file into a DataFrame using the headers
try:
    hathi_df = pd.read_csv(uncompressed_file, delimiter='\t', names=headers, quoting=csv.QUOTE_NONE, error_bad_lines=False)
except pd.errors.ParserError as e:
    print(f"Error reading the file: {e}")



  hathi_df = pd.read_csv(uncompressed_file, delimiter='\t', names=headers, quoting=csv.QUOTE_NONE, error_bad_lines=False)
  hathi_df = pd.read_csv(uncompressed_file, delimiter='\t', names=headers, quoting=csv.QUOTE_NONE, error_bad_lines=False)


In [11]:
hathi_df[0:1].to_dict()

{'htid': {0: 'mdp.39015018415946'},
 'access': {0: 'deny'},
 'rights': {0: 'ic'},
 'ht_bib_key': {0: 1},
 'description': {0: 'v.5'},
 'source': {0: 'MIU'},
 'source_bib_num': {0: '990000000010106381'},
 'oclc_num': {0: '2779601'},
 'isbn': {0: '8081281584,9788081281587'},
 'issn': {0: nan},
 'lccn': {0: '70518371'},
 'title': {0: 'Slovenské vyst̕ahovalectvo / Zost. František Bielik a Elo Rákoš.'},
 'imprint': {0: 'Matica Slovenská, SAV, t. Svornost̕, 1969-'},
 'rights_reason_code': {0: 'bib'},
 'rights_timestamp': {0: '2011-09-15 04:30:52'},
 'us_gov_doc_flag': {0: 0},
 'rights_date_used': {0: 9999},
 'pub_place': {0: 'xo '},
 'lang': {0: 'slo'},
 'bib_fmt': {0: 'BK'},
 'collection_code': {0: 'MIU'},
 'content_provider_code': {0: 'umich'},
 'responsible_entity_code': {0: 'umich'},
 'digitization_agent_code': {0: 'google'},
 'access_profile_code': {0: 'google'},
 'author': {0: 'Bielik, František,'}}

In [12]:
banned_books_df[['Title', 'Author']].drop_duplicates()

Unnamed: 0,Title,Author
0,"""Multiplication Is for White People"": Raising ...","Delpit, Lisa"
1,#BlackLivesMatter: Protesting Racism,"Thomas, Rachael L."
3,#Hockey,"Ukazu, Ngozi"
5,"#Hockey (Check, Please! Series)","Ukazu, Ngozi"
8,#MurderTrending (MurderTrending Series),"McNeil, Gretchen"
...,...,...
5889,yolo (Internet Girls Series),"Myracle, Lauren"
5890,"¡Solo pregunta!: Sé Diferente, Sé Valiente, Sé Tú","Sotomayor, Sonia"
5891,¡Vámonos! Let's Go!,"Lainez, Rene Colato"
5892,¿De Dónde Eres?/Where Are You From?,"Mendez, Yamile Saied"
