In [None]:
from google import genai
from google.genai import types
import pathlib
import httpx
from google.colab import userdata

# Gemini client authenticated with the key stored in Colab's user secrets.
client = genai.Client(api_key=userdata.get('GOOGLE_API_KEY'))


# The source PDF is provided as seven chunk files under /content, numbered 1-7.
file1, file2, file3, file4, file5, file6, file7 = (
    pathlib.Path(f"/content/chunk{number}.pdf") for number in range(1, 8)
)


# Extraction instructions sent to the model together with each PDF chunk.
# The worked address example uses the same street number (465) on both the
# input and the output side, and the model is asked to quote every field so
# that the commas inside addresses and descriptions survive CSV parsing.
prompt = """
Given the PDF of earthquake reports, please perform the following tasks:
1. Systematically parse through each report entry in the document, one by one. Make sure to process every single valid entry. A valid entry is defined as one that is from the March 22 mainshock of the earthquake, at/near 11:44:21.
2. For each valid entry, do the following:
- Extract the specific full address from the location details. For example, from "San Francisco ( 465 California St. )", extract "465 California St, San Francisco, CA 94104". You may not be given information like the zip code, so add it in on your own.
- Extract its associated descriptive details using the exact wording, such as how long the earthquake lasted or what it felt like. Extract the plain text, ignoring any formatting such as pre-existing quotation marks.
- Extract the associated earthquake intensity, typically given as a Roman numeral (I to XII). If an entry lacks a rating, disregard the entry entirely.
- Note that some reports will have an intensity within the entry, while some entries with the same intensities will be grouped together and have the intensity at or near the top of the page. Thus, ensure you check surrounding context.
- Earthquakes reported as "not felt" or similar should be included and assigned a value of 1. For reports with a range of values, take the average of the range.
3. Repeat for all entries in the PDF.
4. As you go, all this information needs to be extracted into a single, clean CSV formatted string with the columns: "Location", "Rating", and "Description". Do not include headers. Enclose every field in double quotes so that commas inside an address or description do not split it into extra columns.
"""

def generate_data(earthquake_data_file):
    """Send one PDF chunk plus the extraction prompt to Gemini.

    Args:
        earthquake_data_file: Path-like object exposing ``read_bytes()``; its
            bytes are sent inline with the ``application/pdf`` MIME type.

    Returns:
        The text of the model's response, or an empty string (after printing
        a warning) when the response carries no text.
    """
    response = client.models.generate_content(
        model="gemini-2.5-pro",
        contents=[
            types.Part.from_bytes(
                data=earthquake_data_file.read_bytes(),
                mime_type='application/pdf',
            ),
            prompt,
        ],
    )

    text = response.text
    if text is None:
        # ``response.text`` can be None (for example when the response holds
        # no text parts). Returning "" keeps callers that concatenate the
        # result from failing with a TypeError and losing the other chunks.
        print(f"Warning: no text returned for {earthquake_data_file}; chunk skipped.")
        return ""
    return text


# Run the extraction once per PDF chunk; each result is the model's CSV text
# for that chunk.
data1 = generate_data(file1)
data2 = generate_data(file2)
data3 = generate_data(file3)
data4 = generate_data(file4)
data5 = generate_data(file5)
data6 = generate_data(file6)
data7 = generate_data(file7)

all_data = [data1, data2, data3, data4, data5, data6, data7]

# The file is written explicitly as UTF-8 because it is read back as UTF-8
# later on; relying on the platform default encoding could corrupt or reject
# non-ASCII characters in the extracted descriptions.
with open("extract.csv", "w", encoding="utf-8") as f:
    # The prompt asks the model to omit headers, so the header row is added here.
    f.write("Location,Rating,Description\n")
    for chunk_csv in all_data:
        f.write(chunk_csv + "\n")


In [None]:
import csv
import pandas as pd
import requests
import time
from tqdm import tqdm

# Geocoding credentials and the CSV files this step reads and writes.
API_KEY = userdata.get('GEOCODING_API_KEY')
INPUT_CSV = "extract.csv"
OUTPUT_CSV = "earthquake_data.csv"


# Read the header row, then keep every non-empty record from the input file.
with open(INPUT_CSV, newline='', encoding='utf-8') as f:
    csv_rows = csv.reader(f)
    header = next(csv_rows)
    non_empty_rows = [record for record in csv_rows if record]

# First column is the address to geocode; the remaining columns are carried
# through unchanged (slicing yields [] for a single-column record).
locations = [record[0] for record in non_empty_rows]
extra_data = [record[1:] for record in non_empty_rows]


def geocode(address):
    """Look up coordinates for an address with the Google Geocoding API.

    Args:
        address: Address string sent as the ``address`` query parameter.

    Returns:
        A ``(lat, lng)`` tuple taken from the first result when the API
        reports status ``"OK"``. ``(None, None)`` when the request fails
        (connection error, timeout), the body is not valid JSON, the status
        is anything other than ``"OK"``, or the payload lacks the expected
        fields.
    """
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {"address": address, "key": API_KEY}

    # Only request/decoding failures are treated as "no result"; a bare
    # ``except`` would also swallow KeyboardInterrupt and make a long
    # geocoding run impossible to stop.
    try:
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError):
        return None, None

    try:
        if data["status"] != "OK":
            return None, None
        loc = data["results"][0]["geometry"]["location"]
        return loc["lat"], loc["lng"]
    except (KeyError, IndexError, TypeError):
        # Response did not have the expected shape.
        return None, None



# Geocode every address in order; tqdm shows progress. Requests are issued
# back-to-back with no delay between them.
coordinates = [geocode(address) for address in tqdm(locations)]
latitudes = [lat for lat, _ in coordinates]
longitudes = [lon for _, lon in coordinates]

# Each output row: address, latitude, longitude, then the columns carried
# over from the input file for the same record.
combined_data = [
    [place, lat, lon] + extra_data[idx]
    for idx, (place, (lat, lon)) in enumerate(zip(locations, coordinates))
]

combined_header = ["Location", "Latitude", "Longitude"] + header[1:]

with open(OUTPUT_CSV, "w", newline='', encoding='utf-8') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(combined_header)
    csv_out.writerows(combined_data)

print(f"CSV saved to {OUTPUT_CSV}")
