# PDF Sea Level Data Extraction
Trích xuất dữ liệu mực nước biển từ PDF Niên giám thống kê (2014-2024) cho 5 trạm: HonDau, HonNgu, QuyNhon, VungTau, DaNang

In [60]:
import pdfplumber, pandas as pd, os, re
from unidecode import unidecode
from pathlib import Path
import warnings; warnings.filterwarnings('ignore')

def normalize_text(text):
    """Return a canonical ASCII form of ``text`` for station/keyword matching.

    The text is transliterated to ASCII with ``unidecode``, lower-cased,
    stripped of the standalone word "tram", and has every run of whitespace
    collapsed to a single space. Anything that is not a ``str`` yields "".
    """
    if isinstance(text, str):
        ascii_lower = unidecode(text).lower()
        # Drop the generic word "tram" so only the station's own name remains.
        without_prefix = re.sub(r'\btram\b', '', ascii_lower)
        return ' '.join(without_prefix.split())
    return ""

def match_station(text, stations):
    """Map free text to a canonical station name.

    Args:
        text: Raw text that may contain a station name.
        stations: Mapping of normalized lookup key -> canonical station name.

    Returns:
        The canonical name of the first entry in ``stations`` whose key
        contains, or is contained in, the normalized text; ``None`` when
        nothing matches or when the text normalizes to an empty string.
    """
    norm = normalize_text(text)
    # An empty string is a substring of every key, so without this guard blank
    # (or non-string) input would be reported as the first configured station.
    if not norm:
        return None
    for key, name in stations.items():
        if key in norm or norm in key:
            return name
    return None

In [61]:
def _split_station_and_values(line):
    """Split one text line into its leading station text and numeric readings.

    Returns a ``(station_text, values)`` tuple where ``values`` is the list of
    floats found on the line that fall within 50..320 inclusive.
    """
    parts = line.split()
    if len(parts) < 2:
        return '', []

    station_parts, values = [], []
    for i, part in enumerate(parts):
        # A leading "Tram"/"Trạm" token is a generic label, not part of the name.
        if i == 0 and part in ('Tram', 'Trạm'):
            continue
        num = re.sub(r'[^\d.]', '', part)
        if num and num.count('.') <= 1 and len(num) >= 2:
            try:
                val = float(num)
            except ValueError:
                val = None
            # Only readings inside the accepted range count as data
            # (presumably centimetres -- TODO confirm units against the PDFs).
            if val is not None and 50 <= val <= 320:
                values.append(val)
                continue
        # Tokens seen before the first reading make up the station text;
        # non-reading tokens after that point are ignored.
        if not values:
            station_parts.append(part)
    return ' '.join(station_parts), values


def extract_data(pdf_path, year, stations):
    """Extract monthly sea-level rows for the configured stations from a PDF.

    Only pages whose normalized text contains "muc nuoc bien" are scanned.
    On those pages, a line is accepted when it has station text followed by
    at least twelve in-range readings; the first twelve are taken as
    January..December. Each station is recorded at most once, and scanning
    stops as soon as every distinct station has been found.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        year: Value stored in the ``Year`` field of every record.
        stations: Mapping of normalized lookup key -> canonical station name.

    Returns:
        A list of dicts with keys ``Year``, ``Station``, ``Jan``..``Dec`` and
        ``Annual_Average`` (mean of the twelve months, rounded to 2 decimals).
    """
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    wanted = len(set(stations.values()))
    results, found = [], set()

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if len(found) == wanted:
                break
            text = page.extract_text()
            if not text or 'muc nuoc bien' not in normalize_text(text):
                continue

            for line in text.split('\n'):
                station_text, values = _split_station_and_values(line)
                if len(values) < 12:
                    continue
                # A row of numbers with no station text cannot identify a
                # station; skipping it keeps it from claiming one by accident
                # and blocking the genuine row for that station.
                if not station_text:
                    continue
                station = match_station(station_text, stations)
                if not station or station in found:
                    continue

                found.add(station)
                monthly = values[:12]
                record = {'Year': year, 'Station': station}
                record.update(zip(month_names, monthly))
                record['Annual_Average'] = round(sum(monthly) / len(monthly), 2)
                results.append(record)
    return results

In [62]:
# Normalized lookup key -> canonical station name.
# Two keys ('da nang' and 'son tra') resolve to the same station, DaNang.
stations = {
    'hon dau': 'HonDau',
    'hon ngu': 'HonNgu',
    'quy nhon': 'QuyNhon',
    'vung tau': 'VungTau',
    'da nang': 'DaNang',
    'son tra': 'DaNang',
}

# Year -> file name of the PDF for that year.
pdfs = {
    2014: 'Nien-giam-2014-pdf.pdf',
    2015: 'Nien-giam-Thong-ke-2015-1.pdf',
    2016: 'Nien-giam-Thong-ke-2016.pdf',
    2017: 'Nien-giam-2017-pdf.pdf',
    2018: 'Nien-giam-2018.pdf',
    2019: 'Nien-giam-thong-ke-day-du-2019.pdf',
    2020: 'Sach-NGTK-2020Ban-quyen.pdf',
    2021: 'Sach-Nien-giam-TK-2021-1.pdf',
    2022: 'Sach-Nien-giam-TK-2022-final.pdf',
    2023: 'NIEN-GIAM-THONG-KE-2023_Ban-quyen-1.pdf',
    2024: 'NGTK-Cuc-TK-2024_BQ.pdf',
}

In [63]:
# Run the extraction for every configured year, in chronological order, and
# collect all station records into one flat list.
all_data = []
for year, file in sorted(pdfs.items()):
    path = Path('PDF_files') / file
    if not path.exists():
        # Report the gap instead of skipping silently, so an incomplete
        # dataset can be traced back to the missing input file.
        print(f"{year}: missing file {path}")
        continue
    data = extract_data(path, year, stations)
    all_data.extend(data)
    print(f"{year}: {len(data)} records")
print(f"Total: {len(all_data)} records")

2014: 5 records
2015: 5 records
2016: 0 records
2017: 5 records
2018: 5 records
2019: 5 records
2020: 5 records
2021: 5 records
2022: 5 records
2023: 5 records
2024: 4 records
Total: 49 records


In [64]:
# Build the final table: one row per (Year, Station), columns in calendar order.
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
# Passing the column list to the constructor fixes the column order and keeps
# the frame well-formed (empty, but with all columns) when nothing was extracted;
# without it, drop_duplicates raises KeyError on an empty frame.
df = (
    pd.DataFrame(all_data, columns=['Year','Station'] + months + ['Annual_Average'])
    .drop_duplicates(subset=['Year','Station'])
    .sort_values(['Year','Station'])
)
# .tolist() converts NumPy scalars to plain Python values so the summary prints
# as "[2014, 2015, ...]" rather than "[np.int64(2014), ...]".
print(f"\n{len(df)} records: {sorted(df['Year'].unique().tolist())}")
print(f"Stations: {sorted(df['Station'].unique().tolist())}\n")
display(df.head(20))


49 records: [np.int64(2014), np.int64(2015), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
Stations: ['DaNang', 'HonDau', 'HonNgu', 'QuyNhon', 'VungTau']



Unnamed: 0,Year,Station,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual_Average
2,2014,DaNang,99.0,98.0,92.0,87.0,84.0,85.0,86.0,84.0,98.0,122.0,119.0,122.0,98.0
0,2014,HonDau,191.0,194.0,190.0,194.0,192.0,194.0,192.0,190.0,206.0,224.0,215.0,207.0,199.08
1,2014,HonNgu,152.0,166.0,163.0,164.0,160.0,157.0,159.0,156.0,172.0,197.0,189.0,182.0,168.08
3,2014,QuyNhon,159.0,157.0,153.0,152.0,145.0,145.0,146.0,143.0,154.0,172.0,173.0,179.0,156.5
4,2014,VungTau,281.0,276.0,271.0,264.0,257.0,245.0,243.0,247.0,252.0,278.0,285.0,292.0,265.92
7,2015,DaNang,101.0,94.0,90.0,89.0,80.0,77.0,83.0,89.0,102.0,111.0,111.0,114.0,95.08
5,2015,HonDau,195.0,193.0,195.0,193.0,194.0,192.0,190.0,202.0,211.0,210.0,213.0,204.0,199.33
6,2015,HonNgu,166.0,161.0,163.0,159.0,154.0,151.0,155.0,162.0,181.0,188.0,192.0,188.0,168.33
8,2015,QuyNhon,161.0,153.0,148.0,146.0,138.0,136.0,139.0,148.0,158.0,165.0,168.0,171.0,152.58
9,2015,VungTau,280.0,271.0,264.0,263.0,248.0,246.0,242.0,245.0,256.0,278.0,284.0,289.0,263.83


In [65]:
# Write the table to disk in two formats, without the DataFrame index.
# NOTE(review): the file names say 2013-2024, but the configured PDFs cover
# 2014-2024 -- confirm the intended name before any downstream code relies on it.
csv_path = 'sea_level_data_2013-2024.csv'
xlsx_path = 'sea_level_data_2013-2024.xlsx'
# 'utf-8-sig' prepends a BOM (presumably so spreadsheet tools detect UTF-8).
df.to_csv(csv_path, encoding='utf-8-sig', index=False)
df.to_excel(xlsx_path, index=False)
print("✓ Exported to CSV & Excel")

✓ Exported to CSV & Excel
