In [1]:
import tabula
import pandas as pd
from multiprocessing import Pool

In [2]:
def process_page(page_number):
    """Read the tables found on one PDF page and merge them into one frame.

    The PDF is taken from the module-level name ``path``, which must be
    defined before this function is called.

    Args:
        page_number: Page to read; it is converted to ``str`` for tabula.

    Returns:
        A single DataFrame with all tables of the page stacked on top of each
        other (fresh 0..n-1 index), or ``None`` when tabula returned nothing.
    """
    # Progress indicator: page numbers are printed on one line, space separated.
    print(page_number, end=' ', flush=True)

    page_tables = tabula.read_pdf(
        path,
        pages=str(page_number),
        stream=True,
        silent=True,
    )

    # Guard clause: nothing detected on this page.
    if not page_tables:
        return None
    return pd.concat(page_tables, ignore_index=True)

In [3]:
# if __name__ == '__main__':
#     print("Processing pages: ", end='', flush=True)
#     with Pool() as p:
#         results = p.map(process_page, pages_to_process)
    
#     valid_dataframes = [df for df in results if df is not None]

#     if valid_dataframes:
#         final_df = pd.concat(valid_dataframes, ignore_index=True)
#         final_df.to_csv('output_parallel.csv', index=False)
#         # Adds a final confirmation message
#         print("\n\nDone. CSV file created.")
#     else:
#         print("\n\nDone. No tables were found.")

In [6]:
import os
import re

import pandas as pd
import pdfplumber

# Extract water-level records from ONE split PDF and save them as a CSV file.

pdf_path = "data/pre_monsoon_2014_2024_split/pre_monsoon_2014_2024-1001-2000.pdf"

output_csv_path = r"D:/pdftoexcel/data/pre_monsoon_2014_2024_split/csv_files/extracted_data_1001_2000.csv"

extracted_data = []

# A record is complete once the buffered text ENDS with four fields:
#   <latitude> <longitude> <nn-nn-nn date> <water level>
# Whatever precedes those fields in the buffer is kept as free-form location text.
line_pattern = re.compile(r'(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d{2}-\d{2}-\d{2})\s+([\d\.]+)$')

# The buffer is created once, OUTSIDE the page loop, so the text of a record
# that is wrapped across a page break is still joined together.
line_buffer = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            # Page without extractable text: nothing to add to the buffer.
            continue

        for raw_line in text.split('\n'):
            line_buffer += " " + raw_line.strip()

            # Match once and reuse the result (previously every record was
            # matched a second time via an intermediate list).
            match = line_pattern.search(line_buffer)
            if match is None:
                # Record not complete yet: keep accumulating lines.
                continue

            latitude, longitude, date, wl = match.groups()
            extracted_data.append({
                'Location_Info': line_buffer[:match.start()].strip(),
                'Latitude': latitude,
                'Longitude': longitude,
                'Date': date,
                'WL(mbgl)': wl
            })
            # Reset the buffer ONLY after a complete record has been stored.
            line_buffer = ""

if extracted_data:
    df = pd.DataFrame(extracted_data)

    # to_csv does not create missing folders, so make sure the target exists.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_csv_path, index=False)
else:
    # Make the "nothing found" case visible instead of finishing silently.
    print(f"No records could be extracted from {pdf_path}")

In [13]:
import pdfplumber
import pandas as pd
import re
import os
import glob
from tqdm import tqdm

# Extract water-level records from every PDF in `input_folder` and write one
# numbered CSV per PDF (<output_csv_path>1.csv, <output_csv_path>2.csv, ...).

input_folder = r"D:/pdftoexcel/data/pre_monsoon_2014_2024_split"
# Prefix of the output files; the running number and ".csv" are appended below.
output_csv_path = r"D:/pdftoexcel/data/pre_monsoon_2014_2024_split/csv_files/combined_extracted_data"

all_extracted_data = []

# A record is complete once the buffered text ENDS with four fields:
#   <latitude> <longitude> <nn-nn-nn date> <water level>
line_pattern = re.compile(r'(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d{2}-\d{2}-\d{2})\s+([\d\.]+)$')

# glob returns matches in arbitrary order; sorting keeps the numbering of the
# output files reproducible between runs.
pdf_files = sorted(glob.glob(os.path.join(input_folder, "*.pdf")))

if not pdf_files:
    print(f"No PDF files found in the specified folder: {input_folder}")
else:
    print(f"Found {len(pdf_files)} PDF files to process...")
    # Running number used in the output file name; it advances once per PDF
    # that was read without an error.
    k = 1

    # to_csv does not create missing folders, so make sure the target exists.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            extracted_data = []
            skipped_records = 0
            # One buffer per PDF (not per page) so a record wrapped across a
            # page break is still joined, matching the single-PDF cell.
            line_buffer = ""

            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if not text:
                        continue

                    for raw_line in text.split('\n'):
                        line_buffer += " " + raw_line.strip()

                        # Match once and reuse the result.
                        match = line_pattern.search(line_buffer)
                        if match is None:
                            # Record not complete yet: keep accumulating.
                            continue

                        latitude, longitude, date, wl = match.groups()
                        location_info = line_buffer[:match.start()].strip()
                        line_buffer = ""

                        try:
                            # `[\d\.]+` also accepts text such as "1.2.3" or
                            # "." that float() rejects.
                            water_level = float(wl)
                        except ValueError:
                            # Drop only this record; previously the error
                            # discarded every record of the PDF.
                            skipped_records += 1
                            continue

                        extracted_data.append({
                            'Location_Info': location_info,
                            'Latitude': float(latitude),
                            'Longitude': float(longitude),
                            'Date': date,
                            'WL(mbgl)': water_level
                        })

            new_output_csv_path = output_csv_path + f"{k}" + ".csv"
            k = k + 1

            if skipped_records:
                print(f"\nSkipped {skipped_records} record(s) with an unparsable water level in {os.path.basename(pdf_path)}")

            if extracted_data:
                df = pd.DataFrame(extracted_data)
                df.to_csv(new_output_csv_path, index=False)
            else:
                # This check runs per file, so name the file it refers to.
                print(f"\nNo data could be extracted from {os.path.basename(pdf_path)}.")
        except Exception as e:
            # Best effort: report the failing PDF and carry on with the next one.
            print(f"\nAn error occurred while processing {os.path.basename(pdf_path)}: {e}")

    # if all_extracted_data:
    #     print(f"\nSuccessfully extracted {len(all_extracted_data)} records from all PDFs.")
    #     df = pd.DataFrame(all_extracted_data)
        
    #     output_dir = os.path.dirname(output_csv_path)
    #     os.makedirs(output_dir, exist_ok=True)
        
    #     df.to_csv(output_csv_path, index=False)
    #     print(f"✅ All data has been saved to: {output_csv_path}")


Found 5 PDF files to process...


Processing PDFs: 100%|██████████████████████████████████████████████████████████████████| 5/5 [46:37<00:00, 559.47s/it]


In [1]:
import pandas as pd
import glob
import os

folder_path = r'D:/pdftoexcel/data/pre_monsoon_2004_2013_split/csv_files'
output_file = r'D:/pdftoexcel/data/pre_monsoon_2004_2013_split/csv_files/combined_data.csv'


def combine_csv_files(folder_path, output_file):
    """Concatenate every CSV in ``folder_path`` and write it to ``output_file``.

    ``output_file`` itself is never used as an input. It may live in the same
    folder, and without this exclusion a second run would read the previous
    result back in and duplicate every row.

    Args:
        folder_path: Folder that is searched for ``*.csv`` files.
        output_file: Path of the combined CSV to write.

    Returns:
        The combined DataFrame, or ``None`` when no file contained data (in
        that case nothing is written).
    """
    # normcase + abspath so the comparison also holds for differing path
    # spellings (relative vs. absolute, letter case on Windows).
    output_key = os.path.normcase(os.path.abspath(output_file))

    # sorted(): glob order is arbitrary; sorting keeps the row order stable.
    all_files = sorted(
        candidate
        for candidate in glob.glob(os.path.join(folder_path, "*.csv"))
        if os.path.normcase(os.path.abspath(candidate)) != output_key
    )

    df_list = []
    for filename in all_files:
        try:
            df_list.append(pd.read_csv(filename, index_col=None, header=0))
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; skip it but say so.
            print(f"Warning: Skipping empty file: {os.path.basename(filename)}")

    if not df_list:
        print(f"No CSV files with data were found in the specified folder: {folder_path}")
        return None

    combined_df = pd.concat(df_list, axis=0, ignore_index=True)
    combined_df.to_csv(output_file, index=False)
    print(f"Successfully combined {len(df_list)} non-empty files into {output_file}")
    return combined_df


combined_df = combine_csv_files(folder_path, output_file)

Successfully combined 4 non-empty files into D:/pdftoexcel/data/pre_monsoon_2004_2013_split/csv_files/combined_data.csv


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the extracted water-level records and print a column/dtype summary.
# NOTE(review): the multi-PDF extraction cell in this notebook writes numbered
# files (combined_extracted_data1.csv, ...), not this un-numbered file —
# confirm which step produces it.
df = pd.read_csv('data/pre_monsoon_2014_2024_split/csv_files/combined_extracted_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196930 entries, 0 to 196929
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location_Info  196930 non-null  object 
 1   Latitude       196930 non-null  float64
 2   Longitude      196930 non-null  float64
 3   Date           196930 non-null  object 
 4   WL(mbgl)       196930 non-null  float64
dtypes: float64(3), object(2)
memory usage: 7.5+ MB


In [3]:
# Number of missing values per column (isna is the same check as isnull).
df.isna().sum()

Location_Info    0
Latitude         0
Longitude        0
Date             0
WL(mbgl)         0
dtype: int64

In [4]:
# Keep only the numeric inputs and replace the textual `Date` column with an
# integer `date` column (nanoseconds since the Unix epoch).
#
# The dates are two-digit day-month-year strings such as "30-04-23". Giving the
# format explicitly avoids per-element format guessing (and the warning it
# emits) and makes a value that does not fit raise instead of being silently
# reinterpreted.
# 'int64' is spelled out because plain `int` can resolve to a 32-bit integer on
# some platforms (e.g. Windows with older NumPy), which cannot hold these values.
df = (
    df.drop(columns=['Location_Info'])
      .assign(
          date=lambda frame: pd.to_datetime(frame['Date'], format='%d-%m-%y').astype('int64')
      )
      .drop(columns=['Date'])
)

In [5]:
# Preview the first rows of the frame.
# NOTE(review): the saved output below still lists the raw `Date` column, which
# the preceding cell removes — the output looks stale; re-run top to bottom.
df.head()

Unnamed: 0,Latitude,Longitude,Date,WL(mbgl)
0,13.27653,93.02364,10-04-24,1.73
1,13.27653,93.02364,30-04-23,1.93
2,13.27653,93.02364,30-04-22,1.13
3,13.27653,93.02364,30-04-19,0.49
4,13.27653,93.02364,30-04-18,0.47


In [6]:
# Show the dtype of every column.
# NOTE(review): the saved output below still reports `Date` as object, i.e. it
# appears to predate the date conversion — confirm by re-running.
print(df.dtypes)

Latitude     float64
Longitude    float64
Date          object
WL(mbgl)     float64
dtype: object


  df['date'] = pd.to_datetime(df['Date'], dayfirst = True)


In [8]:
# Check the converted frame: `date` should now hold integer timestamps.
df.head()

Unnamed: 0,Latitude,Longitude,WL(mbgl),date
0,13.27653,93.02364,1.73,1712707200000000000
1,13.27653,93.02364,1.93,1682812800000000000
2,13.27653,93.02364,1.13,1651276800000000000
3,13.27653,93.02364,0.49,1556582400000000000
4,13.27653,93.02364,0.47,1525046400000000000


In [9]:
# Columns handed to the StandardScaler step of the ColumnTransformer.
numcol = ['Latitude', 'Longitude', 'date']

In [10]:
# Separate the target from the features.
# pop() returns the water-level column and removes it from `df` in place, so
# `df` (and therefore `X`, which is the same object) keeps only the inputs.
y = df.pop('WL(mbgl)')
X = df

In [11]:
# Preprocessing: standardize the columns listed in `numcol`; any column not
# listed there is passed through unchanged.
scaling_step = ('SS', StandardScaler(), numcol)
transformer = ColumnTransformer(
    transformers=[scaling_step],
    remainder='passthrough',
)

In [12]:
# Fit the transformer on X and rebind X to the transformed output.
# NOTE(review): because X is overwritten, re-running this cell would feed the
# transformed output back in instead of the original frame.
X = transformer.fit_transform(X)

In [13]:
# Wrap the transformed output in a DataFrame to preview the first rows; the
# columns get positional labels (0, 1, 2) rather than the original names.
pd.DataFrame(X).head()

Unnamed: 0,0,1,2
0,-1.246681,2.67914,1.459688
1,-1.246681,2.67914,1.170517
2,-1.246681,2.67914,0.865467
3,-1.246681,2.67914,-0.050519
4,-1.246681,2.67914,-0.355569
