# Create CSV files from PDF files

In [5]:
import glob
import os

import pandas as pd
import tabula

def extract_tables_from_pdf(pdf_path, pages):
    """Read every table tabula detects on the given pages of a PDF.

    Args:
        pdf_path: Path of the PDF file handed to ``tabula.read_pdf``.
        pages: Page selection forwarded unchanged to ``tabula.read_pdf``.

    Returns:
        A new list holding all tables returned by tabula, in the order
        tabula produced them. No filtering is applied, so several tables
        from the same page may appear.
    """
    # multiple_tables=True asks tabula for each detected table separately.
    return list(tabula.read_pdf(pdf_path, pages=pages, multiple_tables=True))

def save_tables_to_csv(tables, csv_path_prefix):
    """Write each table to its own CSV file and report where it went.

    Files are named ``{csv_path_prefix}_page_{n}.csv`` where ``n`` starts
    at 1. Note that ``n`` is the table's position in ``tables``, not the
    PDF page number, even though the filename and message say "page".

    Args:
        tables: Iterable of objects exposing ``to_csv(path, index=False)``
            (presumably pandas DataFrames from tabula).
        csv_path_prefix: Path prefix for the output files; may include
            directories, which are created if they do not exist yet.
    """
    # Create the target directory up front so to_csv does not fail on a
    # fresh checkout. A bare prefix has no directory part to create.
    out_dir = os.path.dirname(csv_path_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for number, table in enumerate(tables, start=1):
        csv_path = f"{csv_path_prefix}_page_{number}.csv"
        table.to_csv(csv_path, index=False)
        print(f"Table from page {number} saved to {csv_path}")
In [6]:
# Collect the source PDFs in sorted order and display the list.
files = sorted(glob.glob("PDF/*.pdf"))
files

['PDF/Gujarat_State_Year_Book_2014-15.pdf',
 'PDF/Gujarat_State_Year_Book_2015-16.pdf',
 'PDF/Gujarat_State_Year_Book_2016-17.pdf',
 'PDF/Gujarat_State_Year_Book_2017-18.pdf',
 'PDF/Gujarat_State_Year_Book_2018-19.pdf',
 'PDF/Gujarat_State_Year_Book_2019-20.pdf',
 'PDF/Gujarat_State_Year_Book_2020-21.pdf',
 'PDF/Gujarat_State_Year_Book_2021-22.pdf']

In [7]:
# Page range to extract from each year book, keyed by the year suffix that
# appears in the PDF filename (e.g. '14-15' for ..._2014-15.pdf).
# Each value is (start, stop) and is fed to range(start, stop) by the
# extraction loop, so `stop` itself is not extracted.
table_map = {
    '14-15': (182, 196),
    '15-16': (191, 209),
    '16-17': (120, 127),
    '17-18': (155, 173),
    '18-19': (138, 156),
    '19-20': (140, 154),
    '20-21': (141, 156),
    '21-22': (136, 150),
}

In [8]:
# Make sure the output directory exists before any CSV is written.
os.makedirs("table-data", exist_ok=True)

for file in files:
    # Filename without directory or extension,
    # e.g. 'Gujarat_State_Year_Book_2014-15'.
    file_year = os.path.splitext(os.path.basename(file))[0]

    # Pick the page range by the year suffix in the filename rather than by
    # list position, so a missing or extra PDF cannot shift every range
    # onto the wrong file.
    pages = next(
        (
            page_range
            for suffix, page_range in table_map.items()
            if file_year.endswith(suffix)
        ),
        None,
    )
    if pages is None:
        print(f"No page range configured for {file}; skipping.")
        continue

    # NOTE(review): range() stops before pages[1], so that page is not
    # extracted. Confirm the tuples in table_map are meant to be end-exclusive.
    extracted_tables = extract_tables_from_pdf(file, list(range(pages[0], pages[1])))
    save_tables_to_csv(extracted_tables, f"table-data/output_table_{file_year}")

Error importing jpype dependencies. Fallback to subprocess.
No module named 'jpype'


Table from page 1 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_1.csv
Table from page 2 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_2.csv
Table from page 3 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_3.csv
Table from page 4 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_4.csv
Table from page 5 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_5.csv
Table from page 6 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_6.csv
Table from page 7 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_7.csv
Table from page 8 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_8.csv
Table from page 9 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_9.csv
Table from page 10 saved to table-data/output_table_Gujarat_State_Year_Book_2014-15_page_10.csv
Table from page 11 saved to table-data/output_table_Gujara

# Copy CSV files into per-year directories

In [4]:
import glob
import os
import shutil

# CSVs sitting directly in table-data/ (sub-directories are not scanned).
files = glob.glob("table-data/*.csv")
# Starting year of each year book; each becomes a sub-directory name.
dir_ls = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]

for year in dir_ls:  # named `year`, not `dir`, to avoid shadowing the builtin
    year_dir = os.path.join("table-data", str(year))
    for csv_file in files:
        # Match on the bare filename so directory names can never cause a
        # false hit, and so the result does not depend on the path separator.
        csv_name = os.path.basename(csv_file)
        if str(year) in csv_name:
            # Created lazily: a year with no matching CSV gets no directory.
            os.makedirs(year_dir, exist_ok=True)
            # Copy rather than move; the original stays in table-data/.
            shutil.copyfile(csv_file, os.path.join(year_dir, csv_name))