# GROBID

In [1]:
# !pip install pygrobid

### Clone and install the GROBID Python client
`git clone https://github.com/kermitt2/grobid_client_python`<br>
`cd grobid_client_python`<br>
`python3 setup.py install`

In [2]:
! pwd

/Users/sayalidalvi/Documents/Big_data/Assignment_2/Case_Study_2/code


In [3]:
%cd ../grobid_client_python

/Users/sayalidalvi/Documents/Big_data/Assignment_2/Case_Study_2/grobid_client_python


In [4]:
! pwd

/Users/sayalidalvi/Documents/Big_data/Assignment_2/Case_Study_2/grobid_client_python


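Start the GROBID server in Docker before running the cells below; the API calls later in this notebook expect it to be reachable at `localhost:8070`.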

### Verifying the installation

In [5]:
!python3 -m grobid_client.grobid_client 

usage: grobid_client.py [-h] [--input INPUT] [--output OUTPUT]
                        [--config CONFIG] [--n N] [--generateIDs]
                        [--consolidate_header] [--consolidate_citations]
                        [--include_raw_citations] [--include_raw_affiliations]
                        [--force] [--teiCoordinates] [--segmentSentences]
                        [--verbose]
                        service
grobid_client.py: error: the following arguments are required: service


### Using Grobid Python Client

In [7]:
from grobid_client.grobid_client import GrobidClient

# Build the client from the local config file, then run full-text extraction
# over the PDFs in the test folder; the results are written to test_out.
client = GrobidClient(config_path="./config.json")
client.process(
    "processFulltextDocument",
    "./resources/test_pdf",
    output="./resources/test_out/",
    consolidate_citations=True,
    tei_coordinates=True,
    force=True,
)

GROBID server is up and running


Pros:
- Processes all the documents under the `test_pdf` folder
- Saves all of them to the output directory

Limitations:
- Saves the results as .xml files, but we need .txt files
- Does not return the XML to the program, so we cannot perform any extra processing.

### Using Grobid web service API

In [8]:
# ! curl -v --form input=@./thefile.pdf localhost:8070/api/processFulltextDocument

In [9]:
import requests

def extract_grobid_api(file_name, file_path):
    """Upload one PDF to the local GROBID service and return the response text.

    Args:
        file_name: Name attached to the file in the multipart upload.
        file_path: Path of the PDF on disk that is read and sent.

    Returns:
        The response body as a string when the server answers with HTTP 200;
        otherwise None, after printing the status code and response body.
    """
    url = "http://localhost:8070/api/processFulltextDocument"

    # Keep the file open only for the duration of the upload so the handle is
    # always released, even when the request raises.
    with open(file_path, 'rb') as pdf_file:
        files = {'input': (file_name, pdf_file)}

        # Make the POST request with the file
        response = requests.post(url, files=files)

    result = None

    if response.status_code == 200:
        print("POST request successful!")
        result = response.text
    else:
        print(f"POST request failed with status code {response.status_code}")
        print("Response:")
        print(response.text)

    return result


In [10]:
import os

pdf_directory = "../data"
output_dir = "../sample_output/Grobid"

# Make sure the output folder exists so the first write does not fail on a
# fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Iterate through all PDF files in the directory
for filename in os.listdir(pdf_directory):
    if not filename.endswith(".pdf"):
        continue

    # The output name is built from the first two hyphen-separated parts of
    # the file name (e.g. "2024-l3-topics-combined-2.pdf" -> "2024", "l3").
    # Check this before calling GROBID so one oddly named PDF neither wastes a
    # request nor aborts the whole loop with an IndexError.
    name_parts = filename.split("-")
    if len(name_parts) < 2:
        print("Skipping file ", filename, " - name does not match <year>-<level>-*.pdf")
        continue
    year, level = name_parts[0], name_parts[1]
    new_name = "Grobid_RR_"+year+"_"+level+"_combined.txt"

    pdf_file_path = os.path.join(pdf_directory, filename)
    print("Parsing file ",filename, " saved at path ", pdf_file_path)
    pdf_content = extract_grobid_api(filename, pdf_file_path)

    # save to text file
    if pdf_content:
        output_file_path = os.path.join(output_dir, new_name)
        print("Saving grobid txt file ",new_name, " at path ", output_file_path)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(pdf_content)
            print("Saved the txt file to "+output_file_path)
    else:
        print("No content for this file")
        

Parsing file  2024-l3-topics-combined-2.pdf  saved at path  ../data/2024-l3-topics-combined-2.pdf
POST request successful!
Saving grobid txt file  Grobid_RR_2024_l3_combined.txt  at path  ../sample_output/Grobid/Grobid_RR_2024_l3_combined.txt
Saved the txt file to ../sample_output/Grobid/Grobid_RR_2024_l3_combined.txt
Parsing file  2024-l1-topics-combined-2.pdf  saved at path  ../data/2024-l1-topics-combined-2.pdf
POST request successful!
Saving grobid txt file  Grobid_RR_2024_l1_combined.txt  at path  ../sample_output/Grobid/Grobid_RR_2024_l1_combined.txt
Saved the txt file to ../sample_output/Grobid/Grobid_RR_2024_l1_combined.txt
Parsing file  2024-l2-topics-combined-2.pdf  saved at path  ../data/2024-l2-topics-combined-2.pdf
POST request successful!
Saving grobid txt file  Grobid_RR_2024_l2_combined.txt  at path  ../sample_output/Grobid/Grobid_RR_2024_l2_combined.txt
Saved the txt file to ../sample_output/Grobid/Grobid_RR_2024_l2_combined.txt
