### Build a dataset of every paper title together with the path to its JSON file

In [51]:
import json
import glob
import pandas as pd
from os import walk
from tqdm.auto import tqdm

tqdm.pandas()

In [52]:
# Get all the files within Data/Base_JSON
# All files and directories ending with .json:

def get_file_dict(data_folder_path):
    """Build a table mapping each paper title to the JSON file it was read from.

    Walks ``data_folder_path`` recursively, loads every file whose name ends
    in ``.json`` and reads its top-level ``"title"`` field.

    Parameters
    ----------
    data_folder_path : str
        Root folder to search.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``file_path``, one row per JSON file. The frame
        is empty (but still has both columns) when no JSON file is found.

    Raises
    ------
    KeyError
        If a JSON document has no top-level ``"title"`` key.
    json.JSONDecodeError
        If a ``.json`` file does not contain valid JSON.
    """
    dic_file = {"title": [],
                "file_path": []}

    # Collect candidate paths first. Only *.json files are kept so that stray
    # files in the tree (README, .DS_Store, ...) do not crash json.load.
    # The '/' joiner is kept on purpose: it is the path format already stored
    # in the reference CSV.
    json_paths = []
    for dirpath, _dirnames, filenames in walk(data_folder_path):
        json_paths.extend(
            dirpath + '/' + filename
            for filename in filenames
            if filename.lower().endswith('.json')
        )

    for file_path in json_paths:
        # Binary mode lets the json module detect the Unicode encoding itself
        # instead of relying on the platform default; `with` guarantees the
        # handle is closed even when parsing fails.
        with open(file_path, "rb") as json_file:
            data = json.load(json_file)

        dic_file['title'].append(data['title'])
        dic_file['file_path'].append(file_path)

    return pd.DataFrame.from_dict(dic_file)

In [53]:
# Index every JSON file found under ./Data/Base_JSON by its title.
df_file_path = get_file_dict("./Data/Base_JSON")

In [54]:
# Drop papers whose title is empty. `.copy()` makes the result an independent
# frame, so the column assignments performed on it further down do not raise
# SettingWithCopyWarning (or silently write to a temporary).
df_file_path = df_file_path[df_file_path['title'] != ""].copy()

In [55]:
# Display the title / file-path table.
df_file_path

Unnamed: 0,title,file_path
1,BusTUC -A natural language bus route oracle,./Data/Base_JSON\prefixA\json\A\A00/A00-1001.json
2,Machine Translation of Very Close Languages,./Data/Base_JSON\prefixA\json\A\A00/A00-1002.json
3,Cross-Language Multimedia Information Retrieval,./Data/Base_JSON\prefixA\json\A\A00/A00-1003.json
4,Automatic construction of parallel English-Chi...,./Data/Base_JSON\prefixA\json\A\A00/A00-1004.json
5,PartslD: A Dialogue-Based System for Identifyi...,./Data/Base_JSON\prefixA\json\A\A00/A00-1005.json
...,...,...
73665,ANAPHORA RESOLUTION AS LEXICAL COHESION IDENTI...,./Data/Base_JSON\prefixY\json\Y\Y99/Y99-1032.json
73666,A Large-Vocabulary Bilingual Speech Recognitio...,./Data/Base_JSON\prefixY\json\Y\Y99/Y99-1033.json
73667,THE SYNTACTIC PROCESSING OF PARTICLES IN JAPAN...,./Data/Base_JSON\prefixY\json\Y\Y99/Y99-1034.json
73668,Automatic Selection of Synthesis Units from A ...,./Data/Base_JSON\prefixY\json\Y\Y99/Y99-1035.json


In [56]:
# Lower-case every title (presumably so that later look-ups by title can be
# case-insensitive — confirm against whatever consumes the reference CSV).
df_file_path['title'] = df_file_path['title'].str.lower()

In [57]:
# Persist the title -> file-path table; the row index is not written out.
df_file_path.to_csv(
    "./Data/Reference/reference_file_path.csv",
    index=False,
)

### Given a file path, extract the text of the requested sections

In [37]:
# Raw string: the Windows-style backslashes are meant literally. Without the
# `r` prefix, `\p`, `\j` and `\A` are invalid escape sequences (SyntaxWarning
# on recent Python versions). The resulting value is unchanged.
test_path = r"./Data/Base_JSON\prefixA\json\A\A00/A00-1001.json"

In [38]:
def get_section(file_path, section_interest, separator=""):
    """Collect the body text of selected sections from one parsed-paper JSON file.

    The file must contain ``data['pdf_parse']['body_text']``, a list of
    paragraph dicts with a ``"section"`` heading and a ``"text"`` payload.
    Headings are matched case-insensitively against ``section_interest``.

    Parameters
    ----------
    file_path : str
        Path of the JSON file to read.
    section_interest : iterable of str
        Section headings to extract, in any letter case.
    separator : str, optional
        String inserted between consecutive paragraphs of the same section.
        Defaults to ``""`` (paragraphs are concatenated directly).

    Returns
    -------
    dict
        Maps each requested heading, normalised with ``.lower().title()``
        (e.g. ``"Related Work"``), to its concatenated text. A heading that
        does not occur in the paper maps to ``""``.

    Raises
    ------
    KeyError
        If the document lacks ``pdf_parse`` / ``body_text`` or a paragraph
        lacks ``"text"``.
    """
    # lower-cased heading -> output key. Normalising the request here is what
    # makes ["Introduction"] and ["introduction"] behave identically.
    key_by_heading = {name.lower(): name.lower().title() for name in section_interest}
    collected = {key: [] for key in key_by_heading.values()}

    # Binary mode lets the json module detect the Unicode encoding itself;
    # `with` closes the handle even when parsing fails.
    with open(file_path, "rb") as json_file:
        data = json.load(json_file)

    for text_body in data['pdf_parse']['body_text']:
        # A paragraph may carry no heading (missing or null); treat it as "".
        heading = text_body.get("section") or ""
        key = key_by_heading.get(heading.lower())
        if key is not None:
            collected[key].append(text_body["text"])

    return {key: separator.join(parts) for key, parts in collected.items()}

In [39]:
# Try get_section on a single paper and show the resulting dict.
get_section(test_path, ["introduction", "conclusion"])

{'Introduction': 'A natural language interface to a computer database provides users with the capability of obtaining information stored in the database by querying the system in a natural language (NL) . With a natural language as a means of communication with a computer system, the users can make a question or a statement in the way they normally think about the information being discussed, freeing them from having to know how the computer stores or processes the information.The present implementation represents a a major effort in bringing natural language into practical use. A system is developed that can answer queries about bus routes, stated as natural language texts, and made public through the Internet World Wide Web ( http : //www. idi. ntnu. no/bustuc/).Trondheim is a small city with a university and 140000 inhabitants. Its central bus systems has 42 bus lines, serving 590 stations, with 1900 departures per day (in average). That gives approximately 60000 scheduled bus stati

### TODO: check across all files whether there are sections common to all papers

In [45]:
# For every paper, pull out its introduction and conclusion text; the result
# (one dict per row) is stored in the Intro_concl column. progress_apply shows
# a tqdm progress bar while the files are read.
df_file_path["Intro_concl"] = df_file_path['file_path'].progress_apply(
    get_section,
    args=(["introduction", "conclusion"],),
)

100%|██████████████████████████████████████████████████████████████████████████| 70101/70101 [00:39<00:00, 1777.68it/s]


In [46]:
# Display the table including the new Intro_concl column.
df_file_path

Unnamed: 0,title,file_path,Intro_concl
1,BusTUC -A natural language bus route oracle,./Data/Base_JSON\prefixA\json\A\A00/A00-1001.json,{'Introduction': 'A natural language interface...
2,Machine Translation of Very Close Languages,./Data/Base_JSON\prefixA\json\A\A00/A00-1002.json,{'Introduction': 'Although the field of machin...
3,Cross-Language Multimedia Information Retrieval,./Data/Base_JSON\prefixA\json\A\A00/A00-1003.json,{'Introduction': 'contain strings of keywords....
4,Automatic construction of parallel English-Chi...,./Data/Base_JSON\prefixA\json\A\A00/A00-1004.json,{'Introduction': 'Parallel texts have been use...
5,PartslD: A Dialogue-Based System for Identifyi...,./Data/Base_JSON\prefixA\json\A\A00/A00-1005.json,{'Introduction': 'Currently people deal with c...
...,...,...,...
73665,ANAPHORA RESOLUTION AS LEXICAL COHESION IDENTI...,./Data/Base_JSON\prefixY\json\Y\Y99/Y99-1032.json,{'Introduction': 'It is realized that pieces o...
73666,A Large-Vocabulary Bilingual Speech Recognitio...,./Data/Base_JSON\prefixY\json\Y\Y99/Y99-1033.json,"{'Introduction': '', 'Conclusion': ''}"
73667,THE SYNTACTIC PROCESSING OF PARTICLES IN JAPAN...,./Data/Base_JSON\prefixY\json\Y\Y99/Y99-1034.json,{'Introduction': 'The treatment of particles i...
73668,Automatic Selection of Synthesis Units from A ...,./Data/Base_JSON\prefixY\json\Y\Y99/Y99-1035.json,"{'Introduction': 'In past years, many studies ..."


In [47]:
# Expand the per-paper dicts stored in Intro_concl into one column per key.
# NOTE(review): df_file_path has a filtered (non-contiguous) index; whether
# json_normalize keeps that index or renumbers from 0 may depend on the pandas
# version — confirm before joining df2 back onto df_file_path.
df2 = pd.json_normalize(df_file_path['Intro_concl'])