## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path


## Load Documents

In [2]:

# Load the risk-management PDF into LangChain documents.
pdf_loader = PyPDFLoader("data_input/practice-standard-project-risk-management.pdf")
documents = pdf_loader.load()

# Chunking parameters: chunks of up to 1000 units as measured by `len`,
# with an overlap of 50 between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# Despite the name, `pages` holds the chunks produced by the splitter.
pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))


Number of chunks =  310


## Create a dataframe of all the chunks

In [3]:
from helpers.df_helpers import documents2Dataframe
# Convert the chunk list into a dataframe; the displayed shape has one row per chunk.
df = documents2Dataframe(pages)
print(df.shape)
# Preview the first rows (text, source, page, chunk_id columns).
df.head()

(310, 4)


Unnamed: 0,text,source,page,chunk_id
0,Project Management Institute \n PRACTICE STA...,data_input/practice-standard-project-risk-mana...,0,61ed0d748bbc451b936375a1fc234ca5
1,ISBN: 978-1-933890-38-8 \n Published by: \n Pr...,data_input/practice-standard-project-risk-mana...,1,2de4ef4dede34900837faea70fce4425
2,To inquire about discounts for resale or educa...,data_input/practice-standard-project-risk-mana...,1,7a3d2e5ca2364f049292d78f48f9e11e
3,"NOTICE \n The Project Management Institute, I...",data_input/practice-standard-project-risk-mana...,2,11920c4fc51a45818d697c9411f49da3
4,"special, indirect, consequential or compensato...",data_input/practice-standard-project-risk-mana...,2,28dfaaf27c3f48df809fbc4b2430d370


## Extract Concepts

In [4]:
## df2ConceptsList uses the helpers/prompt function to extract concepts from text; concepts2Df turns the result into a dataframe
from helpers.df_helpers import df2ConceptsList
from helpers.df_helpers import concepts2Df

In [5]:
# Run concept extraction on a two-row slice of the chunk dataframe (positions 10 and 11).
# NOTE(review): presumably limited to two chunks to keep the run short -- widen the
# slice (or pass `df`) to extract concepts from the whole document; confirm intent.
concepts_list = df2ConceptsList(df[10:12])

 [
      {
          "entity": "Plan Risk Management Process",
          "importance": 5,
          "category": "process"
      },
      {
          "entity": "Purpose and Objectives of the Plan Risk Management Process",
          "importance": 4.1,
          "category": "concept"
      },
      {
          "entity": "Critical Success Factors for the Plan Risk Management Process",
          "importance": 4.2,
          "category": "concept"
      },
      {
          "entity": "Identify and Address Barriers to Successful Project Risk Management",
          "importance": 4.2.1,
          "category": "concept"
      },
      {
          "entity": "Project Stakeholders",
          "importance": 22,
          "category": "organisation"
      },
      {
          "entity": "Comply with the Organization’s Objectives, Policies, and Practices",
          "importance": 4.2.3,
          "category": "concept"
      },
      {
          "entity": "Tools and Techniques for the Plan Risk Management 

In [6]:
# Turn the extracted concepts into a dataframe and preview the first rows.
dfne = concepts2Df(concepts_list)
dfne.head()

Unnamed: 0,entity,importance,category,chunk_id,type
0,project management institute,4,Organisation,d6d60a0a59b748f19fca33c4dfcf26ee,concept
1,practice standard for project risk management,5,Document,d6d60a0a59b748f19fca33c4dfcf26ee,concept
2,identify risks process,5,Process,d6d60a0a59b748f19fca33c4dfcf26ee,concept
3,early identification,4,Concept,d6d60a0a59b748f19fca33c4dfcf26ee,concept
4,iterative identification,3,Concept,d6d60a0a59b748f19fca33c4dfcf26ee,concept


### Write CSV to an output directory

Both dataframes are written in CSV format so we don't have to calculate them again.

        dfne = dataframe of named entities

        df = dataframe of chunks

In [7]:
# Persist both dataframes as pipe-separated CSV files so they do not have to be
# recomputed: `dfne` -> concepts.csv, `df` -> chunks.csv.
out_dir = 'data_output'
outputdirectory = Path(out_dir)
# mkdir(exist_ok=True) is idempotent and avoids the check-then-create race of
# `os.path.exists` followed by `os.makedirs`; parents=True keeps makedirs semantics.
outputdirectory.mkdir(parents=True, exist_ok=True)
dfne.to_csv(outputdirectory / "concepts.csv", sep="|", index=False)
df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)

## Named Entities from Concepts

**Not using this right now**

Extracting named entities out of concepts.


In [9]:
# Disabled experiment, kept for reference: runs a token-classification (NER) pipeline
# over each concept's 'entity' text and collects the results into a dataframe.
# NOTE(review): row2NamedEntities writes the key 'catetory' (sic) while the commented-out
# groupby further down expects 'category' -- reconcile the two before re-enabling this cell.

# from transformers import pipeline

# ner = pipeline("token-classification", model="2rtl3/mn-xlm-roberta-base-named-entity", aggregation_strategy="simple")
# # ner = pipeline("token-classification", model="dslim/bert-large-NER", aggregation_strategy="simple")

# def row2NamedEntities(row):
#     ner_results = ner(row['entity'])
#     metadata = {'chunk_id': row['chunk_id'], 'type': 'entity'}
#     entities = []
#     for result in ner_results:
#         entities = entities + [{'entity': result['word'], 'catetory': result['entity_group'], **metadata}]
        
#     return entities



# def dfText2DfNE(dataframe: pd.DataFrame):
#     ## Takes a dataframe from the parsed data and returns dataframe with named entities. 
#     ## The input dataframe must have a entity and a chunk_id column. 

#     ## 1. Calculate named entities for each row of the dataframe. 
#     results = dataframe.apply(row2NamedEntities, axis=1).reset_index(drop=True)

#     ## Flatten the list of lists to one single list of entities. 
#     entities_list = np.concatenate(results).ravel().tolist()

#     ## Remove all NaN entities
#     entities_dataframe = pd.DataFrame(entities_list).replace(' ', np.nan)
#     entities_dataframe = entities_dataframe.dropna(subset=['entity'])

#     ## Count the number of occurances per chunk id
#     # entities_dataframe = entities_dataframe.groupby(['entity', 'category', 'chunk_id']).size().reset_index(name='count')

#     return entities_dataframe

In [5]:

# Disabled: reads a previously saved concepts.csv and runs dfText2DfNE (commented out above) on it.
# NOTE(review): this reads from ./data/output/<dataframe_dir>/, whereas the CSV-writing cell
# saves to data_output/ -- confirm the intended location before re-enabling.

# dataframe_dir = 'OrfPathHealth'
# df_concepts = pd.read_csv(f"./data/output/{dataframe_dir}/concepts.csv", sep="|")

# dfc_split = dfText2DfNE(df_concepts)
# dfc_split

Unnamed: 0,entity,catetory,chunk_id,type
0,Mental Health,MISC,83d4d0367bb0467e811782a4ada3bbb9,entity
1,Health Equity,ORG,83d4d0367bb0467e811782a4ada3bbb9,entity
2,World Health Organization (WHO),ORG,83d4d0367bb0467e811782a4ada3bbb9,entity
3,United Nations (UN),ORG,83d4d0367bb0467e811782a4ada3bbb9,entity
4,Sustainable Development Goals (SDGs),MISC,83d4d0367bb0467e811782a4ada3bbb9,entity
...,...,...,...,...
967,National University of Singapore's Saw Swee Ho...,ORG,4dbae5e4a3ee45bdbf9d55dec8921c2c,entity
968,Associate Professor,MISC,4dbae5e4a3ee45bdbf9d55dec8921c2c,entity
969,Jeremy Lim,PER,4dbae5e4a3ee45bdbf9d55dec8921c2c,entity
970,Janice Tan,PER,4dbae5e4a3ee45bdbf9d55dec8921c2c,entity
