In [31]:
!pip install torch torchvision torchaudio 



In [9]:
import pandas as pd
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import uuid
import json
import ollama.client as client



splitter = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap  = 100,
    length_function = len,
    is_separator_regex = False,
)

In [10]:
from transformers import pipeline

## Roberta based NER

ner = pipeline("token-classification", model="2rtl3/mn-xlm-roberta-base-named-entity", aggregation_strategy="simple")
# ner = pipeline("token-classification", model="dslim/bert-large-NER", aggregation_strategy="simple")


print("Number of parameters ->", ner.model.num_parameters()/1000000, "Mn")


Number of parameters -> 277.456901 Mn


In [11]:
def row2NamedEntities(row):
    # print(row)
    ner_results = ner(row['text'])
    metadata = {'chunk_id': row['chunk_id']}
    entities = []
    for result in ner_results:
        entities = entities + [{'name': result['word'], 'entity': result['entity_group'], **metadata}]
        
    return entities

def dfText2DfNE(dataframe):
    ## Takes a dataframe from the parsed data and returns dataframe with named entities. 
    ## The input dataframe must have a text and a chunk_id column. 

    ## Using swifter for parallelism
    ## 1. Calculate named entities for each row of the dataframe. 
    results = dataframe.apply(row2NamedEntities, axis=1)

    ## Flatten the list of lists to one single list of entities. 
    entities_list = np.concatenate(results).ravel().tolist()

    ## Remove all NaN entities
    entities_dataframe = pd.DataFrame(entities_list).replace(' ', np.nan)
    entities_dataframe = entities_dataframe.dropna(subset=['entity'])

    ## Count the number of occurances per chunk id
    entities_dataframe = entities_dataframe.groupby(['name', 'entity', 'chunk_id']).size().reset_index(name='count')

    return entities_dataframe

In [12]:
loader = PyPDFLoader("./data/1100IJIMHS_20_98-110STUDYOFYOGAEFFECTSONHEALTH.pdf")
# loader = PyPDFDirectoryLoader("./data/kesy1dd")

pages = loader.load_and_split(text_splitter=splitter)
len(pages)


63

In [13]:

rows = []
for page in pages:
    row = {'text': page.page_content, **page.metadata, 'chunk_id': uuid.uuid4().hex}
    rows += [row]

df = pd.DataFrame(rows)


In [14]:
dfne = dfText2DfNE(df)


In [15]:
df_ne = dfne.groupby(['name', 'entity']).agg({'count': 'sum', 'chunk_id': ','.join}).reset_index()
df_ne.sort_values(by='count', ascending=False).head(100).reset_index()

Unnamed: 0,index,name,entity,count,chunk_id
0,229,Yoga,MISC,49,"045b92114cc34b53a53e3c0c22059f20,07fed70315b14..."
1,71,International Journal of Innovative Medicine a...,ORG,13,"045b92114cc34b53a53e3c0c22059f20,0f10b47b63ef4..."
2,222,Whites Science Innovation Ltd,ORG,12,"045b92114cc34b53a53e3c0c22059f20,0f10b47b63ef4..."
3,221,WHO,ORG,6,"0539c099185f4877b26322e5e9302ba1,07fed70315b14..."
4,133,Patanjali,PER,5,"0e0b2bce112b40f280ad1e718a4656ce,635a00eb33904..."
...,...,...,...,...,...
95,264,ed Complementary and Alternative Medicine,ORG,1,ab54d1c0f3c6452aa7957b2bae36998c
96,265,gmail,ORG,1,c4e5eadb60804d0ca6a57ff2552f406c
97,266,ions,ORG,1,c17dabccf0854d429d1422cc63627e5c
98,267,lam,MISC,1,7b48adc1b1d74049943ed89215e2c3a3


In [16]:
pages[12].page_content

'ISSN2056 -9866  \nCopyright © 2020 Whites Science Innovation Ltd. All rights reserved . International Journal of Innovative Medicine and He alth Science, Volume 12, 2020, 98-110 \n \n 100 In action, Yoga is a special skill which makes the mind reaches  its subtler state: “Yogah karmasu kaushalam”  \n[3] (Geeta  2.50). Yoga is dexterity in action. The dexterity is in maintaining re laxation and awareness in \naction. Relaxed action is the process. Efficiency in action is an outcome. Thus, Yoga is a skilful science of \ngaining mastery over the mid. Yoga is normally and traditionally conjectured and popularly known as a \nprocess or a techn ique to reach the ultimate state of perfection. However, yoga is found defined even as'

In [42]:


def extractConcepts(prompt: str, model='mistral-openorca:latest'):
    SYS_PROMPT = (
        "Your task is to extract the key entities mentioned in the users input.\n"
        "Entities may include - event, concept, person, place, object, document, organisation, artifact, misc, etc.\n"
        "Format your output as a list of json with the following structure.\n"
        "[{\n"
        "   \"entity\": The Entity string\n"
        "   \"importance\": How important is the entity given the context on a scale of 1 to 5, 5 being the highest.\n"
        "   \"type\": Type of entity\n"
        "}, { }]"
    )
    response, context = client.generate(model_name=model, system=SYS_PROMPT, prompt=prompt)
    return json.loads(response)


In [30]:
res = extractConcepts(prompt = pages[22].page_content)

An error occurred: 404 Client Error: Not Found for url: http://localhost:11434/api/generate


TypeError: the JSON object must be str, bytes or bytearray, not NoneType

NameError: name 'res' is not defined