In [1]:
from pathlib import Path
import fitz
from tqdm import tqdm
from functions.tools import jsonHandler
import ipywidgets as widgets
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go  
from IPython.display import Markdown

In [2]:
# Paths: the working directory and the two data folders expected below it
cwd = Path.cwd()
filesPath = cwd / 'files'
pdfPath = cwd / 'pdfs'

# JSON resources under files/, each wrapped in the project's jsonHandler
# (presumably falls back to defaultContent when the file is unusable - TODO confirm)
b2find = jsonHandler(path=filesPath / 'B2Find_corrected.json', defaultContent=[])
dewey = jsonHandler(path=filesPath / 'dewey.json', defaultContent=[])
edocRI = jsonHandler(path=filesPath / 'pdfUrlList.json', defaultContent=[])
KToutput = jsonHandler(path=filesPath / 'Kairntech.json', defaultContent=[])
from IPython.display import clear_output

type: Expecting ',' delimiter: line 1 column 1202249541 (char 1202249540)

In [None]:
def _categoryKeywords(catInfo):
    """Return the keyword entries of one category: German entries first, then English.

    A language whose field holds the "-- error --" sentinel is skipped, so a
    category where both languages failed yields an empty list instead of the
    sentinel string itself.
    """
    keywords = []
    for field in ('keywords_de', 'keywords_en'):
        if catInfo[field] != "-- error --":
            keywords += catInfo[field]
    return keywords


def calcEverything(value):
    """Widget observer: score the selected PDF against every b2find category and show the result.

    Parameters
    ----------
    value : change notification passed by the widget's ``observe`` mechanism.
        Only notifications with ``type == 'change'`` and ``name == 'value'`` are
        processed; ``value.new`` is used as the path of the PDF to analyse.

    Side effects
    ------------
    * Rebinds the globals ``fileSelect`` (a freshly built dropdown, needed because
      the cell output is cleared) and ``out`` (the result dictionary).
    * Clears the cell output and displays the dropdown, the bar chart (when at
      least one category reaches a score of 1), a Markdown summary and ``out``.

    Scoring per keyword of a category:
    * full-text hit: ``count * probability / page count``
    * keyword is contained in the document's ``subjects``: +10
    * keyword equals one of the document's Dewey main categories: +2.5
    """
    global out
    global fileSelect

    # The handler may be registered for all traits; anything but a change of
    # the selected value must not clear the output or rebuild the picker.
    if not (value['type'] == 'change' and value['name'] == 'value'):
        return

    clear_output(wait=True)
    pdf = value.new

    # PDF Picker: rebuilt on every run because clear_output removes the old view
    pdfs = [(f.stem, f) for f in pdfPath.glob('**/*') if f.is_file()]
    fileSelect = widgets.Dropdown(options=pdfs, description='PDFs:', disabled=False)
    # Keep the analysed file selected so the picker matches the results shown below.
    # Done before observe() so this assignment does not trigger another run.
    if any(path == pdf for _, path in pdfs):
        fileSelect.value = pdf
    fileSelect.observe(calcEverything, names='value')

    # Load Edoc-Info; a PDF without a metadata entry cannot be scored
    researchInfo = next((x for x in edocRI if x['filename'] == pdf.stem), None)
    if researchInfo is None:
        display(fileSelect)
        print(f" {pdf.stem}: No metadata entry found - Pick another one")
        return

    # Load PDF and Fulltext; close the document even if text extraction fails
    doc = fitz.open(pdf)
    try:
        pageCount = int(doc.page_count)
        doctext = " ".join(page.get_text("text") for page in doc)
    finally:
        doc.close()

    if doctext.strip() == '':
        # Not fatal: subject and Dewey hits below can still produce a score.
        print(f" {pdf.stem}: No Text in PDF - Pick another one")

    # Dewey main categories of the document, looked up via the first element of
    # each ddc entry (assumes dewey maps that element to a category name - TODO confirm)
    deweyCats = [dewey[x[0]] for x in researchInfo['ddc']]

    # Output files
    out = {'title': pdf.stem,
           'results': [],
           'probs': {},
           'details': {},
           'modifiyer':
               {'KeywordHit': [],
                'DeweyHit': []},
           'UserKeywords': researchInfo['subjects'],
           'DeweyMainCats': deweyCats}

    probDict = {}
    for catID, catInfo in tqdm(b2find.items()):
        score = 0
        for kwDict in _categoryKeywords(catInfo):
            kw = kwDict['Keyword']
            prob = kwDict['Wahrscheinlichkeit']
            # perfect match FullText
            # NOTE(review): the hit test requires surrounding spaces, but count()
            # also counts occurrences inside longer words - confirm this is intended.
            if ' ' + kw + ' ' in doctext:
                kwCount = doctext.count(kw)
                score += float(kwCount) * float(prob) / pageCount
                out['details'][kw] = [{'number': kwCount}]
            # Perfect Author Keyword matches
            if kw in researchInfo['subjects']:
                out['modifiyer']['KeywordHit'].append(kw)
                score += 10
            # perfect Dewey matches
            if kw in deweyCats:
                out['modifiyer']['DeweyHit'].append(kw)
                score += 2.5
        probDict[catID] = score

    # cleaning: drop categories without any hit; only scores above 1 count as results
    p = {catID: score for catID, score in probDict.items() if score != 0}
    out['probs'] = p
    out['results'] = [(b2find[k]['name_de'], v) for k, v in p.items() if v > 1]

    # y-axis window for the chart; stays None when no category reaches a score of 1.
    # default=0 covers the case that nothing matched at all.
    maxProb = max(p.values(), default=0)
    if maxProb >= 1:
        ymax = maxProb + 0.1
        ymin = max(1, ymax * 0.3)
    else:
        ymin = ymax = None

    def plot():
        """Build the bar chart of all non-zero category scores with keyword side notes."""
        df = pd.DataFrame([out['probs']]).T.reset_index()
        df = df.rename({'index': "Categories", 0: "Probability"}, axis='columns')
        df['CatNames'] = df['Categories'].apply(lambda x: b2find[x]['name_de'])

        def supLines(header, items):
            # One <sup> line per item, preceded by a header line unless items == []
            text = ''
            if items != []:
                text += '<br><sup>' + header + '</sup>'
            for item in items:
                text += '<br><sup>' + item + '</sup>'
            return text

        def sideNote(text, x, hovertext):
            # Text box anchored at the bottom, positioned by x in paper coordinates
            return go.layout.Annotation(
                text=text,
                align='left',
                showarrow=False,
                xref='paper',
                yref='paper',
                xanchor='right',
                yanchor='bottom',
                x=x,
                y=0,
                bordercolor='white',
                borderwidth=1,
                hovertext=hovertext
            )

        userKeywords = supLines('UserKeywords:', out['UserKeywords'])
        deweyKeywords = supLines('DeweyCategories:', out['DeweyMainCats'])

        fig = px.bar(df, x='Categories', y='Probability', color="CatNames", width=1600, height=800)
        fig.update_layout(
            title=dict(text=out['title'],
                       font=dict(size=12)),
            annotations=[
                sideNote(userKeywords, 1.2, 'User Keywords'),
                sideNote(deweyKeywords, 1.5, 'Dewey Categories'),
            ])
        fig.update_layout(yaxis_range=[ymin, ymax])
        return fig

    # Always bring the picker back, otherwise it is gone after clear_output
    display(fileSelect)
    if ymin is not None:
        plot().show()
    else:
        print('No Results found')

    results = 'Results:  \n\n'
    for name, score in out['results']:
        results += name + ' ' + str(score) + '  \n'

    display(Markdown(results))
    display(out)

In [None]:
# PDF Picker
# Offers the first 100 files found below pdfPath (in glob order, not sorted).
glob = pdfPath.glob('**/*')
pdfs = [(f.stem, f) for f in glob if f.is_file()][:100]
fileSelect = widgets.Dropdown(options=pdfs, description='PDFs:', disabled=False)
# Observe only the selected value: without names= the handler is also invoked
# for the index and label traits, i.e. several times per selection.
fileSelect.observe(calcEverything, names='value')
fileSelect

100%|██████████| 336/336 [00:00<00:00, 371.74it/s]


Dropdown(description='PDFs:', options=(('1-band-1517-1530', WindowsPath('c:/Users/Florian Kotschka/gitlab/bua-…

Results:  

Analytische Chemie, Methodenentwicklung 1.1  
Optik, Quantenoptik Und Physik Der Atome, Moleküle Und Plasmen 1.9250000000000003  


{'title': 'automated-enzymatic-methods-for-creatinine-measurement-with-special-attention-to-bilirubin-interference',
 'results': [('Analytische Chemie, Methodenentwicklung', 1.1),
  ('Optik, Quantenoptik Und Physik Der Atome, Moleküle Und Plasmen',
   1.9250000000000003)],
 'probs': {'b2find#1.8.3': 0.0005,
  'b2find#2.3.2': 0.00175,
  'b2find#3.1.1.01': 0.625,
  'b2find#3.1.2.01': 0.0625,
  'b2find#3.1.2.05': 0.5249999999999999,
  'b2find#3.1.3.02': 0.09375,
  'b2find#3.1.3.05': 0.6000000000000001,
  'b2find#3.2.2.07': 0.40625,
  'b2find#3.2.2.11': 0.27499999999999997,
  'b2find#3.2.2.23': 0.4125,
  'b2find#3.3.1.03': 0.2,
  'b2find#3.3.2.02': 1.0,
  'b2find#4.1.4': 1.1,
  'b2find#4.1.5.02': 0.675,
  'b2find#4.2.2': 1.9250000000000003,
  'b2find#4.2.6.03': 0.05625,
  'b2find#4.4.2.02': 0.10625,
  'b2find#5.2.1.01': 0.10625,
  'b2find#5.3.1.04': 0.7875,
  'b2find#5.5.5': 0.275,
  'b2find#4': 0.43124999999999997,
  'b2find#4.1.5': 0.51,
  'b2find#3.2.2': 0.7125,
  'b2find#5.4': 0.1,
  '

40