In [1]:
import docx
import pandas
import spacy

def get_text_from_docx(filename):
    """Return the paragraph text of a .docx file as one newline-joined string.

    Opens ``filename`` with ``docx.Document`` and concatenates the ``text`` of
    every entry in ``paragraphs``, separated by ``'\\n'``.
    """
    document = docx.Document(filename)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)


In [4]:
# Load the "en_core_web_md" spaCy model; `nlp` is the shared pipeline object
# that extract_entities() calls on its input text.
nlp = spacy.load("en_core_web_md")

# Append the "entityLinker" component at the end of the pipeline. Per the
# original note, the component is declared through entry_points in the
# providing package's setup.py. It supplies the `_.linkedEntities` extension
# that extract_entities() reads from each sentence.
nlp.add_pipe("entityLinker", last=True)

def _wikidata_id(entity):
    """Return the last '/'-separated segment of the entity's URL."""
    return entity.get_url().split('/')[-1]


def extract_entities(text, tagged_entities=None):
    """Run the module-level ``nlp`` pipeline on ``text`` and collect linked entities.

    Parameters
    ----------
    text : str
        Text to process with ``nlp``.
    tagged_entities : pandas.DataFrame, optional
        When given, it is joined onto the extracted entities by matching the
        entities' ``wikidata_id`` column against the index of
        ``tagged_entities``. Rows whose ``seTag`` value is missing are then
        dropped, and repeated (``text``, ``wikidata_id``) pairs are removed.
        It must provide an ``seTag`` column.

    Returns
    -------
    list of dict or pandas.DataFrame
        Without ``tagged_entities``: a list with one dict per linked entity,
        holding the keys ``text``, ``label``, ``description``,
        ``wikidata_id`` and ``superclasses`` (a list of
        ``{'wikidata_id', 'label'}`` dicts, one per super entity).
        With ``tagged_entities``: the filtered, de-duplicated DataFrame
        described above.
    """
    entity_columns = ['text', 'label', 'description', 'wikidata_id', 'superclasses']

    doc = nlp(text)

    # Walk every sentence and record each entity the linker attached to it.
    ents = [
        {
            'text': ent.get_span().text,
            'label': ent.label,
            'description': ent.description,
            'wikidata_id': _wikidata_id(ent),
            # A distinct loop name keeps the super entity from being
            # confused with the entity it belongs to.
            'superclasses': [{
                'wikidata_id': _wikidata_id(parent),
                'label': parent.label
            } for parent in ent.get_super_entities()]
        }
        for sent in doc.sents
        for ent in sent._.linkedEntities
    ]

    if tagged_entities is None:
        return ents

    # Explicit columns keep 'wikidata_id' present even when no entities were
    # found; without them the join below raises KeyError on an empty frame.
    frame = pandas.DataFrame(ents, columns=entity_columns)
    frame = frame.join(tagged_entities, on='wikidata_id').dropna(subset=['seTag'])
    return frame.drop_duplicates(subset=['text', 'wikidata_id'])

In [5]:
# Extract the newline-joined paragraph text of the example SOO document.
text = get_text_from_docx('RAD SOO for FY23 - Example SOO Only.docx')

In [28]:
# Preview the first 1000 characters of the extracted document text.
print(text[:1000])















Simplified & Guided Security Planning for Rapid ATO
Statement of Objective (SOO)

Version 2.0 
3/21/2022












INFORMATION NOT RELEASABLE TO THE PUBLIC UNLESS AUTHORIZED BY LAW:
This information has not been publicly disclosed and may be privileged and confidential. It is for internal government use only and must not be disseminated, distributed, or copied to persons not authorized to receive the information. Unauthorized disclosure may result in prosecution to the full extent of the law.



Table of Contents
Table of Contents	1
Purpose	2
Outputs	2
Problem Statement	3
Scope	3
CMS Blueprint Digital Service (Blueprint)	3
CMS Reusable Compliance Library	4
Modern Web Content:	4
Period and Place of Performance	4
Required Skills	4
Key Personnel	5
Security Requirements	5
Background	6
Objectives	7
Tasks	7
      Task 1 CMS Reusable Compliance Library	7
1a. Build Artificial Intelligence pipelines and Services	7
Candidate Components Identification Pipeline (also known as AI Pi

In [12]:
# Load the software category table from the ontology_building CSV.
software_categories = pandas.read_csv('../ontology_building/wd_software_categories.csv')

In [16]:
# Keep only the text after the last '/' of each item value. The vectorized
# .str accessor passes missing values through as NaN instead of raising, and
# the step is safe to re-run because a value without '/' is left unchanged.
software_categories['item'] = software_categories['item'].str.rsplit('/', n=1).str[-1]

In [17]:
# Display the category table to check the contents of the `item` column.
software_categories

Unnamed: 0,item,itemLabel
0,Q341,free software
1,Q485,computer virus
2,Q6368,web browser
3,Q7889,video game
4,Q9135,operating system
...,...,...
379,Q113723152,log viewer
380,Q113724486,screenshot software
381,Q114210323,Nintendo Switch emulator
382,Q114461696,cave surveying software


In [18]:
# Collapse every run of whitespace (including newlines) to a single space
# before handing the text to the entity-linking pipeline.
entities = extract_entities(' '.join(text.split()))

In [19]:
# Convert the extracted entity records into a DataFrame for filtering.
# NOTE: this rebinds `entities`; later cells expect the DataFrame form.
entities = pandas.DataFrame(entities)

In [26]:
# Build the lookup set once so each membership test is a hash lookup rather
# than a scan of a list rebuilt for every superclass of every row.
software_category_ids = set(software_categories['item'])

# Keep entities with at least one superclass that is a known software category.
entities[entities['superclasses'].apply(
    lambda superclasses: any(sup['wikidata_id'] in software_category_ids for sup in superclasses))]

Unnamed: 0,text,label,description,wikidata_id,superclasses
55,Project,Microsoft Project,Project management software,Q80336,"[{'wikidata_id': 'Q167035', 'label': 'project ..."
162,terms,terminal emulator,program that emulates a video terminal,Q1071233,"[{'wikidata_id': 'Q17155032', 'label': 'softwa..."
190,terms,terminal emulator,program that emulates a video terminal,Q1071233,"[{'wikidata_id': 'Q17155032', 'label': 'softwa..."
432,Kubernetes,Kubernetes,software to manage containers on a server-clus...,Q22661306,"[{'wikidata_id': 'Q341', 'label': 'free softwa..."
462,Processing,Processing,programming language,Q1053535,"[{'wikidata_id': 'Q9143', 'label': 'programmin..."
486,Confluence,Confluence,A collaboration software program,Q1125400,"[{'wikidata_id': 'Q474157', 'label': 'collabor..."
487,Jira,Jira,issue-tracking product developed by Atlassian,Q1359246,"[{'wikidata_id': 'Q167035', 'label': 'project ..."
488,Slack,Slack,messaging application,Q17130715,"[{'wikidata_id': 'Q35127', 'label': 'website'}..."
511,Project Manager,ProjectManager.com,Project Management Software,Q7248945,"[{'wikidata_id': 'Q167035', 'label': 'project ..."
852,Control,Control,2019 action-adventure video game developed by ...,Q54935655,"[{'wikidata_id': 'Q7889', 'label': 'video game'}]"


In [36]:
# Show every extracted entity whose matched text contains "AWS".
entities.loc[entities['text'].str.contains('AWS')]

Unnamed: 0,text,label,description,wikidata_id,superclasses
431,AWS,Amazon Web Services,subsidiary of Amazon that provides on-demand c...,Q456157,"[{'wikidata_id': 'Q658255', 'label': 'subsidia..."
1147,AWS,Amazon Web Services,subsidiary of Amazon that provides on-demand c...,Q456157,"[{'wikidata_id': 'Q658255', 'label': 'subsidia..."
