***
***DEPENDENCIES***
***

In [None]:
# https://spacy.io/usage/v3
import spacy
import json
import random
from spacy import displacy
from IPython.display import HTML as html_print
import pprint
import requests
# Fetch the small English pipeline via an IPython shell escape, then load it as the shared `nlp` object.
!python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [None]:
%%javascript
// Raise the output area's auto-scroll threshold to 15000 (presumably so long outputs are shown in full instead of a scroll box; confirm).
IPython.OutputArea.auto_scroll_threshold = 15000;

***
***
***

***
***PROPERTY MINING CODE***
***

***
***HELPERS***
***

In [None]:
def visualize(doc):
    """Render the dependency parse of ``doc`` inline in the notebook."""
    # https://spacy.io/api/top-level#displacy_options
    render_options = dict(compact=True, distance=100)
    displacy.render(doc, style="dep", jupyter=True, options=render_options)
    
def describe(sentence):
    """Parse ``sentence``, draw its dependency tree and print the parse as JSON."""
    parsed = nlp(sentence)
    visualize(parsed)
    as_json = parsed.to_json()
    print(json.dumps(as_json, indent=1))
    
def first_or_none(lst):
    """Return the first element of ``lst``, or ``None`` when ``lst`` is empty."""
    if len(lst) == 0:
        return None
    return lst[0]

def flatten(lst):
    """Concatenate the items of every sub-iterable in ``lst`` into one list (one level deep)."""
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat

***
***CONSTANTS***
***

In [None]:
# Lower-cased surface forms of the verbs that the miners pass to mine() as the verb filter.
COPULAS = ["is", "are", "were"]

# (attribute, value) pairs; a child token of the verb that matches any pair is treated as a predicate.
VALID_PREDICATE_K_V_PAIRS = [("dep", "acomp"), ("dep", "amod"), ("dep", "attr")]

# Values of a token's "pos" field that get_verbs() accepts as verbs.
VERB_TAGS = ["AUX", "VERB"]


WORDCRAFT_WORDS = "Mediterranean,abdicate,abhor,abhorrent,abstract,acanthocephalan,accredit,accreditation,acephalous,acholia,achromatic,acousticophobe,acousticophobia,acquire,acquisition,acrokinesia,acrolect,acrolith,acronym,acropathy,acrophobe,acrophobia,acuity,acumen,acupuncture,acupuncturist,acute,adenitis,adenoma,aerobic,aerolith,aeronaut,aeronautical,aeronautics,aeropathy,aerophobe,aerophobia,aestivation,affluence,aggravate,agnostic,agnosticism,agoraphobe,agoraphobia,albedo,albinism,albino,album,albumen,algophobe,algophobia,alienable,allegory,allergen,allergic,allergy,alleviate,alliteration,allonym,allopatric,altimeter,ambidextrous,amnesia,amorphous,anaerobic,analgesic,anarchic,anarchy,android,anesthetic,angiopathy,anhedonia,animal,animate,animation,animism,animosity,anisogamy,annihilate,annihilation,anniversary,annual,annually,annull,anonymous,anosmia,antebellum,antediluvian,anthophile,anthophilia,anthozoa,anthracemia,anthropogeny,anthropoid,anthropology,anthropophobe,anthropophobia,anthroposophical,anthroposophist,anthroposophy,antifungal,antigen,antineutrino,antipsychotic,antonym,apathy,apiary,apiculture,apnea,apocarps,apocarpy,appendage,appendices,appendicitis,appendix,apteral,apterous,apteryx,aquaculture,aquanaut,aquifer,arachnophobe,arachnophobia,arboriculture,archaeoastronomy,archaeology,archaic,archosaur,argonaut,aristocracy,armadillo,arteriopathy,arthralgia,arthritis,arthropod,arthroscope,artiodactyl,assimilate,asterisk,asteroid,astral,astrobiology,astrocyte,astrolabe,astrologer,astrology,astronaut,astronomer,astronomy,asymmetry,asymptote,athlete,athletic,atom,atrophy,audible,audience,audiogenic,audiogram,audiogram,audiologist,audiology,audiometer,audiometry,audiopathy,audiophile,audiovisual,auditorium,autocracy,autocrat,autograph,autohypnosis,autohypnotic,automatic,automobile,autonomous,autophagy,autotomy,autotroph,avian,aviary,aviation,aviator,aviatrix,aviculture,bacteriacide,bacteriology,bankrupt,baritone,barometer,basilect,batholith,bathometer,bath
ynaut,belligerent,benediction,benefactor,beneficial,beneficiary,benefit,benevolence,benevolent,benign,benthic,biannual,biathlete,biathlon,bibliographer,bibliography,bibliology,bibliomancy,bibliomania,bibliomaniac,bibliophile,bibliophilia,bibliophobe,bibliophobia,bibliotherapy,bicentennial,bicephalous,biceps,bicycle,bident,biennial,bilateral,bilingual,bilith,bimeter,binary,binomial,biocide,biology,biome,biomimetic,bioscope,biped,bipod,bisect,bisector,brachialgia,brachiopod,brachycephalic,brachypterous,bradycardia,breviary,brevity,bryophyte,bryozoa,bureaucracy,bursitis,cacodemon,cacography,cacophonous,cacophony,calligrapher,calligraphy,canine,capital,captivating,captivity,capture,cardiac,cardinal,cardiogenic,cardiogram,cardiograph,cardioid,cardiologist,cardiology,cardiopathy,carnivore,carnivorous,carpophore,carpospore,cavity,celestial,celestine,cellulitis,centennial,centimeter,centipede,centrifuge,centroid,century,cephalalgia,cephalic,cephalization,cephalochordate,cephalometry,cephalopod,cephalothorax,cerebropathy,cessation,cetology,chaeta,chela,chelate,chelicera,cheliped,chemolithoautotroph,chemotroph,chiliad,chiliagon,chiliarch,chiliasm,chiliast,chilitis,chiral,chirality,chirographer,chirography,chiromancer,chiromancy,chiropodist,chiropractor,chiropteran,chirotherium,chlorosis,chondroma,chondropathy,chord,chordate,chromatophore,chromium,chromogen,chromophobe,chromosome,chromosphere,chronic,chronolect,chronology,chronometer,chronophobe,chronophobia,chrysalis,chrysanthemum,chrysolite,chrysotherapy,circuit,circuitous,circular,circumambulate,circumference,circumflex,circumfluous,circumlocution,circumscribe,circumscribed,circumspect,circumstances,circumstantial,circumstellar,circumvent,circus,cirrhosis,civic,civil,civility,civilization,civilize,clamor,clarification,clarify,clarity,claustrophobe,claustrophobia,cnidarian,codependency,codependent,coleoptera,colitis,colloquial,colloquium,colotomy,comet,commensalism,complacency,complacent,compunction,concave,confluence,congre
gate,congregation,congruent,conifer,conscience,conscientious,conscious,consciousness,constellation,contact,contagious,contemporaneous,contemporaneously,contemporary,contemporize,contiguous,contingency,contingent,contort,contortion,contradict,contradiction,contradictory,convex,coronation,coronavirus,cosmetics,cosmetology,cosmologist,cosmology,cosmonaut,cosmopolis,cosmopolitan,craniectomy,credence,credential,credible,credit,creditor,credo,credulity,creed,crocodile,crucifer,cruciform,crypsis,cryptic,cryptogenic,cryptogram,cryptographer,cryptography,cryptolect,cryptology,cryptonym,cryptozoa,cryptozoologist,cryptozoology,cuboid,culpable,cuneiform,cyanosis,cyclamen,cyclical,cyclist,cycloid,cyclometer,cyclone,cyclops,cyclorama,cynic,cynic,cynical,cynical,cynocephalic,cynocephalus,cynocephaly,cynomorpha,cynophile,cynophilia,cynophobe,cynophobia,cystitis,cystocarp,cystolith,cytopathy,cytophotometry,cytoplasm,cytosis,dactyl,dactylic,dactylitis,dactylography,dactylology,dactylomegaly,dactylus,decade,decagon,decahedron,decaliter,decalogue,decapod,decathlete,decathlon,december,decet,decibel,decigram,decimal,decimate,decimeter,declaration,decrypt,demagogue,democracy,democratic,demography,demonology,demotic,dendriform,dendrite,dendritic,dendrochronologist,dendrochronology,dendrogram,dendroid,dendrology,denticle,dentifrice,dentition,dentoid,dentophobe,dentophobia,denture,depend,dependability,dependable,dependence,dependency,dependent,dermatitis,dermatoglyph,dermatologist,dermatology,dermatopathy,dermatophyte,dermatotrophy,dermoptera,detritivore,detritivorous,devastate,devastation,dexterity,dexterous,dextral,dextrocardia,diagnose,diagnosis,diagnostician,diagonal,dialect,diatom,dictate,dictator,diction,dictum,didactyl,diffraction,diffractometer,digit,digital,digitigrade,dignity,dipsomania,dipsosis,diptera,dipteral,dipteran,disaster,discord,discredit,disruption,dissimilar,distortion,diurnal,dodecagon,dodecahedron,dodecaphonic,dolichocephalic,dolphin,domestication,dormancy,dormant,dorm
ant,dormitory,dormouse,dorsal,dubious,duet,duplicate,dyspnea,dystopia,eccentric,eccentricity,echinoderm,ecocide,ecolect,ectotherm,egocentric,egomania,egotistical,egregious,electroencephalogram,electroencephalographer,electroscope,elevation,elevator,eloquence,eloquent,elucidate,emigrant,emigrate,emigration,emotion,emotional,encephalitis,encephalitogen,encephalitogenic,encephalopathy,enchiridion,encrypt,encryption,encyclical,encyclopedia,encyclopedic,endocardium,endocarp,endocrinosis,endodontist,endolithic,endoparasite,endoparasitic,endoscope,endoskeleton,endosymbiosis,endotherm,entomology,eolithic,epicarp,epidemic,epidemiology,epidermis,epilithic,equality,equanimity,equator,equilateral,equinoctial,equinophile,equinophilia,equinophobe,equinophobia,equinox,eruption,eschatology,estivation,ethnocentrism,ethnolect,eukaryote,eulogy,euphony,eupnea,euthanasia,eutrophication,exculpate,exhalation,exobiology,exodontist,exodus,exonerate,exophthalmic,exorbitant,exoskeleton,expatriate,expedient,extemporaneous,extemporaneously,extortion,extragalactic,extraneous,extraterrestrial,extricate,extrovert,facsimile,factoid,ferret,ferry,finite,flora,floriation,floriculture,florid,floriferous,florigenic,florist,flu,fluctuate,fluctuation,fluency,fluent,fluid,fluidity,flux,folivore,foraminafer,fortuitous,fortuity,fortunate,fortune,fossiliferous,fractal,fraction,fractionation,fractious,fragile,fragment,fragmentary,fragmentation,fructify,fructivore,fructose,fungal,fungicide,fungiform,fungistatic,fungivore,fungoid,fusiform,galactagogue,galactic,galactic,galactose,galaxy,galaxy,gamete,gastric,gastritis,gastrolith,gastronome,gastronomy,gastropathy,gastropod,gastroscope,genome,genotype,geocentric,geography,geologist,geology,geomancer,geomancy,geometry,geomorphology,geophile,germicide,germophobe,germophobia,gerontocracy,gerontology,glossophobe,glossophobia,glyph,gnosis,gnostic,gnosticism,granolithic,graphology,gregarious,gyroscope,halitosis,halophile,hedonism,hedonist,helicoid,helicopter,heliocentric
,heliograph,heliometer,heliophile,heliophilia,heliophobe,heliophobia,heliotherapy,heliotrope,helium,hemiptera,hemopathy,hemophilia,hemophobe,hemophobia,hemovore,hepatalgia,hepatopathy,heptagon,heptameter,heptarchy,herbal,herbalist,herbarium,herbicide,herbivore,herbivorous,heterochromatic,heterodox,heterogenous,heteropod,heteroptera,heteropterous,heterotroph,hexadecimal,hexagon,hexahedron,hexameter,hexapod,hexarchy,hibernation,hierarchy,hieroglyph,hippodrome,hippopotamus,histogram,homeostasis,homicide,hominoid,homogenous,homonym,homoptera,homopterous,horoscope,horrendous,horrible,horrid,horrific,horrifying,horror,horticulture,hydrocephaly,hydrology,hydromancy,hydrophilia,hydrophobe,hydrophobia,hydrophone,hydrosphere,hypermania,hyperpnea,hypertrophy,hypnagogic,hypnalgia,hypnoid,hypnophobe,hypnophobia,hypnosis,hypnotherapist,hypnotherapy,hypnotic,hypnotically,hypnotism,hypnotist,hypnotizable,hypnotize,hypodermic,hypodermic,hypomania,hypopnea,hypothalamus,hypothermia,iatrogenic,iatrophobe,iatrophobia,ichthyology,ichthyoplankton,ichthyosaur,iconic,iconoclast,icosahedron,idiolect,idiolect,idiophone,ignominy,illegality,illegitimacy,illegitimate,illiteracy,illiterate,illuminate,immigrant,immigration,immobile,immortal,immutable,impediment,impend,impending,imperturbable,implacable,inalienable,inanimate,inaudible,incessant,incivility,incredible,incredulous,independent,indict,indictment,indomitable,indubitable,inequality,inextricably,infinite,influence,influenza,influx,infrared,infrasonic,inhalation,injustice,innate,innocuous,inquiry,inquisition,inquisitive,inquisitor,insanity,inscribed,insecticide,insectivore,insectivorous,insomnia,insomniac,intact,intangible,integer,integrity,interdependent,intergalactic,intermediary,interment,interplanetary,interregnum,intersect,intersection,interstellar,intractable,intravenous,intrepid,introspection,introvert,invertebrate,inveterate,irrelevance,irrelevancy,irrelevant,isochrone,isogamy,isogloss,isogon,isograft,isogram,isometric,isopod,isosce
les,isothermal,jurisdiction,juvenile,kakistocracy,kaleidoscope,keratin,keratosis,kilometer,kleptocracy,kleptomania,kleptomaniac,kryolith,lactase,lactation,lacteal,lacteous,lactic,lactose,laryngoscope,leaven,legal,legalize,legerdemain,legislate,legislation,legislator,legislature,legitimacy,legitimate,legitimation,legitimize,lepidoptera,lepidopterologist,lepidopterology,lepidopterophile,lepidopterophobe,levant,levee,lever,leverage,levitate,levitation,levity,levy,lignophagia,lingual,linguine,linguist,linguistics,literacy,literal,literally,literary,literate,literati,literature,lithemia,lithemia,lithic,lithify,lithium,lithograph,lithographer,lithographic,lithography,lithoid,lithology,lithophane,lithophone,lithophyte,lithosphere,lithotomy,loquacious,lucid,lucifer,luminary,luminiferous,luminosity,luminous,lunatic,macrocosm,macroeconomics,macrofossil,macrofungi,macrolepidoptera,macromolecule,macrophotography,macroscopic,magnanimous,magnificent,maladjusted,malaise,malaria,malcontent,malediction,malefactor,maleficent,malevolence,malevolent,malfeasance,malfunction,malice,malicious,malignant,malnutrition,malodorous,malpractice,maltreatment,manacles,manicure,marinate,marine,mariner,matriarch,matriarchy,matrilect,matrimony,matriphagous,matriphagy,matronym,median,mediation,mediocre,mediterranean,megabat,megalith,megalithic,megalomania,megalomaniac,megaphone,megapod,megatherium,megavirus,melancholy,melanin,mellifluous,mesencephalon,mesocarp,mesoderm,mesograzer,mesolect,meson,mesopotamia,mesosphere,metamorphosis,meteoroid,microbat,microbe,microbiology,microcephalic,microcephaly,microcosm,microeconomics,microfauna,microflora,microfossil,microfungi,microgram,micrograph,microlepidoptera,microlith,microlithic,micron,microorganism,microphone,microphotography,microphotometer,microscope,microscopic,microsecond,microtome,migrate,migration,migratory,militancy,millennium,millimeter,millipede,mime,mimeograph,mimesis,mimetic,mimic,mimicry,misanthrope,misfortune,misnomer,mnemonic,mob,mobile,mobi
lity,mobilization,mobilize,mobocracy,mollify,monarch,monarchist,monarchy,monism,monocarp,monochromatic,monoculture,monogamy,monogastric,monogon,monogon,monogram,monograph,monolingual,monolith,monolithic,monologue,monomania,monomaniac,monomorphic,monophonic,monopod,monopoly,monosyllabic,monotheism,monotheistic,moribund,morpheus,morphine,mortality,mortification,motile,motility,motion,motivate,motivation,motive,motorcycle,multilateral,multilingual,murophile,murophilia,murophobe,murophobia,mutagen,mutant,mutation,mycoflora,mycology,mycorrhiza,myocardium,myopathic,myopathy,myrmecophile,myrmecophilia,myrmecophobe,myrmecophobia,nascent,natal,naumachy,nausea,nauseate,nauseating,nauseous,nautical,nautilus,nebula,nebular,nebulize,nebulous,necrology,necromancer,necromancy,necrophagous,necropolis,necropsy,necrosis,nectar,nectarivore,nekton,nektonic,neolithic,neologism,neonatal,neoorthodoxy,neophile,neophilia,neophobe,neophobia,neophobic,neophyte,neuter,neutral,neutrality,neutralization,neutralize,neutrino,neutron,nihilism,noctambulism,noctambulist,nocturnal,nomenclature,nominate,nonagon,nonet,nosologist,nosology,notochord,noxious,nullification,nullify,nyctophile,nyctophilia,nyctophobe,nyctophobia,obliterate,obliteration,octagon,octahedron,octal,octameter,octant,octarchy,octave,octet,octogenarian,octopus,octuple,octuplet,odometer,oligarch,oligarchy,oligochaete,oligopoly,ombrometer,omniarch,omnidirectional,omnipotent,omnipresent,omniscience,omniscient,omnivore,omphalitis,omphalos,omphaloskepsis,oneiric,oneirology,oneiromancy,oneironaut,oneirophobe,oneirophobia,onerous,ontogeny,ontology,ophthalmitis,ophthalmologist,ophthalmology,opthalmoscope,orb,orbit,orbital,ordinal,ornithocopter,ornithologist,ornithology,ornithomimid,ornithopod,ornithopter,ornithopter,orthodontist,orthodox,orthodoxy,orthogenesis,orthogonal,orthograde,orthographic,orthography,orthopedics,orthoptera,orthopteran,oscilloscope,ossicle,ossify,ossuary,osteoarthritis,osteology,osteoma,osteophagy,osteoporosis,osteoporos
is,osteotomy,osteotomy,otolith,otology,otoscope,oval,oviform,oviparous,pachyderm,pachydermia,pacific,pacification,pacifism,pacifist,pacify,pandemic,pandemonium,pandoravirus,pangram,panic,panophobe,panophobia,panopticon,pansophism,pantheon,pantomime,paradox,paralect,parapodia,parasite,parasitism,parasol,parasomnia,pathogen,pathological,patriarch,patriarchy,patrician,patrilect,patrilineal,patrimony,patriot,patriotic,patriotism,patristics,patron,patronage,patronym,pedal,pedestal,pedestrian,pediculicide,pedicure,pediform,pedometer,pelagic,pendant,penguin,pentagon,pentahedron,pentameter,pentarchy,pentathlete,pentathlon,perennial,pericardium,pericarp,pericarp,period,periodic,periodontist,periscope,perissodactyl,perpendicular,pesticide,petrified,petrify,phenotype,philanthropist,philanthropy,philodendron,philologist,philology,philosopher,philosophy,phonautograph,phonograph,photoautotroph,photobiology,photochromism,photolithograph,photophobe,photophobia,photosphere,photosynthesis,photosynthetic,phototroph,phylogeny,physiopathology,pisciculture,pisciform,piscivore,piscivorous,placate,placebo,placid,placoderm,planet,planetarium,planetary,plankton,plateau,platelet,platyhelminth,platypus,platyrrhine,platysma,pneuma,pneumatic,pneumatology,pneumatophore,pneumograph,pneumonectomy,pneumonia,pneumostome,podiatrist,podiatry,podium,podocarp,podomancy,poison,polyandry,polyandry,polyatomic,polycarpy,polycentrism,polychaete,polychotomy,polychromatic,polydactylic,polydipsia,polygala,polygalactia,polygamy,polygastric,polygenesis,polyglot,polygon,polygraph,polygyny,polyhedron,polymer,polymorphic,polymorphism,polyphagous,polyphonic,polyphony,polypnea,polypod,polysemous,polysyllabic,polysynaptic,polytheism,porcupine,porpoise,postbiblical,posthypnotic,potable,potion,prebiblical,predict,predictable,prediction,predictive,prerequisite,prescient,primacy,primal,primarily,primary,primate,primatologist,primatology,prime,primer,primeval,primitive,primitivism,primogeniture,primordial,prism,proclamation,
progeny,prognosis,prognostic,prognosticate,prognosticator,prokaryote,protocol,protoderm,protogalaxy,protohistorian,protohistory,protohuman,proton,protophyta,protoplanet,protoplasm,protopod,protostar,prototype,protozoa,protozoologist,protozoology,pseudepigraphy,pseudomorph,pseudonym,pseudopod,pseudorandom,pseudoscience,pseudoscope,pseudoscorpion,psychiatric,psychiatry,psycholinguistics,psychosis,psychosomatic,psychotic,psychotomimetic,pteranodon,pterodactyl,pteropod,pterosaur,pterygoid,punctual,punctuate,punctuation,puncture,pyrogen,pyromancy,pyromania,pyromaniac,pyrometer,pyrophobe,pyrophobia,quadrangle,quadrant,quadraphonic,quadrennium,quadricentennium,quadriceps,quadricycle,quadrilateral,quadrillion,quadripartite,quadrivium,quadruped,quadrupedal,quadruple,quantile,quartet,quartic,quartile,querulous,query,quintet,quintic,quintile,radiation,radiology,radiometer,ramiform,recyclable,recycle,reflux,regal,regent,regicide,regolith,reincarnation,relevance,relevancy,relevant,relief,relieve,renaissance,renal,reniform,renogram,renography,require,requirement,respiration,retardant,retraction,retrospective,rhinencephalon,rhinitis,rhinoceros,rhinoceros,rhinoplasty,rhinoscope,rhinovirus,rhizoid,rhizome,rhizomorph,rhizopus,rhizosphere,rhodium,rhododendron,rhynchocephalian,rhynchosaur,rodenticide,rotifer,rotiform,rupture,sane,sanitarium,sanitary,sanitation,sanity,saprophage,saprotroph,sarcocarp,sarcophagus,scalar,schizocarp,schizogony,schizoid,schizont 
,schizophrenia,schizophrenic,schizopod,sclerometer,section,sediment,sedulous,segregation,selenium,selenocentric,selenography,selenologist,selenology,selenophobia,semiannual,senator,senectitude,senescence,senile,senility,senior,seniority,sentient,septet,sextet,similarity,similitude,sinecure,sinistrodextral,sociolect,sociology,solar,solarium,solarize,soliloquy,solitude,solstice,somatic,somnambulant,somnambulate,somnambulism,somnambulist,somnifacient,somniferous,somniloquy,somnolence,somnolent,spatiotemporal,spectroscope,spherical,spirochaete,squirrel,statolith,stellar,stellification,stereoscope,stereoscopic,stethoscope,stratiform,stultify,subaudible,sublingual,subterranean,superfluous,supernova,suspend,suspenders,suspense,suspension,symbiont,symbiosis,symbiotic,symmetry,sympatric,symphony,symposium,syncarp,syndactylism,synonym,tachycardia,tachyometer,tachyon,tacit,taciturn,tact,tactile,tangent,tangential,tangible,tardigrade,tardyon,taxidermy,technophile,technophilia,technophobe,technophobia,telegram,telegraph,telegraphist,telekinesis,telemeter,telepath,telepathy,telephone,teleport,teleportation,telescope,television,temporal,temporality,temporalize,temporarily,temporary,temporize,tenacious,tenacity,teratogen,terrace,terrain,terrarium,terrestrial,terrier,territorial,territory,tetrachord,tetrad,tetragon,tetragrammaton,tetrahedron,tetralogy,tetrameter,tetrarchy,thalamus,theocentric,theocracy,theologian,theology,thermoformation,thermometer,thermophile,thermophilia,thermophobe,thermophobia,thermoscope,timidity,timorous,titaniferous,topolect,toponym,torque,torsion,tortrix,tortuous,toxicology,transfer,transliteration,trepidation,triad,triangle,triarchy,triathlete,triathlon,tricephalic,triceps,triceratops,trichoptera,tricycle,trident,triennial,trigonometry,trihedron,trilingual,trilith,trilogy,trimeter,trio,triple,triplet,triplicate,tripod,triskadekaphobe,triskadekaphobia,triskelion,trivia,trivial,turbulence,tyrannicide,ultrasonic,unanimity,unanimous,uncivilized,unfortunate,ung
uligrade,unicellular,unicycle,uniform,unilateral,unorthodox,unpatriotic,unpredictable,unsanitary,untenable,utopia,valedictorian,variable,variance,variform,vast,vector,velocipede,venerable,veneration,ventriloquist,ventriloquy,veracity,verdict,verify,verisimilitude,vermicide,vermiculture,vermiform,vermin,vertebrate,vertex,vertical,veteran,viniculture,viniferous,virulence,virulent,virus,voracious,vulcanology,xenobiotic,xenogamy,xenogenesis,xenolect,xenolith,xenology,xenon,xenophile,xenophilia,xenophobe,xenophobia,xeroderma,xerophagy,xerophile,xerophthalmia,xerophyte,xylocarp,xyloid,xylophage,xylophagous,xylophagy,xylophobia,xylophone,xylotomy,zodiac,zoogony,zoology,zoonosis,zoopathology,zoophobe,zoophobia,zygodactyl,zygote".split(",")

# SELECT (value) FROM morpheme WHERE is_root=False AND obscurity < 4 AND curriculum_id='5c110cfc6aa666a547278e98' ORDER BY value ASC;
WORDCRAFT_WORDS_EASY = 'acousticophobe,acousticophobia,acronym,acrophobe,aeronautical,aerophobe,aerophobia,agoraphobe,agoraphobia,algophobe,algophobia,allergic,allergy,alligator,ambidextrous,amorphous,anarchic,anarchy,android,animal,animation,animism,anniversary,annual,annually,annull,anonymous,antebellum,anthophile,anthophilia,anthropoid,anthropology,anthropophobe,anthropophobia,antifungal,antonym,aquaculture,aquanaut,arachnophobe,arachnophobia,arboriculture,archaic,asterisk,asteroid,astral,astrologer,astrology,astronaut,astronomer,astronomy,athlete,athletic,audible,audience,audiophile,audiovisual,auditorium,autograph,autohypnosis,autohypnotic,automatic,aviation,aviator,aviculture,bacteriacide,bacteriology,benefactor,beneficial,beneficiary,benefit,benevolence,benevolent,benign,biannual,biathlete,biathlon,bibliographer,bibliography,bibliology,bibliomania,bibliomaniac,bibliophile,bibliophilia,bibliophobe,bibliophobia,bibliotherapy,bicentennial,bicephalous,biceps,bicycle,bident,biennial,bilateral,bilingual,binary,biology,biped,bipod,carnivorous,cavity,centennial,centimeter,centipede,century,chiropodist,chronometer,chronophobe,chronophobia,circumference,claustrophobe,claustrophobia,contemporaneously,contemporary,contemporize,contort,contortion,cosmetics,cosmetology,cosmologist,cynophile,cynophilia,cynophobe,cynophobia,decade,decagon,decaliter,decathlete,decathlon,december,decibel,decigram,decimal,decimeter,decrypt,democracy,democratic,dendrology,dentophobe,dentophobia,denture,dermatologist,dermatology,digit,digital,disaster,dissimilar,duet,duplicate,elevation,elevator,equality,equilateral,equinophile,equinophilia,equinophobe,equinophobia,ethnocentrism,factoid,ferry,finite,floriculture,florist,fractal,fraction,fragile,fragment,fragmentary,fragmentation,fructivore,fungal,fungicide,fungivore,geography,geologist,geology,geometry,germicide,germophobe,germophobia,giant 
armadillo,hedonism,hedonist,helicopter,heliophile,heliophilia,heliophobe,heliophobia,hemophobe,hemophobia,herbal,herbalist,herbarium,herbicide,herbivore,herbivorous,hexagon,hexapod,hieroglyph,homicide,homogenous,homonym,horrible,horticulture,hydrophilia,hydrophobe,hydrophobia,hypnophobe,hypnophobia,hypnosis,hypnotherapist,hypnotherapy,hypnotic,hypnotically,hypnotism,hypnotist,hypnotizable,hypnotize,iatrophobe,iatrophobia,illuminate,immortal,immutable,inanimate,inaudible,incredulous,inequality,infinite,insecticide,insectivore,insectivorous,insomnia,insomniac,intermediary,invertebrate,kilometer,levee,lever,leverage,levitate,levitation,levity,lithology,macroeconomics,macrofossil,macrofungi,macromolecule,macrophotography,maladjusted,malcontent,malnutrition,malodorous,malpractice,maltreatment,manicure,mediterranean,megabat,megalith,megaphone,microbat,microbiology,microeconomics,microfossil,microfungi,microgram,micrograph,micron,microorganism,microphone,microscope,microscopic,microsecond,millennium,millimeter,monolingual,monologue,monopod,monosyllabic,multilateral,murophile,murophilia,murophobe,murophobia,mutant,mutation,myrmecophile,myrmecophilia,myrmecophobe,myrmecophobia,nectarivore,neophile,neophilia,neophobe,neophobia,nonagon,nonet,noxious,nyctophile,nyctophobe,nyctophobia,octagon,octahedron,octave,octet,octopus,octuple,octuplet,omnidirectional,omnipotent,omnipresent,omniscience,omniscient,omnivore,orthodontist,orthodox,orthodoxy,oval,panophobe,panophobia,paradox,patriot,patriotic,patriotism,pedal,pedestal,pedestrian,pedicure,pentagon,pentathlete,pentathlon,pesticide,philosopher,philosophy,photophobe,photophobia,piscivore,piscivorous,plateau,poison,polycentrism,polygon,polysyllabic,polysynaptic,postbiblical,potion,prebiblical,predict,predictable,prediction,primacy,prism,protogalaxy,protohistorian,protohistory,protohuman,protoplanet,protostar,prototype,pseudorandom,pseudoscience,pseudoscorpion,pyrophobe,pyrophobia,quadrangle,quadrant,quadrennium,quadricentennium,quadr
iceps,quadricycle,quadrilateral,quadrillion,quadripartite,quadruped,quadrupedal,quadruple,quartet,quintet,regal,regent,regicide,relevant,relief,relieve,rodenticide,section,semiannual,senator,senile,senility,senior,seniority,septet,similarity,sociology,solitude,spatiotemporal,subaudible,subterranean,taxidermy,technophile,technophilia,technophobe,technophobia,telegram,telegraph,telegraphist,telephone,teleport,teleportation,telescope,television,temporary,terrain,territory,thermometer,thermophile,thermophilia,thermophobe,thermophobia,transfer,triad,triangle,triathlete,triathlon,triceps,triceratops,tricycle,trident,triennial,trigonometry,trilingual,trilogy,triple,triplet,triplicate,tripod,turbulence,tyrannicide,unicellular,uniform,unilateral,unorthodox,unpatriotic,unpredictable,ventriloquist,ventriloquy,vermicide,vermin,vertebrate,vertical,veteran,voracious,xenophile,xenophilia,xenophobe,xenophobia,xylophage,zoology,zoophobe,zoophobia'.split(",")

# Words listed here are dropped from the matches returned by wordcraft_words_in(); empty means no filtering.
FILTER_WORDCRAFT_WORDS = []

***
***DEPENDENCY TREE PARSING METHODS***
***

In [None]:
def children_for(json_doc, head_id, recursive=False):
    """Return the tokens whose head is ``head_id``.

    A token that is its own head is never reported as its own child. With
    ``recursive`` set, the direct children come first, followed by the full
    subtree of each child in turn.
    """
    direct = [
        token
        for token in json_doc["tokens"]
        if token["head"] == head_id and token["id"] != head_id
    ]
    if not recursive:
        return direct

    descendants = list(direct)
    for child in direct:
        descendants.extend(children_for(json_doc, child["id"], True))
    return descendants

def find_token_by_attr(tokens, attr, value):
    """Return the first token whose ``attr`` equals ``value``, or ``None`` if none does."""
    matches = []
    for token in tokens:
        if token[attr] == value:
            matches.append(token)
    return first_or_none(matches)

def filter_tokens_by_attr(tokens, attr, value):
    """Return the tokens whose ``attr`` is NOT equal to ``value`` (an exclusion filter)."""
    kept = []
    for token in tokens:
        if token[attr] != value:
            kept.append(token)
    return kept

def filter_tokens_by_attrs(tokens, k_v_pairs):
    """Drop every token that matches any ``(attr, value)`` pair in ``k_v_pairs``."""
    remaining = tokens
    for pair in k_v_pairs:
        attr, unwanted = pair
        remaining = filter_tokens_by_attr(remaining, attr, unwanted)
    return remaining

def select_tokens_by_attr(tokens, attr, value):
    """Return the tokens whose ``attr`` equals ``value``, in their original order."""
    return list(filter(lambda token: token[attr] == value, tokens))

def select_tokens_by_attrs(tokens, k_v_pairs):
    """Collect the tokens matching each ``(attr, value)`` pair, grouped in pair order.

    A token that matches several pairs appears once per matching pair.
    """
    selected = []
    for attr, wanted in k_v_pairs:
        selected.extend(select_tokens_by_attr(tokens, attr, wanted))
    return selected

def get_verbs(json_doc, only=None):
    """Return the tokens of ``json_doc`` whose "pos" is listed in ``VERB_TAGS``.

    When ``only`` is given, a verb is kept only if its lower-cased text
    (as produced by ``value_for_token``) is contained in ``only``.
    """
    verbs = []
    for token in json_doc["tokens"]:
        surface = value_for_token(token, json_doc)
        is_verb = token["pos"] in VERB_TAGS
        allowed = True if only is None else surface in only
        if not (is_verb and allowed):
            continue
        verbs.append(token)
    return verbs

def sorted_values(tokens, json_doc):
    """Return a new list of ``tokens`` ordered by their "id"; ``json_doc`` is not used."""
    ordered = list(tokens)
    ordered.sort(key=lambda token: token["id"])
    return ordered

def value_for_token(token, json_doc):
    """Return the lower-cased slice of the document text covered by ``token``."""
    begin, finish = token["start"], token["end"]
    surface = json_doc["text"][begin:finish]
    return surface.lower()

def children_includes_negation(children):
    """Return True when any token in ``children`` has the dependency label "neg"."""
    found = False
    # Every child is inspected (no early exit), as in the list-building original.
    for child in children:
        if child["dep"] == "neg":
            found = True
    return found
    
def VERBOSE_print(msg, VERBOSE):
    """Print ``msg`` only when ``VERBOSE`` is truthy."""
    if not VERBOSE:
        return
    print(msg)

# primary entry point
def mine(json_doc, verbs, VERBOSE):
    """Extract subject/property groupings around the selected verbs of a parsed document.

    For every verb returned by ``get_verbs(json_doc, verbs)`` that has no
    "neg" child and does have an "nsubj" child, one result dict is built:

    * ``"subject"``    - the subject token plus its non-punctuation
                         descendants, ordered by token id;
    * ``"properties"`` - each predicate child of the verb (see
                         ``VALID_PREDICATE_K_V_PAIRS``) plus all of its
                         descendants, ordered by token id;
    * ``"joined"``     - ``"<subject text> - <properties text>"``.

    The token dicts are mutated in place: subject tokens get
    ``color="red"`` and property tokens get ``color="blue"``.
    ``VERBOSE`` switches the progress output on or off.
    """
    def text_of(token):
        return value_for_token(token, json_doc)

    def listing(tokens):
        return ", ".join([text_of(token) for token in tokens])

    def phrase(tokens):
        return " ".join([text_of(token) for token in tokens])

    mined = []

    for verb in get_verbs(json_doc, verbs):
        verb_children = children_for(json_doc, verb["id"])

        if children_includes_negation(verb_children):
            VERBOSE_print(f"verb {text_of(verb)} contains negation, skipping...", VERBOSE)
            continue

        VERBOSE_print(f"parsing verb {text_of(verb)}...", VERBOSE)
        subject = find_token_by_attr(verb_children, "dep", "nsubj")
        if subject is None:
            continue

        VERBOSE_print(f"\n\tsubject is: {text_of(subject)}", VERBOSE)

        predicates = select_tokens_by_attrs(verb_children, VALID_PREDICATE_K_V_PAIRS)

        # Punctuation is stripped from the subject's subtree only.
        subject_deps = filter_tokens_by_attrs(
            children_for(json_doc, subject["id"], True),
            [("pos", "PUNCT")],
        )
        VERBOSE_print(f"\t\tdependencies are: {listing(subject_deps)}", VERBOSE)

        VERBOSE_print(f"\n\tpredicates are: {listing(predicates)}", VERBOSE)

        property_tokens = []
        for predicate in predicates:
            VERBOSE_print(f"\n\tparsing predicate {text_of(predicate)}...", VERBOSE)
            predicate_deps = children_for(json_doc, predicate["id"], True)
            VERBOSE_print(f"\t\tdependencies are: {listing(predicate_deps)}", VERBOSE)
            property_tokens.append(predicate)
            property_tokens.extend(predicate_deps)

        # Subjects are coloured first so that a token present in both groups ends up blue.
        subject_tokens = sorted_values([subject] + subject_deps, json_doc)
        for token in subject_tokens:
            token.update({"color": "red"})
        property_tokens = sorted_values(property_tokens, json_doc)
        for token in property_tokens:
            token.update({"color": "blue"})

        mined.append({
            "subject": subject_tokens,
            "properties": property_tokens,
            "joined": phrase(subject_tokens) + " - " + phrase(property_tokens),
        })

    return mined

def mine_sentence(sentence):
    """Parse ``sentence`` and mine it for copula-based subject/property groupings.

    Returns the list of result dicts produced by ``mine`` (verbose output off).
    """
    # The previous target, `mine_json_doc`, is not defined in this notebook;
    # `mine` is the entry point and takes the verbosity flag as a third argument.
    return mine(nlp(sentence).to_json(), COPULAS, False)

***
***
***

***
***RANDOM WIKIPEDIA TEST***
***

In [None]:
def get_lemmas(tokens):
    """Return the "lemma" value of every token, in order."""
    lemmas = []
    for token in tokens:
        lemmas.append(token["lemma"])
    return lemmas

def wordcraft_words_in(prop, wordcraft_words=WORDCRAFT_WORDS):
    """Find which wordcraft words occur, as lemmas, in a mined result.

    ``prop`` is one result dict from ``mine`` (its "subject" and "properties"
    token lists are inspected). Returns ``{"words": [...], "count": n}`` where
    ``words`` keeps the order of ``wordcraft_words`` and excludes anything in
    ``FILTER_WORDCRAFT_WORDS``.
    """
    # A set makes each membership test O(1) instead of scanning the lemma list.
    lemmas = set(get_lemmas(prop["subject"]) + get_lemmas(prop["properties"]))
    # dict.fromkeys de-duplicates while keeping first-seen order: the word list
    # contains repeated entries, which used to yield repeated matches and an
    # inflated count.
    words = [
        word
        for word in dict.fromkeys(wordcraft_words)
        if word in lemmas and word not in FILTER_WORDCRAFT_WORDS
    ]
    return {"words": words, "count": len(words)}

def has_wordcraft_word(prop, wordcraft_words=WORDCRAFT_WORDS):
    """Return True when at least one wordcraft word is found in ``prop``."""
    found = wordcraft_words_in(prop, wordcraft_words)
    return found["count"] > 0

def insert_str(string, str_to_insert, index):
    """Return ``string`` with ``str_to_insert`` spliced in before position ``index``."""
    head, tail = string[:index], string[index:]
    return head + str_to_insert + tail

def get_sample_wikipedia(
    url="https://olivers-things.s3.amazonaws.com/sample-wikipedia.json",
    timeout=30,
):
    """Download the sample Wikipedia articles and return the decoded JSON.

    Args:
        url: Location of the JSON document; defaults to the sample file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    # Without a timeout, requests can block indefinitely on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail with the HTTP error itself rather than a confusing JSON decode error.
    response.raise_for_status()
    articles = response.json()
    print(f"fetched {len(articles)} articles")
    return articles

def mine_wikipedia_sentence(sentence, counter=None, sentences=None, article=None):
    """Mine one sentence and render any wordcraft hits as an HTML fragment.

    Args:
        sentence: raw sentence text to parse and mine.
        counter: index of *sentence* inside *sentences*, or None when no
            surrounding context should be rendered.
        sentences: the list that *sentence* was taken from; the neighbours at
            ``counter - 1`` and ``counter + 1`` are shown in gray as context.
        article: dict with a "title" key naming the source article, or None.

    Returns:
        An HTML string containing the optional context sentences, the
        sentence itself with subject/property tokens wrapped in coloured
        bold spans, the joined property strings and a footer listing the
        matched wordcraft words. Returns None when the sentence yields no
        property containing a wordcraft word.
    """
    sentences = [] if sentences is None else sentences

    json_doc = nlp(sentence).to_json()
    properties = [p for p in mine(json_doc, COPULAS, False) if has_wordcraft_word(p)]
    if not properties:
        return None

    words = flatten([wordcraft_words_in(p)["words"] for p in properties])
    ranges = sorted(
        flatten([p["subject"] + p["properties"] for p in properties]),
        key=lambda w: w["start"],
    )

    # Token offsets refer to the untouched sentence; every tag inserted so far
    # shifts the later positions, so keep a running total of inserted characters.
    increment = 0
    end_str = '</b></span>'
    for r in ranges:
        start_str = f"<span style='color:{r['color']};'><b>"
        sentence = insert_str(sentence, start_str, r["start"] + increment)
        increment += len(start_str)
        sentence = insert_str(sentence, end_str, r["end"] + increment)
        increment += len(end_str)

    tagged = ""

    # Compare against None explicitly: index 0 is a valid position and must
    # still get its following sentence rendered as context.
    if counter is not None and counter > 0:
        tagged += "<p style='font-size:1.1em;color: gray;margin:0;line-height:25px;'>" + sentences[counter - 1] + "</p>"

    tagged += "<p style='font-size:1.2em;margin:0;line-height:25px;'>" + sentence + "</p>"

    if counter is not None and counter + 1 < len(sentences):
        tagged += "<p style='font-size:1.1em;color: gray;margin:0;line-height:25px;'>" + sentences[counter + 1] + "</p>"

    # The article is optional in the signature, so only mention it when given.
    source = f" in article '{article['title']}'" if article is not None else ""

    return tagged \
        + "<p style='margin-bottom:5px 0;font-size:0.9em'>" + "\n".join([p["joined"] for p in properties]) + "</p>" \
        + f"<p style='margin: 0 0 50px 0;font-size:0.9em'><i>found wordcraft word(s) '{' / '.join(words)}'{source}</i></p>"

def mine_wikipedia_articles(limit = 25):
    """Mine a random selection of the loaded articles and return the HTML.

    Shuffles the module-level ``articles`` list in place, then runs
    ``mine_wikipedia_sentence`` over every sentence of the first *limit*
    articles.

    Args:
        limit: maximum number of articles to mine.

    Returns:
        The concatenation of every non-empty HTML fragment, in mining order
        (an empty string when nothing matched).
    """
    random.shuffle(articles)

    fragments = []
    for article in articles[0:limit]:
        sentences = article["sentences"]
        for counter, sentence in enumerate(sentences):
            fragment = mine_wikipedia_sentence(sentence, counter, sentences, article)
            if fragment:
                fragments.append(fragment)

    # Join once at the end instead of growing a string inside the loop.
    return "".join(fragments)

In [None]:
# Module-level article list; mine_wikipedia_articles shuffles it in place and
# mines a slice of it.
articles = get_sample_wikipedia()

In [None]:
# Words that wordcraft_words_in removes from its matches.
FILTER_WORDCRAFT_WORDS = ["capital", "animal"]

# Passed as `limit` to mine_wikipedia_articles below.
NUMBER_OF_ARTICLES_TO_MINE = 3

# Wrap the generated markup in IPython's HTML display object (imported as html_print).
html_print(mine_wikipedia_articles(NUMBER_OF_ARTICLES_TO_MINE))

***
***
***

***
***FLASK SERVER FOR WIKI DUMPSTER DIVE INGESTION + BINARY TEXT CLASSIFICATION***
***

In [None]:
from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple
from flask import Flask, request, jsonify
import spacy

# Load the spaCy pipeline stored at the relative path "output/model-best";
# classify_wikipedia_article reads the 'positive' score from its doc.cats.
binary_text_classification = spacy.load("output/model-best")

def classify_wikipedia_article(article):
    """Score *article* with the text classifier and store the verdict on it.

    The article's sentences are joined with single spaces and classified;
    ``article["result"]`` is set to 1 when the 'positive' category scores
    above 0.5 and to 0 otherwise. The same (mutated) dict is returned.
    """
    text = " ".join(article["sentences"])
    scores = binary_text_classification(text).cats
    if scores['positive'] > 0.5:
        article["result"] = 1
    else:
        article["result"] = 0
    return article

def flatten(list_of_lists):
    """Flatten arbitrarily nested lists into one flat list, preserving order.

    Only ``list`` instances are descended into; every other value (dicts,
    strings, tuples, ...) is kept as a single item.

    Args:
        list_of_lists: a list whose items may themselves be (nested) lists.

    Returns:
        A new flat list. An empty input is returned unchanged.
    """
    if len(list_of_lists) == 0:
        return list_of_lists

    # Walk the nesting with an explicit stack of iterators rather than
    # recursing once per element, which overflowed the interpreter's recursion
    # limit on long lists and copied the tail of the list at every step.
    flat = []
    stack = [iter(list_of_lists)]
    while stack:
        for item in stack[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator resumes after this one is done.
                stack.append(iter(item))
                break
            flat.append(item)
        else:
            stack.pop()
    return flat

def mine_wikipedia_article_BTC(article):
    """Mine every sentence of *article* for properties with easy wordcraft words.

    Args:
        article: dict whose "sentences" entry is a list of sentence strings.

    Returns:
        A list with one dict per matching sentence:
        "passage" (the sentence plus up to two neighbours on each side),
        "properties" (the joined "subject - properties" strings),
        "words" (matched wordcraft words, deduplicated in first-seen order)
        and "sentence_idx" (index of the sentence in the article).
    """
    surrounding_sentences_count = 2
    sentences = article["sentences"]
    results = []

    for counter, sentence in enumerate(sentences):
        properties = mine(nlp(sentence).to_json(), COPULAS, False)
        properties = [p for p in properties if has_wordcraft_word(p, WORDCRAFT_WORDS_EASY)]
        if not properties:
            continue

        found = flatten([wordcraft_words_in(p, WORDCRAFT_WORDS_EASY)["words"] for p in properties])
        # dict.fromkeys deduplicates while keeping a stable order, so the
        # response no longer depends on set iteration order.
        wordcraft_words = list(dict.fromkeys(found))

        # Clamp the context window to the bounds of the article.
        start_idx = max(counter - surrounding_sentences_count, 0)
        end_idx = min(counter + surrounding_sentences_count + 1, len(sentences))
        results.append({
            "passage": sentences[start_idx:end_idx],
            "properties": [p["joined"] for p in properties],
            "words": wordcraft_words,
            "sentence_idx": counter
        })

    return results

app = Flask(__name__)

@app.route("/classify", methods=["POST"])
def classify():
    """Classify the posted articles and mine the ones labelled positive.

    Expects the request body to be a JSON array of article dicts (each is
    read for its "title" and "sentences" keys). Articles whose classifier
    result is 0 are skipped; the others are mined with
    mine_wikipedia_article_BTC.

    Returns:
        A JSON array of ``{"title": ..., "results": [...]}`` objects for the
        articles that produced at least one result, or a 400 response with an
        "error" message when the body is not a JSON array.
    """
    payload = request.json
    # Reject anything but a list up front; otherwise a JSON object or null
    # body fails deep inside the classifier with an opaque server error.
    if not isinstance(payload, list):
        return jsonify({"error": "expected a JSON array of articles"}), 400

    classified = [classify_wikipedia_article(article) for article in payload]
    all_results = []
    for article in classified:
        if article["result"] == 0:
            print(f"skipping {article['title']}")
            continue
        print(f"parsing {article['title']}")
        results = mine_wikipedia_article_BTC(article)
        if results:
            all_results.append({ "title": article['title'], "results": flatten(results)})
    return jsonify(all_results)

# Serve `app` on localhost:9000 through werkzeug's run_simple when this code
# runs as the main module.
# NOTE(review): inside a notebook kernel this presumably blocks the cell until
# it is interrupted — confirm.
if __name__ == '__main__':
    run_simple('localhost', 9000, app)

***
***
***

***
***SINGLE SENTENCE TEST AND VISUALIZATION***
***

In [None]:
def mine_and_describe_sentence(sentence):
    """Mine *sentence* verbosely, print the joined results, then describe it.

    The miner is run with its verbose flag on; afterwards ``describe`` renders
    the dependency parse and dumps the parsed document as JSON.
    """
    mined = mine(nlp(sentence).to_json(), COPULAS, True)
    joined = [entry["joined"] for entry in mined]
    print("\nproperty mining algorithm returned:", joined)
    describe(sentence)

In [None]:
# Inspect the negated sentence that the TESTS fixture expects to yield no properties.
mine_and_describe_sentence("Koalas are not terrestrial.")

***
***
***

***
***UNIT TESTS***
***

In [None]:
# Fixtures consumed by run_tests: each entry pairs a "sentence" with the exact
# list of "subject - properties" strings the miner is expected to return for it.
# "description" is optional and is only echoed in the test output.
TESTS = [
    {
        "sentence": "The whale is an animal.",
        "properties": [
            "the whale - an animal"
        ]
    },
    {
        "sentence": "The blue whale is an animal.",
        "description": "adjective on subject",
        "properties": [
            "the blue whale - an animal"
        ]
    },
    {
        "sentence": "The whale is a quadrupedal mammal.",
        "description": "2 values in predicate",
        "properties": [
            "the whale - a quadrupedal mammal"
        ]
    },
    {
        "sentence": "The chameleon is a polychromatic arboreal insectivore.",
        "description": "3 values in predicate",
        "properties": [
            "the chameleon - a polychromatic arboreal insectivore"
        ]
    },
    {
        "sentence": "Carnivorous spiders are common.",
        "description": "classifying adjective on subject",
        "properties": [
            "carnivorous spiders - common"
        ]
    },
    {
        "sentence": "All cephalopods are nocturnal.",
        "description": "logical modifier",
        "properties": [
            "all cephalopods - nocturnal"
        ]
    },
    {
        "sentence": "The majority of cephalopods are nocturnal.",
        "description": "logical modifier",
        "properties": [
            "the majority of cephalopods - nocturnal"
        ]
    },
    {
        "sentence": "Some cephalopods are nocturnal.",
        "description": "logical modifier",
        "properties": [
            "some cephalopods - nocturnal"
        ]
    },
    {
        "sentence": "No cephalopods are nocturnal.",
        "description": "logical modifier",
        "properties": [
            "no cephalopods - nocturnal"
        ]
    },
    {
        "sentence": "Most terrestrial gastropods are hermaphrodites.",
        "description": "logical modifier + multiple value on subject",
        "properties": [
            "most terrestrial gastropods - hermaphrodites"
        ]
    },
    {
        "sentence": "The simplest gastropods are the limpets and abalone.",
        "description": "superlative modifier on subject",
        "properties": [
            "the simplest gastropods - the limpets and abalone"
        ]
    },
    # NOTE: the next case is a known failure; the only mismatch is the
    # letter case of "Cephalopoda" (minor).
    {
        "sentence": "A cephalopod is any member of the molluscan class Cephalopoda such as a squid, octopus, cuttlefish, or nautilus.",
        "description": "long predicate",
        "properties": [
            "a cephalopod - any member of the molluscan class Cephalopoda such as a squid, octopus, cuttlefish, or nautilus"
        ]
    },
    {
        "sentence": "Gastropods were described as gastropodes by Georges Cuiver.",
        "description": "is + participle",
        "properties": [
            "gastropods - gastropodes by Georges Cuiver"
        ]
    },
    {
        "sentence": "Cephalopods are widely regarded as the most intelligent of the invertebrates.",
        "description": "is + participle",
        "properties": [
            "cephalopods - the most intelligent of the invertebrates"
        ]
    },
    {
        "sentence": "Cephalopods became dominant during the Ordovician period, represented by primitive nautiloids.",
        "description": "variant on is (become, etc.)",
        "properties": [
            "cephalopods - dominant during the Ordovician period, represented by primitive nautiloids"
        ]
    },
    {
        "sentence": "Koalas are not terrestrial.",
        "description": "skip negation",
        "properties": []
    },
    {
        "sentence": "Paedophryne amauensis is the smallest frog in the world.",
        "description": "include determiners",
        "properties": [
            "paedophryne amauensis - the smallest frog in the world"
        ]
    },    
    {
        "sentence": "The brain of the cephalopod is invisible.",
        "description": "include all children of subject",
        "properties": [
            "the brain of the cephalopod - invisible"
        ]
    },
    {
        "sentence": "Sound waves with frequency above 20 kHz are inaudible to humans.",
        "description": "include all children of subject",
        "properties": [
            "sound waves with frequency above 20 kHz - inaudible to humans"
        ]
    },
    {
        "sentence": "The cephalopods in the ocean are carnivorous.",
        "description": "include all children of subject",
        "properties": [
            "the cephalopods in the ocean - carnivorous"
        ]
    },
    {
        "sentence": "A Geiger counter is an instrument that measures ionizing radiation.",
        "description": "include all children of predicate",
        "properties": [
            "a geiger counter - an instrument that measures ionizing radiation"
        ]
    },
    {
        "sentence": "The cephalopod's brain is invisible.",
        "description": "include all children of predicate",
        "properties": [
            "the cephalopod 's brain - invisible"
        ]
    },
# Template for adding a new case (copy, uncomment and fill in):
#     {
#         "sentence": "",
#         "description": "",
#         "properties": [
#             ""
#         ]
#     }
]

def run_tests():
    """Run the miner over every TESTS fixture and print a per-case report.

    For each case the sentence (with its optional description) is printed,
    prefixed by "FAILED-" when the mined "joined" strings differ from the
    expected ones; a failing case also prints both lists side by side.
    """
    for case in TESTS:
        mined = mine(nlp(case["sentence"]).to_json(), COPULAS, False)
        predictions = [entry["joined"] for entry in mined]
        expected = case["properties"]
        passed = str(predictions) == str(expected)

        banner = "" if passed else "FAILED-\n"
        note = f"({case['description']})" if "description" in case else ""
        print("\n", banner, case["sentence"], note)

        if passed:
            print(expected)
        else:
            print(f"{predictions} != {expected}")
        
        
# Further candidate test cases are collected in this spreadsheet:
# - https://docs.google.com/spreadsheets/d/1ggEybPCO9LVZkWAsMJCER9CTXUe36QcaKhctt9wGlJg/edit#gid=0
# Run the fixtures as soon as this cell executes.
run_tests()