## Setup

In [1]:
# Install Java
!apt-get install -y openjdk-11-jdk

# Download Stanford CoreNLP
!wget http://nlp.stanford.edu/software/stanford-corenlp-4.5.5.zip
!unzip stanford-corenlp-4.5.5.zip

# Download English model
!wget http://nlp.stanford.edu/software/stanford-corenlp-4.5.5-models-english.jar -P stanford-corenlp-4.5.5/

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jre
  x11-utils
Suggested packages:
  libxt-doc openjdk-11-demo openjdk-11-source visualvm mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jdk
  openjdk-11-jre x11-utils
0 upgraded, 10 newly installed, 0 to remove and 35 not upgraded.
Need to get 5,367 kB of archives.
After this operation, 15.2 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-extra all 2.37-2build1 [2,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu jam

In [2]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [3]:
import stanza

# Download and initialize English pipeline
stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| ner          | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


## Event Extraction

In [4]:
text = "John bought a new laptop from Amazon."
doc = nlp(text)

for sent in doc.sentences:
    for word in sent.words:
        if word.upos in ['VERB', 'AUX']:
            print(f"Event: {word.text}")
            # Print arguments (subject, object, etc.)
            for dep in sent.words:
                if dep.head == word.id:
                    print(f"  Argument ({dep.deprel}): {dep.text}")

Event: bought
  Argument (nsubj): John
  Argument (obj): laptop
  Argument (obl): Amazon
  Argument (punct): .


## Temporal Information Extraction

In [None]:
text = "The meeting was held on March 3rd, 2024 at 10 AM."
doc = nlp(text)

for ent in doc.ents:
    if ent.type == 'DATE' or ent.type == 'TIME':
        print(f"Temporal expression: {ent.text} -> {ent.type}")

Temporal expression: March 3rd, 2024 -> DATE
Temporal expression: 10 AM -> TIME


## Question Answering Pipelines (Syntax-aware)

In [None]:
question = "Who bought the laptop?"
context = "John bought a new laptop from Amazon."

doc = nlp(context)
# Use dependency parsing to extract subject/object
for sent in doc.sentences:
    for word in sent.words:
        if word.deprel == 'nsubj':
            print(f"Possible Answer: {word.text}")

Possible Answer: John


## Structure-Aware Text Summarization (basic compression)

In [None]:
text = "The manager, who was extremely busy, scheduled a meeting with the team."
doc = nlp(text)

for sent in doc.sentences:
    essential = [word.text for word in sent.words if word.deprel in ['nsubj', 'obj', 'root']]
    print("Compressed sentence:", ' '.join(essential))


Compressed sentence: manager who scheduled meeting


## Multilingual Document Analysis

In [None]:
# Hindi
stanza.download('hi')
hi_nlp = stanza.Pipeline(lang='hi')

text = "राम ने सीता के लिए फूल खरीदे।"
doc = hi_nlp(text)
for sent in doc.sentences:
    for word in sent.words:
        print(f"{word.text} ({word.upos}) - {word.deprel} -> {word.head}")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...


Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


राम (PROPN) - nsubj -> 7
ने (ADP) - case -> 1
सीता (PROPN) - obl -> 7
के (ADP) - case -> 3
लिए (ADP) - case -> 3
फूल (NOUN) - obj -> 7
खरीदे (VERB) - root -> 0
। (PUNCT) - punct -> 7


## Textual Entailment Features (syntactic/semantic features)


In [None]:
text1 = "The cat sat on the mat."
text2 = "A feline was sitting on a rug."

doc1 = nlp(text1)
doc2 = nlp(text2)

def get_dependencies(doc):
    return set((w.text, w.deprel) for s in doc.sentences for w in s.words)

print("Overlap in syntactic roles:", get_dependencies(doc1) & get_dependencies(doc2))


Overlap in syntactic roles: {('.', 'punct'), ('on', 'case')}


## Named Entity Normalization / Linking

In [None]:
text = "Apple was founded by Steve Jobs in Cupertino."

doc = nlp(text)
for ent in doc.ents:
    print(f"Entity: {ent.text} ({ent.type})")
    # Placeholder: Entity linking to KB could go here (e.g., Wikidata, DBPedia)

Entity: Apple (ORG)
Entity: Steve Jobs (ORG)
Entity: Cupertino (GPE)


## Syntax-aware Machine Translation Support

In [None]:
text = "The dog chased the cat under the table."
doc = nlp(text)

for sent in doc.sentences:
    print("Tree Structure:")
    print(sent.constituency)
    # Tree structure can be passed as features to a syntax-aware MT model

Tree Structure:
(ROOT (S (NP (DT The) (NN dog)) (VP (VBD chased) (NP (DT the) (NN cat)) (PP (IN under) (NP (DT the) (NN table)))) (. .)))


## Knowledge Graph Triple Extraction

In [None]:
text = "Elon Musk founded SpaceX and Tesla."
doc = nlp(text)

for sent in doc.sentences:
    subj, obj, rel = None, None, None
    for word in sent.words:
        if word.deprel == 'nsubj':
            subj = word.text
        elif word.deprel == 'obj':
            obj = word.text
        elif word.upos == 'VERB':
            rel = word.text
    if subj and obj and rel:
        print(f"Triple: ({subj}, {rel}, {obj})")

Triple: (Elon, founded, SpaceX)
