In [None]:
import random

import nltk
from nltk.parse import DependencyGraph

def get_subject_verb_object(sentence):
    """
    Extracts subject, verb and object from sentence using NLTK dependency parser.

    The sentence is sent to a CoreNLP server at http://localhost:9000 through
    NLTK's CoreNLP client, and the first parse is rendered as 4-column CoNLL
    rows (word, tag, head, relation).

    Args:
        sentence: Raw sentence text.

    Returns:
        A ``(subject, verb, obj)`` tuple: the word of the last row whose
        relation contains "subj", the word of the last ROOT row, and the word
        of the last row whose relation contains "obj". An element is ``None``
        when no row matched.
    """
    # Load the parser
    parser = nltk.parse.corenlp.CoreNLPDependencyParser(url='http://localhost:9000')

    # Parse the sentence
    rows = next(parser.raw_parse(sentence)).to_conll(4).split('\n')

    # Extract subject, verb and object
    subject = None
    verb = None
    obj = None

    for row in rows:
        columns = row.split('\t')
        if len(columns) != 4:
            # The CoNLL text ends with a newline, so splitting leaves an empty row.
            continue
        # Column 0 is the word and column 3 the relation; column 2 is the head
        # index, which is not what we want to return.
        word, _tag, _head, relation = columns
        # Match on the relation column only (so words such as "object" cannot
        # trigger a match) and ignore case, since the root label is "ROOT".
        relation = relation.lower()
        if 'subj' in relation:
            subject = word
        elif 'obj' in relation:
            obj = word
        elif relation == 'root':
            verb = word

    return subject, verb, obj

# Try the extractor on a sample sentence and print the resulting triple.
# Note: `sentence`, `subject`, `verb` and `obj` become notebook-level globals.
sentence = "The quick brown fox jumps over the lazy dog"
subject, verb, obj = get_subject_verb_object(sentence)
print(f"Subject: {subject}, Verb: {verb}, Object: {obj}")

In [2]:
import spacy

def get_subject_verb_object(sentence, nlp=None):
    """
    Extracts subject, verb and object from sentence using spaCy dependency parser.

    Dependency labels are matched by substring: a label containing "subj" marks
    a subject, one containing "obj" marks an object (this includes both "dobj"
    and "pobj"), and "ROOT" marks the verb. When several tokens match the same
    role, the last one wins.

    Args:
        sentence: Raw sentence text.
        nlp: Optional callable that maps text to an iterable of tokens exposing
            ``.dep_`` and ``.text`` (e.g. an already loaded spaCy pipeline).
            When omitted, 'en_core_web_sm' is loaded on every call, which is
            slow; pass a preloaded pipeline when calling this repeatedly.

    Returns:
        A ``(subject, verb, obj)`` tuple of token texts; an element is ``None``
        when no token matched that role.
    """
    # Load the parser only when the caller did not supply one
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Parse the sentence
    doc = nlp(sentence)

    # Extract subject, verb and object
    subject = None
    verb = None
    obj = None

    for token in doc:
        label = token.dep_
        if 'subj' in label:
            subject = token.text
        elif 'obj' in label:
            obj = token.text
        elif 'ROOT' in label:
            verb = token.text

    return subject, verb, obj

# Try the spaCy-based extractor on the same sample sentence.
# The recorded output reports "dog" as the object -- presumably through a
# prepositional-object label, since any label containing "obj" matches.
sentence = "The quick brown fox jumps over the lazy dog"
subject, verb, obj = get_subject_verb_object(sentence)
print(f"Subject: {subject}, Verb: {verb}, Object: {obj}")

Subject: fox, Verb: jumps, Object: dog


In [3]:
def get_all_subject_verb_object(sentence, nlp=None):
    """
    Extracts every subject, verb and object from sentence using spaCy dependency parser.

    Dependency labels are matched by substring: "subj" marks a subject, "obj"
    marks an object (both "dobj" and "pobj" match), and "ROOT" marks a verb.

    Args:
        sentence: Raw sentence text.
        nlp: Optional callable that maps text to an iterable of tokens exposing
            ``.dep_`` and ``.text`` (e.g. an already loaded spaCy pipeline).
            When omitted, 'en_core_web_sm' is loaded on every call, which is
            slow; pass a preloaded pipeline when calling this repeatedly.

    Returns:
        A ``(subject, verb, obj)`` tuple of lists holding the matching token
        texts in document order; a list is empty when nothing matched.
    """
    # Load the parser only when the caller did not supply one
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Parse the sentence
    doc = nlp(sentence)

    # Extract subject, verb and object
    subject = []
    verb = []
    obj = []

    for token in doc:
        label = token.dep_
        if 'subj' in label:
            subject.append(token.text)
        elif 'obj' in label:
            obj.append(token.text)
        elif 'ROOT' in label:
            verb.append(token.text)

    return subject, verb, obj

In [4]:
def get_first_subject_verb_object(sentence, nlp=None):
    """
    Extracts the first subject, verb and object from sentence using spaCy dependency parser.

    Dependency labels are matched by substring: "subj" marks a subject, "obj"
    marks an object (both "dobj" and "pobj" match), and "ROOT" marks the verb.
    Only the first token found for each role is kept.

    Args:
        sentence: Raw sentence text.
        nlp: Optional callable that maps text to an iterable of tokens exposing
            ``.dep_`` and ``.text`` (e.g. an already loaded spaCy pipeline).
            When omitted, 'en_core_web_sm' is loaded on every call, which is
            slow; pass a preloaded pipeline when calling this repeatedly.

    Returns:
        A ``(subject, verb, obj)`` tuple of token texts; an element is the
        empty string when no token matched that role.
    """
    # Load the parser only when the caller did not supply one
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Parse the sentence
    doc = nlp(sentence)

    # Extract subject, verb and object
    subject = ""
    verb = ""
    obj = ""

    for token in doc:
        label = token.dep_
        # "not <role>" keeps the first match: once a role is filled it is never replaced.
        if 'subj' in label and not subject:
            subject = token.text
        elif 'obj' in label and not obj:
            obj = token.text
        elif 'ROOT' in label and not verb:
            verb = token.text

    return subject, verb, obj

In [80]:
sentence = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday ."

In [78]:
sentence = "Young actor says he has no plans to fritter his cash away ."

In [76]:
sentence = "Radcliffe's earnings from first five Potter films have been held in trust fund ."

In [81]:
# Run the collect-all extractor on whichever `sentence` was assigned most
# recently in the kernel and print the three lists.
subject, verb, obj = get_all_subject_verb_object(sentence)
print(f"Subject: {subject}, Verb: {verb}, Object: {obj}")

Subject: ['Radcliffe', 'he'], Verb: ['gets'], Object: ['fortune']


In [9]:
from nltk.tokenize import sent_tokenize

In [10]:
def get_subject_verb_obj_new_label(sents):
    """
    Prefix a multi-sentence text with the first subject/verb/object of each sentence.

    The text is split with ``sent_tokenize``; for every sentence the first
    subject, verb and object are extracted (empty string when a role is
    missing) and the per-role words are joined with single spaces.

    Returns:
        "Subject: ... Verb: ... Object: ... [SEP] <original text>"
    """
    triples = [get_first_subject_verb_object(piece) for piece in sent_tokenize(sents)]
    subject_part = ' '.join(triple[0] for triple in triples)
    verb_part = ' '.join(triple[1] for triple in triples)
    object_part = ' '.join(triple[2] for triple in triples)
    return f"Subject: {subject_part} Verb: {verb_part} Object: {object_part} [SEP] {sents}"

In [11]:
def map_subject_verb_obj(example):
    """
    Build an SVO-prefixed label for a single dataset example.

    The text in ``example["highlights"]`` is split into sentences; the first
    subject, verb and object of each sentence are collected and prepended to
    the original text.

    Args:
        example: Mapping with a "highlights" entry holding the summary text.

    Returns:
        A dict with the single literal key "summary_column" whose value is
        "Subject: ... Verb: ... Object: ... [SEP] <original text>".
    """
    # Imports live inside the function so it is self-contained -- presumably so
    # it can be shipped to `datasets.map` worker processes; TODO confirm.
    import spacy
    summary_column = "highlights"
    from nltk.tokenize import sent_tokenize
    # Load the parser once per call; the helper below reuses it for every
    # sentence instead of reloading the model each time.
    nlp = spacy.load('en_core_web_sm')

    def get_first_subject_verb_object(sentence):
        """
        Extracts the first subject, verb and object from sentence using spaCy dependency parser.
        """
        # Parse the sentence
        doc = nlp(sentence)

        # Extract subject, verb and object
        subject = ""
        verb = ""
        obj = ""

        for token in doc:
            # Substring match on the label; "not <role>" keeps the first match.
            if 'subj' in token.dep_ and not subject:
                subject = token.text
            elif 'obj' in token.dep_ and not obj:
                obj = token.text
            elif 'ROOT' in token.dep_ and not verb:
                verb = token.text

        return subject, verb, obj

    def get_subject_verb_obj_new_label(sents):
        """Join the per-sentence triples and prepend them to the original text."""
        subjects, verbs, objs = [], [], []
        for sent in sent_tokenize(sents):
            subject, verb, obj = get_first_subject_verb_object(sent)
            subjects.append(subject)
            verbs.append(verb)
            objs.append(obj)
        res = f"Subject: {' '.join(subjects)} Verb: {' '.join(verbs)} Object: {' '.join(objs)} [SEP] {sents}"
        return res

    # The output key is the literal string "summary_column" (not the value of
    # the `summary_column` variable); later cells read the result under that name.
    return {"summary_column": get_subject_verb_obj_new_label(example[summary_column])}

In [87]:
def batch_map_all_subject_verb_obj(examples):
    """
    Build SVO-prefixed labels for a batch of dataset examples.

    Each text in ``examples["highlights"]`` is split into sentences; every
    subject, verb and object found in those sentences is collected and
    prepended to the original text.

    Args:
        examples: Batch mapping whose "highlights" entry is a list of summary texts.

    Returns:
        A dict with the single literal key "summary_column" holding one
        "Subject: ... Verb: ... Object: ... [SEP] <original text>" string per
        input text, in input order.
    """
    import spacy
    summary_column = "highlights"
    from nltk.tokenize import sent_tokenize
    # Load the parser once per batch; the nested helper uses this instance
    # through the closure instead of reloading the model for every sentence.
    nlp = spacy.load('en_core_web_sm')

    def get_all_subject_verb_object(sentence):
        """
        Extracts every subject, verb and object from sentence using spaCy dependency parser.
        """
        # Parse the sentence
        doc = nlp(sentence)

        # Extract subject, verb and object
        subject = []
        verb = []
        obj = []

        for token in doc:
            # Substring match on the dependency label.
            if 'subj' in token.dep_:
                subject.append(token.text)
            elif 'obj' in token.dep_:
                obj.append(token.text)
            elif 'ROOT' in token.dep_:
                verb.append(token.text)

        return subject, verb, obj

    def get_subject_verb_obj_new_label(sents):
        """Merge the matches of all sentences and prepend them to the original text."""
        subjects, verbs, objs = [], [], []
        for sent in sent_tokenize(sents):
            subject, verb, obj = get_all_subject_verb_object(sent)
            subjects.extend(subject)
            verbs.extend(verb)
            objs.extend(obj)
        res = f"Subject: {' '.join(subjects)} Verb: {' '.join(verbs)} Object: {' '.join(objs)} [SEP] {sents}"
        return res

    summarys = [get_subject_verb_obj_new_label(summary) for summary in examples[summary_column]]
    # The output key is the literal string "summary_column" (not the value of
    # the `summary_column` variable); later cells read the result under that name.
    return {"summary_column": summarys}

In [10]:
# Inspect the raw dependency labels spaCy assigns to the current `sentence`
# (a notebook-level global set by an earlier cell), one label per token.
nlp = spacy.load('en_core_web_sm')

# Parse the sentence
doc = nlp(sentence)
for token in doc:
    print(token.dep_)

compound
compound
compound
compound
nsubj
ROOT
nmod
nummod
compound
dobj
mark
nsubj
advcl
nummod
npadvmod
punct


In [9]:
doc

Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .

In [70]:
import datasets
from datasets import load_dataset
# Load configuration '3.0.0' of the "cnn_dailymail" dataset; the bare
# `dataset` expression displays the available splits and their columns.
dataset = load_dataset("cnn_dailymail", '3.0.0')
dataset

Found cached dataset cnn_dailymail (D:/ProgramData/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [14]:
dataset["train"][0]["highlights"].split("\n")

['Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .',
 'Young actor says he has no plans to fritter his cash away .',
 "Radcliffe's earnings from first five Potter films have been held in trust fund ."]

In [46]:
get_subject_verb_obj_new_label(dataset["train"][0]["highlights"])

"Subject: Radcliffe actor earnings Verb: gets says held Object: fortune plans films [SEP] Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund ."

In [47]:
sent_tokenize(dataset["train"][1]["highlights"].replace("\n", ". \n"))

['Mentally ill inmates in Miami are housed on the "forgotten floor".',
 'Judge Steven Leifman says most are there as a result of "avoidable felonies".',
 'While CNN tours facility, patient shouts: "I am the son of the president".',
 "Leifman says the system is unjust and he's fighting for change ."]

In [None]:
# Apply the batched SVO labelling to the dataset using 15 worker processes.
sub_dataset = dataset.map(batch_map_all_subject_verb_obj, batched=True, num_proc=15)

               

In [49]:
print(sub_dataset["train"][3]["summary_column"])

Subject: none President Bush Verb: says reclaims undergoes Object: procedure powers colonoscopy [SEP] Five small polyps found during procedure; "none worrisome," spokesman says .
President reclaims powers transferred to vice president .
Bush undergoes routine colonoscopy at Camp David .


In [56]:
# Debug one mapped example: parse its full multi-line highlights text in a
# single pass and print, for every token, its dependency label and children.
# Note: this overwrites the notebook-level `sentence`, `nlp` and `doc`.
nlp = spacy.load('en_core_web_sm')
sentence = sub_dataset["train"][3]["highlights"]
# Parse the sentence
doc = nlp(sentence)
for token in doc:
    print(token, token.dep_)
    print([i for i in token.children])

Five nummod
[]
small amod
[]
polyps ccomp
[Five, small, found, ;, worrisome]
found acl
[during]
during prep
[procedure]
procedure pobj
[]
; punct
[]
" punct
[]
none nsubj
[]
worrisome amod
[", none]
, punct
[]
" punct
[]
spokesman nsubj
[]
says ROOT
[polyps, ,, ", spokesman, .]
. punct
[
]

 dep
[]
President nsubj
[]
reclaims ROOT
[President, powers, .]
powers dobj
[transferred]
transferred acl
[to]
to prep
[president]
vice compound
[]
president pobj
[vice]
. punct
[
]

 dep
[]
Bush nsubj
[]
undergoes ROOT
[Bush, colonoscopy, at, .]
routine amod
[]
colonoscopy dobj
[routine]
at prep
[David]
Camp compound
[]
David pobj
[Camp]
. punct
[]


In [1]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "facebook/bart-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

In [None]:
tokenizer.batch_decode([])

In [71]:
# Swap the literal "[SEP]" marker for the tokenizer's "</s>" token before
# tokenizing, then print the resulting ids to see where the separator lands.
input_ids = tokenizer(sub_dataset["train"][3]["summary_column"].replace("[SEP]", "</s>"))["input_ids"]
print(input_ids)

[0, 47159, 35, 4146, 270, 3516, 38132, 35, 161, 20507, 29, 10946, 293, 35671, 35, 7089, 4361, 17735, 17591, 16572, 1437, 2, 4934, 650, 11424, 3275, 303, 148, 7089, 131, 22, 39763, 29611, 60, 1565, 161, 479, 50118, 6517, 20507, 29, 4361, 7225, 7, 2626, 394, 479, 50118, 43294, 10946, 293, 6108, 17735, 17591, 16572, 23, 4746, 871, 479, 2]


In [24]:
# NOTE(review): the recorded TypeError ("'list' object cannot be interpreted
# as an integer") suggests `input_ids` held a batch (nested ids) when this
# cell ran -- presumably `batch_decode` was needed instead; confirm.
tokenizer.decode(input_ids)

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [74]:
tokenizer.convert_ids_to_tokens(input_ids)

['<s>',
 'Subject',
 ':',
 'Ġnone',
 'ĠPresident',
 'ĠBush',
 'ĠVerb',
 ':',
 'Ġsays',
 'Ġreclaim',
 's',
 'Ġundergo',
 'es',
 'ĠObject',
 ':',
 'Ġprocedure',
 'Ġpowers',
 'Ġcolon',
 'osc',
 'opy',
 'Ġ',
 '</s>',
 'ĠFive',
 'Ġsmall',
 'Ġpoly',
 'ps',
 'Ġfound',
 'Ġduring',
 'Ġprocedure',
 ';',
 'Ġ"',
 'none',
 'Ġworrisome',
 ',"',
 'Ġspokesman',
 'Ġsays',
 'Ġ.',
 'Ċ',
 'President',
 'Ġreclaim',
 's',
 'Ġpowers',
 'Ġtransferred',
 'Ġto',
 'Ġvice',
 'Ġpresident',
 'Ġ.',
 'Ċ',
 'Bush',
 'Ġundergo',
 'es',
 'Ġroutine',
 'Ġcolon',
 'osc',
 'opy',
 'Ġat',
 'ĠCamp',
 'ĠDavid',
 'Ġ.',
 '</s>']

In [35]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['<sep>']}

In [22]:
# Register "<sep>" as an additional special token so it maps to a single id.
# NOTE(review): a model used with this tokenizer presumably needs its
# embedding matrix resized to cover the new id -- confirm before training.
tokenizer.add_special_tokens({"additional_special_tokens": ["<sep>"]})

1

In [36]:
tokenizer.convert_tokens_to_ids("<sep>")

50265

In [None]:
tokenizer.sp

In [70]:
tokenizer.special_tokens_map['sep_token']

'</s>'

In [18]:
tokenizer("summary:")

{'input_ids': [0, 48600, 35, 2], 'attention_mask': [1, 1, 1, 1]}

In [69]:
tokenizer("\n")

{'input_ids': [0, 50118, 2], 'attention_mask': [1, 1, 1]}

In [13]:
# Check how a literal "</s>" inside the text is tokenized: per the recorded
# output it becomes id 2, the same id that terminates each sequence.
input_ids = tokenizer(["a </s> b c", "a b </s> c"], return_tensors="pt")["input_ids"]
input_ids

tensor([[   0,  102, 1437,    2,  741,  740,    2],
        [   0,  102,  741, 1437,    2,  740,    2]])

In [25]:
# Same probe with the added "<sep>" token: per the recorded output it maps to
# its own id (50265), distinct from the end-of-sequence id 2.
input_ids = tokenizer(["a <sep> b c", "a b <sep> c"], return_tensors="pt")["input_ids"]
input_ids

tensor([[    0,   102,  1437, 50265,   741,   740,     2],
        [    0,   102,   741,  1437, 50265,   740,     2]])

In [32]:
# Probe "<sep>" at the end and at the start of a text; padding=True pads the
# shorter row so both rows fit in one tensor.
input_ids = tokenizer(["a <sep>", "<sep> a"], return_tensors="pt", padding=True)["input_ids"]
input_ids

tensor([[    0,   102,  1437, 50265,     2],
        [    0, 50265,    10,     2,     1]])

In [34]:
tokenizer.convert_ids_to_tokens(1437)

'Ġ'

In [33]:
tokenizer.batch_decode(input_ids)

['<s>a <sep></s>', '<s><sep> a</s><pad>']

In [6]:
input_ids == 2

tensor([[False, False, False,  True, False, False,  True],
        [False, False, False, False,  True, False,  True]])

In [7]:
import torch

# Toy example of the cumsum-mask trick.
x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
# Running count of 2s along each row; "== 1" is True from the first 2 onward
# (until a second 2 would appear). Rows without a 2 stay all False.
mask = (x == 2).cumsum(dim=1) == 1
# In-place fill: the masked positions are overwritten with 2.
x.masked_fill_(mask, 2)
x

tensor([[ 1,  2,  2],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]])

In [37]:
import torch

# Running count of "<sep>" (id 50265) along each row; "== 0" is True for
# every position that comes before the first "<sep>" in that row.
mask = (input_ids == 50265).cumsum(dim=1) == 0
# Overwrite those leading positions with 0, in place (mutates `input_ids`).
input_ids.masked_fill_(mask, 0)
input_ids

tensor([[    0,     0,     0, 50265,     2],
        [    0, 50265,    10,     2,     1]])

In [41]:
print(tokenizer.decode([0, 47159, 35, 7507, 1253, 2, 12979, 37, 19971, 792, 38132, 35, 161, 3032, 16766, 35671, 35, 696, 107, 5402, 123, 14497, 1129, 568, 1355, 1437, 50265, 1560, 7507, 1253, 12979, 161, 37, 34, 393, 1317, 143, 696, 50, 3674, 148, 39, 158, 107
, 25, 5402, 479, 50118, 45699, 29, 39, 19971, 33, 3032, 123, 8225, 187, 567, 66, 15, 39, 1363, 14497, 479, 50118, 26751, 792, 16766, 1129, 50, 8257, 13, 568, 45, 7, 11007, 39, 1355, 479, 2]))

<s>Subject: Klans</s>nic he supervisors board Verb: says treated refuses Object: issue years principal him orientation comment decision contract <sep> Tom Klansnic says he has never heard any issue or complaint during his 10 years as principal.
Claims his supervisors have treated him differently since coming out on his sexual orientation.
School board refuses comment or explanation for decision not to renew his contract.</s>


In [42]:
import torch

def set_second_zero(tensor):
    """
    Zero out, in place, every element of each row that precedes the row's second zero.

    Rows are scanned left to right. Everything up to (not including) the
    second zero is set to 0; elements after the second zero are left alone.
    A row with fewer than two zeros is zeroed entirely.

    Args:
        tensor: Tensor indexed as [row, column] (the scan runs along dim 1).

    Returns:
        The same tensor object, modified in place.
    """
    # Running count of zeros seen so far in each row (inclusive of the current
    # position). A count below 2 means the second zero has not been reached.
    zeros_seen = (tensor == 0).cumsum(dim=1)
    # Vectorized equivalent of walking each row element by element.
    tensor.masked_fill_(zeros_seen < 2, 0)
    return tensor

# Demo on a 4x4 tensor. set_second_zero modifies its argument and returns it,
# so `tensor` itself is changed after this call.
tensor = torch.tensor([[0, 2, 3, 4], [0, 6, 0, 8], [0, 10, 11, 0], [0, 0, 15, 16]])
print(set_second_zero(tensor))

tensor([[ 0,  0,  0,  0],
        [ 0,  0,  0,  8],
        [ 0,  0,  0,  0],
        [ 0,  0, 15, 16]])


In [44]:
tensor = torch.tensor([[0, 2, 3, 4], [0, 6, 0, 8], [0, 10, 11, 0], [0, 0, 15, 16]])
tensor

tensor([[ 0,  2,  3,  4],
        [ 0,  6,  0,  8],
        [ 0, 10, 11,  0],
        [ 0,  0, 15, 16]])

In [45]:
tensor == 0

tensor([[ True, False, False, False],
        [ True, False,  True, False],
        [ True, False, False,  True],
        [ True,  True, False, False]])

In [64]:
t = "a b c summary: d e f"

In [66]:
t = "a b c"

In [67]:
# Keep the text from the "summary:" marker onward. str.find returns -1 when
# the marker is absent; in that case fall back to index 0 (the whole string).
pos = t.find("summary:")
t[pos if pos != -1 else 0:]

'a b c'

In [61]:
pos == False

False

In [62]:
pos if pos != -1 else 0

0

In [97]:
def batch_map_all_subject_verb_obj(examples):
    """
    Rewrite a batch of "highlights" texts as prompt-style SVO labels.

    Every text is split into sentences; all subjects, verbs and objects found
    (substring match on the spaCy dependency label) are listed in front of the
    original text.

    Args:
        examples: Batch mapping whose "highlights" entry is a list of texts.

    Returns:
        A dict keyed by "highlights" with one string per input text, shaped
        "Subjects: a, b. Predicate: c. Object: d, e. Summary: <original text>".
    """
    import random
    import spacy
    from nltk.tokenize import sent_tokenize

    prompt_sep_token = "Summary:"
    summary_column = "highlights"
    # One parser instance shared by every sentence of the batch
    nlp = spacy.load('en_core_web_sm')

    def get_all_subject_verb_object(sentence):
        """
        Collect the texts of all subject, verb and object tokens of one sentence.
        """
        # Insertion order encodes the matching priority: subj, then obj, then ROOT.
        found = {'subj': [], 'obj': [], 'ROOT': []}
        for token in nlp(sentence):
            for marker, bucket in found.items():
                if marker in token.dep_:
                    bucket.append(token.text)
                    break
        return found['subj'], found['ROOT'], found['obj']

    def get_subject_verb_obj_new_label(sents):
        """Merge the matches of all sentences into one prompt string."""
        merged = ([], [], [])
        for piece in sent_tokenize(sents):
            for bucket, matches in zip(merged, get_all_subject_verb_object(piece)):
                bucket.extend(matches)
        subjects, verbs, objs = merged
        # subjects = random.sample(subjects, min(len(subjects), 6))
        # verbs = random.sample(verbs, min(len(verbs), 6))
        # objs = random.sample(objs, min(len(objs), 6))
        return f"Subjects: {', '.join(subjects)}. Predicate: {', '.join(verbs)}. Object: {', '.join(objs)}. {prompt_sep_token} {sents}"

    return {summary_column: [get_subject_verb_obj_new_label(text) for text in examples[summary_column]]}


In [98]:
# Re-run the mapping with the redefined function, which returns its result
# under the "highlights" key; 15 worker processes are used.
sub_dataset = dataset.map(batch_map_all_subject_verb_obj, batched=True, num_proc=15)

                                             

In [99]:
import json
from pathlib import Path


def _write_svo_jsonl(split, path):
    """Write one JSON object per line with the article and its single-line highlights."""
    # The context manager closes the file even if serialization fails midway.
    with open(path, "w", encoding="utf-8") as out:
        for item in split:
            record = {"article": item["article"], "highlights": item["highlights"].replace("\n", " ")}
            out.write(json.dumps(record) + "\n")


# Build the paths from parts instead of a "ChatGLM-6B\ptuning\data\..." literal:
# "\p", "\d" and "\c" are invalid escape sequences in a normal string, and
# backslash separators only work on Windows.
SVO_DATA_DIR = Path("ChatGLM-6B") / "ptuning" / "data"

_write_svo_jsonl(sub_dataset["train"], SVO_DATA_DIR / "cnn_dailymail_svo_train.json")
_write_svo_jsonl(sub_dataset["test"], SVO_DATA_DIR / "cnn_dailymail_svo_test.json")

In [80]:
random.sample([1, 2, 3, 4, 5], 2)

[3, 2]

In [88]:
highlights = "Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis ."

In [92]:
import random
prompt_sep_token = "Summary:"
summary_column = "highlights"
import spacy
from nltk.tokenize import sent_tokenize
# Load the parser
nlp = spacy.load('en_core_web_sm')

def get_all_subject_verb_object(sentence):
    """
    Debug variant: collect all subject, verb and object token texts of a sentence.

    Parses with the notebook-level ``nlp`` pipeline, prints every token with
    its dependency label, and matches labels by substring ("subj", then "obj",
    then "ROOT").

    Returns:
        A ``(subject, verb, obj)`` tuple of lists of token texts.
    """
    matches = {'subj': [], 'obj': [], 'ROOT': []}
    for tok in nlp(sentence):
        # Trace output used to inspect which labels are being matched
        print(tok, tok.dep_)
        # Tuple order encodes the matching priority; the first hit wins.
        for marker in ('subj', 'obj', 'ROOT'):
            if marker in tok.dep_:
                matches[marker].append(tok.text)
                break
    return matches['subj'], matches['ROOT'], matches['obj']

def get_subject_verb_obj_new_label(sents):
    """
    Build the prompt-style label for one multi-sentence text.

    The matches of every sentence are merged per role and joined with commas
    (no space), then placed in front of the original text after the
    notebook-level ``prompt_sep_token``.

    Returns:
        "Subjects: a,b. Predicate: c. Object: d,e. <prompt_sep_token> <original text>"
    """
    merged = ([], [], [])
    for piece in sent_tokenize(sents):
        for bucket, found in zip(merged, get_all_subject_verb_object(piece)):
            bucket.extend(found)
    subjects, verbs, objs = merged
    # subjects = random.sample(subjects, min(len(subjects), 6))
    # verbs = random.sample(verbs, min(len(verbs), 6))
    # objs = random.sample(objs, min(len(objs), 6))
    return f"Subjects: {','.join(subjects)}. Predicate: {','.join(verbs)}. Object: {','.join(objs)}. {prompt_sep_token} {sents}"

In [93]:
# Build the label for the sample `highlights` text; the extractor defined in
# the previous cell also prints each token with its dependency label.
get_subject_verb_obj_new_label(highlights)


Membership nsubj
gives ROOT
the det
ICC compound
jurisdiction dobj
over prep
alleged amod
crimes pobj
committed acl
in prep
Palestinian amod
territories pobj
since prep
last amod
June pobj
. punct
Israel nsubj
and cc
the det
United compound
States conj
opposed ROOT
the det
move dobj
, punct
which nsubj
could aux
open relcl
the det
door dobj
to prep
war compound
crimes compound
investigations pobj
against prep
Israelis pobj
. punct


'Subjects: Membership,Israel,which. Predicate: gives,opposed. Object: jurisdiction,crimes,territories,June,move,door,investigations,Israelis. Summary: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .'