In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from collections import Counter
import regex as re
from datasets import load_dataset
from nltk.tokenize import word_tokenize
import numpy as np
import json

In [2]:
# Size of the working corpus, in tokens. Everything past this is discarded.
MAX_TOKENS = 50_000_000

# NOTE(review): trust_remote_code=True permits the dataset repository to run
# its own loading code on this machine — confirm it is still required here.
data = load_dataset("Salesforce/wikitext", 'wikitext-103-raw-v1', trust_remote_code=True)
texts = [item["text"] for item in data['train']]

# Tokenize the articles in order, but stop as soon as the token budget is
# reached: tokenizing the rest of the split only to slice it away is wasted work.
dataset = []
for text in texts:
    dataset.extend(word_tokenize(text))
    if len(dataset) >= MAX_TOKENS:
        break

# The last article processed may overshoot the budget, so trim to the exact size.
dataset = dataset[:MAX_TOKENS]
print(len(dataset))

50000000


In [3]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    The decision is made on the first character of ``treebank_tag``:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag (including an empty string) falls
    back to wordnet.NOUN.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps the empty-tag case on the fallback path.
    return by_initial.get(treebank_tag[:1], wordnet.NOUN)

In [4]:
# Tag the token list as-is. pos_tag only needs to iterate over the tokens, so
# copying them into a NumPy array first is unnecessary — and a NumPy string
# array is fixed-width, so every token would be padded to the longest one.
tagged_dataset = pos_tag(dataset)

# Preview the first (token, tag) pairs only; echoing the full tagged corpus
# would flood the notebook output.
tagged_dataset[:50]

[('=', 'JJ'),
 ('Valkyria', 'NNP'),
 ('Chronicles', 'NNP'),
 ('III', 'NNP'),
 ('=', 'NNP'),
 ('Senjō', 'NNP'),
 ('no', 'DT'),
 ('Valkyria', 'NNP'),
 ('3', 'CD'),
 (':', ':'),
 ('Unrecorded', 'VBN'),
 ('Chronicles', 'NNP'),
 ('(', '('),
 ('Japanese', 'JJ'),
 (':', ':'),
 ('戦場のヴァルキュリア3', 'NN'),
 (',', ','),
 ('lit', 'NN'),
 ('.', '.'),
 ('Valkyria', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Battlefield', 'NNP'),
 ('3', 'CD'),
 (')', ')'),
 (',', ','),
 ('commonly', 'RB'),
 ('referred', 'VBD'),
 ('to', 'TO'),
 ('as', 'IN'),
 ('Valkyria', 'NNP'),
 ('Chronicles', 'NNP'),
 ('III', 'NNP'),
 ('outside', 'IN'),
 ('Japan', 'NNP'),
 (',', ','),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('tactical', 'JJ'),
 ('role', 'NN'),
 ('@', 'SYM'),
 ('-', ':'),
 ('@', 'NN'),
 ('playing', 'VBG'),
 ('video', 'JJ'),
 ('game', 'NN'),
 ('developed', 'VBN'),
 ('by', 'IN'),
 ('Sega', 'NNP'),
 ('and', 'CC'),
 ('Media.Vision', 'NNP'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('PlayStation', 'NNP'),
 ('Portable', 'NNP'),
 ('.', '.'),
 (

In [5]:
# Replace each token by its WordNet lemma, choosing the lemmatizer's POS from
# the token's tag via get_wordnet_pos.
lemmatizer = WordNetLemmatizer()
lemmas = []
for token, treebank_tag in tagged_dataset:
    wordnet_pos = get_wordnet_pos(treebank_tag)
    lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
# Rebind only once the whole pass has finished; positions still line up
# one-to-one with tagged_dataset.
dataset = lemmas
print(dataset[5000:5100])

['the', 'governor', 'agree', 'to', 'three', 'provision', ':', 'The', 'governor', 'would', 'take', 'possession', 'of', 'the', 'arsenal', 'in', 'the', 'name', 'of', 'the', 'United', 'States', '.', 'The', 'soldier', 'would', 'be', 'allow', 'safe', 'passage', 'in', 'any', 'direction', 'carry', 'any', 'personal', 'and', 'public', 'property', 'besides', 'munition', 'of', 'war', '.', 'The', 'soldier', 'would', 'be', 'allow', 'to', 'march', 'away', 'a', 'men', 'leave', 'under', 'order', ',', 'not', 'a', 'conquer', 'and', 'surrender', 'soldier', '.', 'On', 'the', 'morning', 'of', 'February', '8', ',', '1861', ',', 'Rector', 'and', 'Totten', 'sign', 'an', 'agreement', 'place', 'the', 'arsenal', 'in', 'the', 'hand', 'of', 'state', 'official', '.', 'That', 'afternoon', ',', 'the', 'citizen', 'militia', 'march', 'to', 'the', 'arsenal']


In [6]:
# Frequency of every lemma whose token was tagged as a verb (tag starts with
# 'V') and that consists solely of ASCII letters.
ascii_word = re.compile(r'^[a-zA-Z]+$')
verb_counts = Counter()
for (token, tag), lemma in zip(tagged_dataset, dataset):
    if tag.startswith('V') and ascii_word.match(lemma):
        verb_counts[lemma] += 1

# Keep just the lemmas of the 50 most frequent verbs, as a tuple.
top_verb_pairs = verb_counts.most_common(50)
most_common_verbs = list(zip(*top_verb_pairs))[0]

In [18]:
# Display the full lemma -> frequency Counter for verb-tagged tokens.
verb_counts

Counter({'be': 1254523,
         'have': 303680,
         'make': 65781,
         'include': 62390,
         'use': 54574,
         'do': 53249,
         'become': 51174,
         'take': 50655,
         'write': 39816,
         'say': 39421,
         'begin': 35516,
         'give': 34700,
         'know': 32130,
         'play': 32126,
         'find': 31777,
         'call': 30306,
         'go': 27409,
         'release': 27050,
         'win': 26159,
         'lead': 25546,
         'receive': 25472,
         'follow': 25114,
         'leave': 24454,
         'come': 24204,
         'remain': 21561,
         'continue': 21171,
         'describe': 21150,
         'move': 21037,
         'work': 20969,
         'hold': 20390,
         'appear': 19166,
         'return': 19157,
         'see': 18987,
         'build': 18548,
         'produce': 17660,
         'create': 17350,
         'reach': 17269,
         'provide': 17041,
         'base': 17006,
         'allow': 16849,
      

In [19]:
# Display the tuple of the 50 most frequent verb lemmas, most frequent first.
most_common_verbs

('be',
 'have',
 'make',
 'include',
 'use',
 'do',
 'become',
 'take',
 'write',
 'say',
 'begin',
 'give',
 'know',
 'play',
 'find',
 'call',
 'go',
 'release',
 'win',
 'lead',
 'receive',
 'follow',
 'leave',
 'come',
 'remain',
 'continue',
 'describe',
 'move',
 'work',
 'hold',
 'appear',
 'return',
 'see',
 'build',
 'produce',
 'create',
 'reach',
 'provide',
 'base',
 'allow',
 'name',
 'serve',
 'run',
 'show',
 'consider',
 'feature',
 'state',
 'start',
 'get',
 'record')

In [7]:
# For each frequent verb lemma, tally the lemmas that immediately follow it,
# are purely alphabetic, and whose token carries a noun tag (starts with 'N').
# NOTE(review): a position is matched on its lemma alone; its own POS tag is
# not checked, so noun uses of e.g. 'state' or 'record' are counted as well.
counts = {verb: dict() for verb in most_common_verbs}

# Stop one position before the end: the final token has no successor, and
# reading dataset[i+1] there raised IndexError whenever the corpus happened to
# end on one of the tracked verbs.
for i in range(len(dataset) - 1):
    # Dict-key membership is equivalent to scanning most_common_verbs, minus the scan.
    if dataset[i] in counts:
        nouns = counts[dataset[i]]
        next_word = dataset[i+1]
        if next_word.isalpha() and tagged_dataset[i+1][1].startswith('N'):
            nouns[next_word] = nouns.get(next_word, 0) + 1

# Keep only the 50 most frequent following nouns per verb, highest count first.
for verb in counts:
    counts[verb] = dict(sorted(counts[verb].items(), key=lambda x: x[1], reverse=True)[:50])

In [13]:
# List the verbs that have an entry in counts, one per line.
for verb_name in counts.keys():
    print(verb_name)

be
have
make
include
use
do
become
take
write
say
begin
give
know
play
find
call
go
release
win
lead
receive
follow
leave
come
remain
continue
describe
move
work
hold
appear
return
see
build
produce
create
reach
provide
base
allow
name
serve
run
show
consider
feature
state
start
get
record


In [8]:
# Show each verb next to its table of following-noun counts.
for verb_name, noun_table in counts.items():
    print(verb_name, noun_table)

be {'part': 3139, 'something': 518, 'evidence': 353, 'nothing': 351, 'member': 309, 'today': 264, 'briefly': 251, 'home': 249, 'time': 219, 'John': 196, 'kind': 151, 'friend': 127, 'president': 118, 'anything': 106, 'report': 102, 'concern': 99, 'people': 89, 'number': 88, 'William': 87, 'speculation': 85, 'plan': 84, 'fond': 80, 'US': 79, 'reminiscent': 77, 'chairman': 76, 'James': 71, 'proud': 70, 'prone': 67, 'Robert': 66, 'water': 65, 'example': 63, 'Hurricane': 63, 'woman': 60, 'unaware': 60, 'New': 59, 'Grade': 59, 'Michael': 59, 'sunk': 58, 'sort': 57, 'Australia': 57, 'Latin': 57, 'Captain': 56, 'President': 56, 'child': 56, 'men': 55, 'God': 54, 'hand': 54, 'St': 53, 'darker': 52, 'Richard': 52}
have {'difficulty': 361, 'sex': 333, 'access': 310, 'nothing': 248, 'trouble': 241, 'child': 148, 'something': 140, 'time': 133, 'problem': 116, 'fun': 81, 'plan': 78, 'anything': 77, 'power': 68, 'experience': 63, 'someone': 59, 'feeling': 50, 'success': 49, 'control': 48, 'briefly': 

In [10]:
# Save the verb -> {noun: count} mapping as JSON in the working directory.
with open('data.json', 'w') as out_file:
    out_file.write(json.dumps(counts))

In [15]:
def intersections(verb1, verb2):
    """Return the set of nouns recorded in ``counts`` for both verbs.

    Both arguments must be keys of the module-level ``counts`` mapping;
    an unknown verb raises KeyError.
    """
    first_nouns = counts[verb1]
    second_nouns = counts[verb2]
    return {noun for noun in first_nouns if noun in second_nouns}

In [21]:
# Print the shared nouns for every ordered pair of distinct verbs
# (each unordered pair therefore appears twice, once per order).
for first_verb in counts:
    for second_verb in counts:
        if first_verb == second_verb:
            continue
        print(first_verb, second_verb, intersections(first_verb, second_verb))

be have {'something', 'briefly', 'nothing', 'time', 'child', 'plan', 'people', 'concern', 'anything'}
be make {'part', 'plan', 'people', 'something', 'friend'}
be include {'St', 'part', 'New', 'William', 'Robert', 'member', 'James', 'Captain', 'Richard', 'woman', 'John', 'Michael', 'Australia', 'child', 'President'}
be use {'part', 'today', 'hand', 'water', 'time'}
be do {'nothing', 'today', 'James', 'kind', 'God', 'something', 'anything'}
be become {'president', 'home', 'New', 'part', 'member', 'US', 'President', 'chairman', 'something', 'friend'}
be take {'home', 'time', 'part'}
be write {'evidence', 'home', 'something', 'anything'}
be say {'New', 'nothing', 'John', 'Michael', 'something', 'anything'}
be begin {'plan'}
be give {'evidence', 'woman', 'time', 'number', 'people'}
be know {'example', 'nothing', 'today', 'God', 'people', 'something', 'anything'}
be play {'home', 'time', 'part', 'something'}
be find {'evidence', 'nothing', 'today', 'water', 'time', 'God', 'people', 'somethi