In [168]:
!pip install datasets nltk spacy -q

In [169]:
from datasets import Dataset
import pandas as pd
import spacy
import nltk
from nltk import pos_tag, word_tokenize, download
from spacy import displacy

In [170]:
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




In [171]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [172]:
with open("/rainbow_valley.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [173]:
text = nlp(text)
text = [i.text for i in text.sents][2]
text

'The sea moaned eerily on the sand-bar, sorrowful even in\nspring, but a sly, jovial wind came piping down the red harbour road\nalong which Miss Cornelia’s comfortable, matronly figure was making its\nway towards the village of Glen St. Mary.'

In [174]:
def nltk_pos_tagging(text):
    tokens = word_tokenize(text) #https://www.nltk.org/_modules/nltk/tokenize.html#word_tokenize
    return pos_tag(tokens) #https://www.nltk.org/_modules/nltk/tag.html#pos_tag

In [175]:
text

'The sea moaned eerily on the sand-bar, sorrowful even in\nspring, but a sly, jovial wind came piping down the red harbour road\nalong which Miss Cornelia’s comfortable, matronly figure was making its\nway towards the village of Glen St. Mary.'

In [176]:
nltk_pos_results = nltk_pos_tagging(text)

In [177]:
nltk_pos_results

[('The', 'DT'),
 ('sea', 'NN'),
 ('moaned', 'VBD'),
 ('eerily', 'RB'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('sand-bar', 'NN'),
 (',', ','),
 ('sorrowful', 'JJ'),
 ('even', 'RB'),
 ('in', 'IN'),
 ('spring', 'NN'),
 (',', ','),
 ('but', 'CC'),
 ('a', 'DT'),
 ('sly', 'RB'),
 (',', ','),
 ('jovial', 'JJ'),
 ('wind', 'NN'),
 ('came', 'VBD'),
 ('piping', 'VBG'),
 ('down', 'RP'),
 ('the', 'DT'),
 ('red', 'JJ'),
 ('harbour', 'NN'),
 ('road', 'NN'),
 ('along', 'IN'),
 ('which', 'WDT'),
 ('Miss', 'NNP'),
 ('Cornelia', 'NNP'),
 ('’', 'NNP'),
 ('s', 'NN'),
 ('comfortable', 'NN'),
 (',', ','),
 ('matronly', 'RB'),
 ('figure', 'NN'),
 ('was', 'VBD'),
 ('making', 'VBG'),
 ('its', 'PRP$'),
 ('way', 'NN'),
 ('towards', 'IN'),
 ('the', 'DT'),
 ('village', 'NN'),
 ('of', 'IN'),
 ('Glen', 'NNP'),
 ('St.', 'NNP'),
 ('Mary', 'NNP'),
 ('.', '.')]

In [178]:
def spacy_annotation(text):
  doc = nlp(text)
  # Инициализация списков для хранения результатов
  pos_tags = []
  named_entities = []
  for token in doc:
    pos_tags = (token.text, token.pos_)
  for ent in doc.ents:
    named_entities = (ent.text, ent.label_)
  dependencies = [(token.text, token.dep_, token.head.text) for token in doc] #https://spacy.io/usage/linguistic-features/
  return {
      "pos_tags": pos_tags,
      "named_entities": named_entities,
      "dependencies": dependencies
  }

In [179]:
spacy_results = spacy_annotation(text)

In [182]:
texts = list()

In [183]:
texts.append(text)

In [184]:
data = []

for i, text in enumerate(texts):
  data.append({
      "text": text,
      "nltk_pos": nltk_pos_tagging(text),
      "spacy_pos": spacy_annotation(text)["pos_tags"],
      "named_entities": spacy_annotation(text)["named_entities"],
      "dependencies": spacy_annotation(text)["dependencies"]
    })



In [185]:
df = pd.DataFrame(data)
df

Unnamed: 0,text,nltk_pos,spacy_pos,named_entities,dependencies
0,"The sea moaned eerily on the sand-bar, sorrowf...","[(The, DT), (sea, NN), (moaned, VBD), (eerily,...","(., PUNCT)","(Glen St. Mary, PERSON)","[(The, det, sea), (sea, nsubj, moaned), (moane..."


In [186]:

example_text = texts[0]
doc = nlp(example_text)
displacy.render(doc, style="dep", jupyter=True, options={"distance": 120})

In [187]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [194]:
dataset.push_to_hub("15karina/dataset", token='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')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


CommitInfo(commit_url='https://huggingface.co/datasets/15karina/dataset/commit/52d1a43270990f54363eb811a0a4b2ee667aac23', commit_message='Upload dataset', commit_description='', oid='52d1a43270990f54363eb811a0a4b2ee667aac23', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/15karina/dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='15karina/dataset'), pr_revision=None, pr_num=None)