In [1]:
import re

with open('alice_in_wonderland.txt', 'r') as f:
    text = f.read()

with open ('stopwords.txt','r') as f:
    sw = f.read()

In [2]:
# Очищаем текст от знаков и стоп-слов
def remove_punctuation(text):

    return re.sub(r'[^\w\s]', ' ', text)
     
def remove_sw(text, sw):
    stopwords = sw.split() # разбейте строку со стоп-словами на список слов
    words = text.split() # разбейте текст из переменной text на слова
    filtered_words = []
    
    for word in words:
        if word.lower() not in stopwords: 
            filtered_words.append(word)   
    cleaned_text = ' '.join(filtered_words) # объедините слова из списка filtered_words в строку с join
    
    return cleaned_text



In [3]:
remove_punctuation('This is a sample text! Wow :-)')

'This is a sample text  Wow    '

In [4]:
remove_sw('Below is the text without stopwords', sw)

'text without stopwords'

In [5]:
# Токенизация 
def tokenization(text):
    text = text.lower()
    tokenized_text = text.split('\n')
    
    return tokenized_text

In [6]:
tokenization('Tokenize this text\nWe\'ve got to tokenize it')

['tokenize this text', "we've got to tokenize it"]

In [7]:
# Применяем фуннкцию токенизации к данным 
tokenized_data = tokenization(text)
cleaned_data = []

for element in tokenized_data:
    element = remove_punctuation(element)
    element = remove_sw(element, sw)
    
    cleaned_data.append(element)

In [8]:
print(type(tokenized_data))
print(len(tokenized_data))
print(tokenized_data[0][:100])
print(tokenized_data[1][:100])

<class 'list'>
3600
alice's adventures in wonderland



In [9]:
formatted_texts = []
for text in cleaned_data:
  if len(text) >0:
    formatted_text = f'<SoS>{text.strip()}<EoS>'
    formatted_texts.append(formatted_text)
        
formatted_texts[:10]  

['<SoS>alice s adventures wonderland<EoS>',
 '<SoS>alice s adventures wonderland<EoS>',
 '<SoS>lewis carroll<EoS>',
 '<SoS>millennium fulcrum edition 3 0<EoS>',
 '<SoS>chapter<EoS>',
 '<SoS>rabbit hole<EoS>',
 '<SoS>alice beginning get tired sitting sister<EoS>',
 '<SoS>bank nothing twice<EoS>',
 '<SoS>peeped book sister reading<EoS>',
 '<SoS>pictures conversations use book<EoS>']

In [10]:
with open('aliceinwonderland_gpt2.txt','w') as f:
    f.write(('\n'.join(formatted_texts)))

In [11]:
# Лингвичтисечкий анализ 
import random

filtered_data = [] # если длина текста из cleaned_data > 0, добавляем его в список filtered_data
for text in cleaned_data:
    if len(text) > 0:
        filtered_data.append(text)

sample_size = max(1, int(len(filtered_data) * 0.1))  # задаем размер выборки: 10%
sample_data = random.sample(filtered_data, sample_size) # семплирование
sample_text = ' '.join((sample_data))

In [12]:
print(sample_text)



In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_text)

lemmas = []
for token in doc:
    lemmas.append(token.lemma_)

for i in range(30):
    print(doc[i], lemmas[i])
... # выведите пары (token, lemma) в диапазоне 30,
# т.е. для первых 30 элементов списка lemmas и для первых 30 элементов doc

In [24]:
pos_annotation = []
for token in doc:
    pos_annotation.append((token.text, token.pos_))

for i in range(30):
    print(f'{pos_annotation[i][0]} -> {pos_annotation[i][1]}')

chance -> NOUN
began -> VERB
looking -> VERB
everything -> PRON
trumpet -> NOUN
one -> NUM
hand -> NOUN
scroll -> NOUN
parchment -> VERB
exactly -> ADV
nothing -> PRON
happened -> VERB
use -> NOUN
now -> ADV
thought -> VERB
poor -> ADJ
alice -> NOUN
pretend -> VERB
two -> NUM
people -> NOUN
melancholy -> ADJ
tone -> NOUN
nobody -> PRON
seems -> VERB
like -> SCONJ
m -> PROPN
reply -> NOUN
fear -> NOUN
forget -> VERB
end -> VERB


In [25]:
# Синтаксичсекая разметка 
from spacy import displacy # строим синтаксическое дерево 

def get_random_sentence(): # пишем функцию для разбора рандомного предложения
  return random.choice(cleaned_data)

random_sentence = get_random_sentence()
doc_ = nlp(random_sentence)

syntax_annotation = [(token.text, token.dep_, token.head.text) for token in doc_]
displacy.render(doc_, style="dep", jupyter=True, options={
    "distance": 120,  # Adjust the distance between nodes
    "font": "Arial",  # Set the font
    "color": "white",  # Set the color
    "bg": "rgb(0, 50, 0)"   # Set the background color
})

In [26]:
# Разметка NER
ner_annotation = [(ent.text, ent.label_) for ent in doc.ents]
displacy.render(doc, style="ent", jupyter=True)

In [55]:
from pprint import pprint
results = dict()
results['Original Text'] = [doc[:10].text]
results['Lemmatized Text'] = [lemmas[:10]]
results['POS Annotation	'] = [pos_annotation[:10]]
results['ner'] = [ner_annotation[:10]]
pprint(results)

{'Lemmatized Text': [['chance',
                      'begin',
                      'look',
                      'everything',
                      'trumpet',
                      'one',
                      'hand',
                      'scroll',
                      'parchment',
                      'exactly']],
 'Original Text': ['chance began looking everything trumpet one hand scroll '
                   'parchment exactly'],
 'POS Annotation\t': [[('chance', 'NOUN'),
                       ('began', 'VERB'),
                       ('looking', 'VERB'),
                       ('everything', 'PRON'),
                       ('trumpet', 'NOUN'),
                       ('one', 'NUM'),
                       ('hand', 'NOUN'),
                       ('scroll', 'NOUN'),
                       ('parchment', 'VERB'),
                       ('exactly', 'ADV')]],
 'ner': [[('two', 'CARDINAL'),
          ('one', 'CARDINAL'),
          ('three inches', 'QUANTITY'),
          ('one', 'CAR

In [56]:
import pandas as pd

df = pd.DataFrame(results) # создаем датафрейм с результатами
df.head()



Unnamed: 0,Original Text,Lemmatized Text,POS Annotation\t,ner
0,chance began looking everything trumpet one ha...,"[chance, begin, look, everything, trumpet, one...","[(chance, NOUN), (began, VERB), (looking, VERB...","[(two, CARDINAL), (one, CARDINAL), (three inch..."


In [35]:
df.to_csv('nlp_results.csv', index=False)

In [31]:
from datasets import Dataset   

In [62]:
alice_dataset = Dataset.from_dict(results)

In [63]:
alice_dataset

Dataset({
    features: ['Original Text', 'Lemmatized Text', 'POS Annotation\t', 'ner'],
    num_rows: 1
})

In [70]:
from huggingface_hub import create_repo
create_repo("Evgy23/Alice_dataset")

RepoUrl('https://huggingface.co/Evgy23/Alice_dataset', endpoint='https://huggingface.co', repo_type='model', repo_id='Evgy23/Alice_dataset')

In [71]:
ds.push_to_hub("Evgy23/Alice_dataset")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Evgy23/Alice_dataset/commit/9ea7a6bcdad521cb68d2b11c9b923eb873e4b945', commit_message='Upload dataset', commit_description='', oid='9ea7a6bcdad521cb68d2b11c9b923eb873e4b945', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Evgy23/Alice_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Evgy23/Alice_dataset'), pr_revision=None, pr_num=None)

In [74]:
from datasets import load_dataset

ds = load_dataset("Evgy23/Alice_dataset")

ds['train']['ner'][0][:10]

[['two', 'CARDINAL'],
 ['one', 'CARDINAL'],
 ['three inches', 'QUANTITY'],
 ['one', 'CARDINAL'],
 ['earls mercia northumbria', 'PERSON'],
 ['first', 'ORDINAL'],
 ['three', 'CARDINAL'],
 ['next day', 'DATE'],
 ['maybe footman', 'TIME'],
 ['stays year', 'DATE']]