<a href="https://colab.research.google.com/github/DanaJian/Machine-learning-technologies/blob/main/Natural_language_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
import requests
import collections
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [80]:
data_url = requests.get('https://www.gutenberg.org/files/11/11-0.txt')
text = data_url.content.decode('utf-8')
text[:500]

'\ufeffThe Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\nof the Project Gutenberg License included with this eBook or online at\r\nwww.gutenberg.org. If you are not located in the United States, you\r\nwill have to check the laws of the country where you are'

1. ПРЕДОБРАБОТКА ТЕКСТА

In [81]:
# Нормализация заголовков глав
chapter_titles = [
    'CHAPTER I.     Down the Rabbit-Hole',
    'CHAPTER II.    The Pool of Tears',
    'CHAPTER III.   A Caucus-Race and a Long Tale',
    'CHAPTER IV.    The Rabbit Sends in a Little Bill',
    'CHAPTER V.     Advice from a Caterpillar',
    'CHAPTER VI.    Pig and Pepper',
    'CHAPTER VII.   A Mad Tea-Party',
    'CHAPTER VIII.  The Queen’s Croquet-Ground',
    'CHAPTER IX.    The Mock Turtle’s Story',
    'CHAPTER X.     The Lobster Quadrille',
    'CHAPTER XI.    Who Stole the Tarts?',
    'CHAPTER XII.   Alice’s Evidence']

for i in range(len(chapter_titles)):
    title = re.sub(r'[^\w\s_]', '', chapter_titles[i].lower().strip()).replace('_', '')
    chapter_titles[i] = re.sub(' +', ' ', title)
    
chapter_titles

['chapter i down the rabbithole',
 'chapter ii the pool of tears',
 'chapter iii a caucusrace and a long tale',
 'chapter iv the rabbit sends in a little bill',
 'chapter v advice from a caterpillar',
 'chapter vi pig and pepper',
 'chapter vii a mad teaparty',
 'chapter viii the queens croquetground',
 'chapter ix the mock turtles story',
 'chapter x the lobster quadrille',
 'chapter xi who stole the tarts',
 'chapter xii alices evidence']

In [82]:
# Нормализация текста
text = text.replace('\r', ' ').replace('\n', ' ')
text = re.sub(r'[^\w\s_]', '', text.lower().strip()).replace('_', '')
text = re.sub(' +', ' ', text)

text[:500]

'the project gutenberg ebook of alices adventures in wonderland by lewis carroll this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online at wwwgutenbergorg if you are not located in the united states you will have to check the laws of the country where you are located before us'

2. РАЗДЕЛЕНИЕ ТЕКСТА НА ГЛАВЫ

In [83]:
start_index = text.rfind(chapter_titles[0])
end_index = text.rfind('the end')
text = text[start_index:end_index].strip()

chapters = []
i = 0

while i < len(chapter_titles):
    start = text.find(chapter_titles[i])
    if start != -1:
        end = text.find(chapter_titles[i + 1]) if i < len(chapter_titles) - 1 else len(text)
        chapter = text[start:end].strip()
        chapters.append(chapter)
        text = text[end:]
    else:
        i += 1

3. НАХОЖДЕНИЕ ТОП 20-СЛОВ ПО ГЛАВЕ ПРИ ПОМОЩИ МЕТОДА TF-IDF

In [84]:
# Лемматизация текста и удаление стоп-слов
for i, chapter in enumerate(chapters):
  words = word_tokenize(chapter)
  stop_words = stopwords.words('english')

  filtered_words = list(filter(lambda word: word not in stop_words, words))

  lemmatizer = WordNetLemmatizer()
  stemmed = [lemmatizer.lemmatize(word) for word in filtered_words]
  porter = PorterStemmer()
  stemmed = [porter.stem(word) for word in filtered_words]

  chapters[i] = ' '.join(stemmed)

In [85]:
chapters_tfidfs = []

for chapter in chapters:
  tfidfv = TfidfVectorizer()
  tfidf = tfidfv.fit_transform([chapter]).toarray()
  features = tfidfv.get_feature_names_out()
  chapters_tfidfs.append((features, np.squeeze(tfidf)))

In [86]:
for i, chapter in enumerate(chapters_tfidfs):
  features, tfidf = chapter
  df = pd.DataFrame({'TF-IDF': tfidf}, index=features)
  df.index.name = 'Слово'
  df = df.sort_values('TF-IDF', ascending=False)
  print(f'Глава {i+1}:\n', df.head(20))
  print('\n')

Глава 1:
            TF-IDF
Слово            
alic     0.417817
littl    0.223831
like     0.179065
think    0.164143
way      0.164143
see      0.149220
get      0.149220
door     0.134298
one      0.134298
said     0.119376
wonder   0.119376
could    0.119376
go       0.119376
eat      0.119376
say      0.119376
thought  0.119376
time     0.119376
tri      0.119376
look     0.104454
well     0.104454


Глава 2:
            TF-IDF
Слово            
alic     0.372762
littl    0.243729
mous     0.229392
im       0.186381
said     0.172044
dear     0.157707
go       0.157707
like     0.143370
thing    0.143370
cri      0.143370
went     0.129033
oh       0.129033
thought  0.129033
must     0.129033
way      0.114696
cat      0.114696
time     0.114696
feet     0.114696
pool     0.114696
one      0.114696


Глава 3:
             TF-IDF
Слово             
said      0.518194
alic      0.350543
mous      0.304820
know      0.182892
dodo      0.182892
one       0.121928
soon      0.106687
bir

4. НАХОЖДЕНИЕ ТОП 20-СЛОВ ПО ГЛАВЕ ПРИ ПОМОЩИ МЕТОДА LDA

In [87]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(chapters)

lda = LatentDirichletAllocation(n_components=len(chapters))
lda.fit(X)

feature_names = vectorizer.get_feature_names_out()

for i, topic in enumerate(lda.components_):
    print(f"Глава {i+1}:")
    print("\n".join([feature_names[i] for i in topic.argsort()[:-21:-1]]))
    print()

Глава 1:
caterpillar
said
pigeon
serpent
im
size
tri
well
ive
minut
old
egg
youth
bit
chang
mouth
father
hookah
alic
last

Глава 2:
pictur
love
school
signifi
lap
return
rumbl
foolish
asham
latin
romeno
poki
faceand
hundr
harm
nile
bath
boot
heard
fast

Глава 3:
said
king
alic
hatter
look
one
queen
court
would
rabbit
white
juri
littl
dormous
voic
could
know
began
must
read

Глава 4:
alic
littl
like
way
think
see
get
door
one
wonder
time
tri
say
eat
go
thought
could
said
noth
found

Глава 5:
pictur
love
school
signifi
lap
return
rumbl
foolish
asham
latin
romeno
poki
faceand
hundr
harm
nile
bath
boot
heard
fast

Глава 6:
pictur
love
school
signifi
lap
return
rumbl
foolish
asham
latin
romeno
poki
faceand
hundr
harm
nile
bath
boot
heard
fast

Глава 7:
said
alic
mous
know
littl
dear
must
one
im
thing
dodo
say
like
seem
cri
ill
oh
thought
would
could

Глава 8:
pictur
love
school
signifi
lap
return
rumbl
foolish
asham
latin
romeno
poki
faceand
hundr
harm
nile
bath
boot
heard
fast

Глава 9:
al