In [87]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.cluster import KMeans

## Обработка текста

In [3]:
corpus = 'When we were in Paris we visited a lot of museums. We first went to the Louvre, the largest art museum in the world. I have always been interested in art so I spent many hours there. The museum is enourmous, so a week there would not be enough.'

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Detr1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
sentences = sent_tokenize(corpus)
sentences

['When we were in Paris we visited a lot of museums.',
 'We first went to the Louvre, the largest art museum in the world.',
 'I have always been interested in art so I spent many hours there.',
 'The museum is enourmous, so a week there would not be enough.']

In [7]:
print(word_tokenize(sentences[0]))

['When', 'we', 'were', 'in', 'Paris', 'we', 'visited', 'a', 'lot', 'of', 'museums', '.']


In [8]:
tokens = []
for sentence in sentences:
    t = word_tokenize(sentence)
    tokens.extend(t)
print(tokens)

['When', 'we', 'were', 'in', 'Paris', 'we', 'visited', 'a', 'lot', 'of', 'museums', '.', 'We', 'first', 'went', 'to', 'the', 'Louvre', ',', 'the', 'largest', 'art', 'museum', 'in', 'the', 'world', '.', 'I', 'have', 'always', 'been', 'interested', 'in', 'art', 'so', 'I', 'spent', 'many', 'hours', 'there', '.', 'The', 'museum', 'is', 'enourmous', ',', 'so', 'a', 'week', 'there', 'would', 'not', 'be', 'enough', '.']


In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Detr1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
unique_stops = set(stopwords.words('english'))
no_stops = []
for token in tokens:
    token = token.lower()
    if token not in unique_stops and token.isalpha():
        no_stops.append(token)
print(no_stops)

['paris', 'visited', 'lot', 'museums', 'first', 'went', 'louvre', 'largest', 'art', 'museum', 'world', 'always', 'interested', 'art', 'spent', 'many', 'hours', 'museum', 'enourmous', 'week', 'would', 'enough']


In [17]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Detr1\AppData\Roaming\nltk_data...


True

In [18]:
lemmatizer = WordNetLemmatizer()
lemmatized = []
for token in no_stops:
    token = lemmatizer.lemmatize(token)
    lemmatized.append(token)
print(lemmatized)

['paris', 'visited', 'lot', 'museum', 'first', 'went', 'louvre', 'largest', 'art', 'museum', 'world', 'always', 'interested', 'art', 'spent', 'many', 'hour', 'museum', 'enourmous', 'week', 'would', 'enough']


In [20]:
porter = PorterStemmer()
stemmed_p = [porter.stem(s) for s in lemmatized]
print(stemmed_p)

['pari', 'visit', 'lot', 'museum', 'first', 'went', 'louvr', 'largest', 'art', 'museum', 'world', 'alway', 'interest', 'art', 'spent', 'mani', 'hour', 'museum', 'enourm', 'week', 'would', 'enough']


In [22]:
lancaster = LancasterStemmer()
stemmed_l = [lancaster.stem(s) for s in lemmatized]
print(stemmed_l)

['par', 'visit', 'lot', 'muse', 'first', 'went', 'louvr', 'largest', 'art', 'muse', 'world', 'alway', 'interest', 'art', 'spent', 'many', 'hour', 'muse', 'enourm', 'week', 'would', 'enough']


In [26]:
bow_counter = Counter(lemmatized)
print(bow_counter)

Counter({'museum': 3, 'art': 2, 'paris': 1, 'visited': 1, 'lot': 1, 'first': 1, 'went': 1, 'louvre': 1, 'largest': 1, 'world': 1, 'always': 1, 'interested': 1, 'spent': 1, 'many': 1, 'hour': 1, 'enourmous': 1, 'week': 1, 'would': 1, 'enough': 1})


In [27]:
print(bow_counter.most_common(10))

[('museum', 3), ('art', 2), ('paris', 1), ('visited', 1), ('lot', 1), ('first', 1), ('went', 1), ('louvre', 1), ('largest', 1), ('world', 1)]


In [31]:
vectorizer = CountVectorizer(analyzer = "word",
                            lowercase = True,
                            tokenizer = None,
                            preprocessor = None,
                            stop_words = ['english'],
                            max_features = 5000)

In [32]:
bow_cv = vectorizer.fit_transform(sentences)
print(type(bow_cv))

<class 'scipy.sparse.csr.csr_matrix'>


In [33]:
print(bow_cv.toarray())

[[0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 2 0 0 1 1 0 0]
 [0 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 3 0 1 0 1 0 1 0 0 1 0]
 [1 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1]]


In [34]:
bow_cv.shape

(4, 34)

In [35]:
vocab = vectorizer.vocabulary_
print(vocab)

{'when': 31, 'we': 27, 'were': 30, 'in': 9, 'paris': 20, 'visited': 26, 'lot': 13, 'of': 19, 'museums': 17, 'first': 6, 'went': 29, 'to': 25, 'the': 23, 'louvre': 14, 'largest': 12, 'art': 1, 'museum': 16, 'world': 32, 'have': 7, 'always': 0, 'been': 3, 'interested': 10, 'so': 21, 'spent': 22, 'many': 15, 'hours': 8, 'there': 24, 'is': 11, 'enourmous': 5, 'week': 28, 'would': 33, 'not': 18, 'be': 2, 'enough': 4}


In [36]:
tokens = vectorizer.get_feature_names_out()
tokens

array(['always', 'art', 'be', 'been', 'enough', 'enourmous', 'first',
       'have', 'hours', 'in', 'interested', 'is', 'largest', 'lot',
       'louvre', 'many', 'museum', 'museums', 'not', 'of', 'paris', 'so',
       'spent', 'the', 'there', 'to', 'visited', 'we', 'week', 'went',
       'were', 'when', 'world', 'would'], dtype=object)

In [37]:
index_list = []
for i, _ in enumerate(bow_cv):
    index_list.append(f'Sentence_{i}')
bow_cv_df = pd.DataFrame(data = bow_cv.toarray(),
                        index = index_list,
                        columns = tokens)
bow_cv_df

Unnamed: 0,always,art,be,been,enough,enourmous,first,have,hours,in,...,there,to,visited,we,week,went,were,when,world,would
Sentence_0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,2,0,0,1,1,0,0
Sentence_1,0,1,0,0,0,0,1,0,0,1,...,0,1,0,1,0,1,0,0,1,0
Sentence_2,1,1,0,1,0,0,0,1,1,1,...,1,0,0,0,0,0,0,0,0,0
Sentence_3,0,0,1,0,1,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1


In [38]:
bow_cv

<4x34 sparse matrix of type '<class 'numpy.int64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [40]:
tfidf_trans = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_trans.fit(bow_cv)
df_idf = pd.DataFrame(tfidf_trans.idf_, index = tokens, columns = ["idf_weights"])

In [41]:
tf_idf_vector = tfidf_trans.transform(bow_cv)
tf_idf_vector

<4x34 sparse matrix of type '<class 'numpy.float64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [46]:
df_tfidf = pd.DataFrame(tf_idf_vector.toarray(), 
                        columns = vectorizer.get_feature_names_out())
print(df_tfidf.T)

                   0         1         2         3
always      0.000000  0.000000  0.328404  0.000000
art         0.000000  0.211724  0.258918  0.000000
be          0.000000  0.000000  0.000000  0.324676
been        0.000000  0.000000  0.328404  0.000000
enough      0.000000  0.000000  0.000000  0.324676
enourmous   0.000000  0.000000  0.000000  0.324676
first       0.000000  0.268544  0.000000  0.000000
have        0.000000  0.000000  0.328404  0.000000
hours       0.000000  0.000000  0.328404  0.000000
in          0.202925  0.171408  0.209616  0.000000
interested  0.000000  0.000000  0.328404  0.000000
is          0.000000  0.000000  0.000000  0.324676
largest     0.000000  0.268544  0.000000  0.000000
lot         0.317921  0.000000  0.000000  0.000000
louvre      0.000000  0.268544  0.000000  0.000000
many        0.000000  0.000000  0.328404  0.000000
museum      0.000000  0.211724  0.000000  0.255978
museums     0.317921  0.000000  0.000000  0.000000
not         0.000000  0.000000 

In [47]:
df_tfidf.T.shape

(34, 4)

In [49]:
tfIdfVectorizer = TfidfVectorizer(use_idf = True, stop_words = 'english')
tfIdf = tfIdfVectorizer.fit_transform(sentences)

In [50]:
print(tfIdfVectorizer.get_feature_names_out())

['art' 'enourmous' 'hours' 'interested' 'largest' 'lot' 'louvre' 'museum'
 'museums' 'paris' 'spent' 'visited' 'week' 'went' 'world']


In [51]:
tfIdfVectorizer.idf_

array([1.51082562, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.51082562, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073])

In [52]:
tfIdf.shape

(4, 15)

In [53]:
df_tfidf = pd.DataFrame(tfIdf.toarray(), 
                        columns = tfIdfVectorizer.get_feature_names_out())
print(df_tfidf.T)

              0         1         2         3
art         0.0  0.344315  0.414289  0.000000
enourmous   0.0  0.000000  0.000000  0.617614
hours       0.0  0.000000  0.525473  0.000000
interested  0.0  0.000000  0.525473  0.000000
largest     0.0  0.436719  0.000000  0.000000
lot         0.5  0.000000  0.000000  0.000000
louvre      0.0  0.436719  0.000000  0.000000
museum      0.0  0.344315  0.000000  0.486934
museums     0.5  0.000000  0.000000  0.000000
paris       0.5  0.000000  0.000000  0.000000
spent       0.0  0.000000  0.525473  0.000000
visited     0.5  0.000000  0.000000  0.000000
week        0.0  0.000000  0.000000  0.617614
went        0.0  0.436719  0.000000  0.000000
world       0.0  0.436719  0.000000  0.000000


In [63]:
mean_weights = np.asarray(tfIdf.mean(axis=0)).ravel().tolist()
print(mean_weights)

[0.18965081782108964, 0.15440359274390048, 0.13136818731601646, 0.13136818731601646, 0.10917982746877804, 0.125, 0.10917982746877804, 0.2078121960479979, 0.125, 0.125, 0.13136818731601646, 0.125, 0.15440359274390048, 0.10917982746877804, 0.10917982746877804]


In [67]:
mean_weights_df = pd.DataFrame({'term': tfIdfVectorizer.get_feature_names_out(),
                               'mean_weights': mean_weights})
mean_weights_df.sort_values(by = 'mean_weights', ascending = False).reset_index(drop=True).head(10)

Unnamed: 0,term,mean_weights
0,museum,0.207812
1,art,0.189651
2,enourmous,0.154404
3,week,0.154404
4,hours,0.131368
5,interested,0.131368
6,spent,0.131368
7,lot,0.125
8,museums,0.125
9,paris,0.125


## Косинусное расстояние между текстовыми векторами

In [68]:
text1 = 'all the world’s a stage, and all the men and women merely players'
text2 = 'you must be the change you wish to see in the world'

In [69]:
corpus = [text1, text2]

In [73]:
tfIdfVectorizer = TfidfVectorizer(use_idf = True, stop_words = 'english')
X = tfIdfVectorizer.fit_transform(corpus)
print(X.toarray())

[[0.         0.4261596  0.4261596  0.4261596  0.4261596  0.
  0.4261596  0.30321606]
 [0.6316672  0.         0.         0.         0.         0.6316672
  0.         0.44943642]]


In [74]:
vectors_df = pd.DataFrame(data = X.toarray(), index = ['vector1', 'vector2'],
                         columns = tfIdfVectorizer.get_feature_names_out())
vectors_df

Unnamed: 0,change,men,merely,players,stage,wish,women,world
vector1,0.0,0.42616,0.42616,0.42616,0.42616,0.0,0.42616,0.303216
vector2,0.631667,0.0,0.0,0.0,0.0,0.631667,0.0,0.449436


In [75]:
vector1 = X.toarray()[0]
vector2 = X.toarray()[1]

In [76]:
numerator = np.dot(vector1, vector2)

In [77]:
vector1Len = np.linalg.norm(vector1)
vector2Len = np.linalg.norm(vector2)

In [78]:
denominator = vector1Len * vector2Len

In [80]:
cosine = numerator/denominator
cosine

0.13627634143908643

In [81]:
angle_radians = np.arccos(cosine)
angle_radians

1.4340946173847484

In [82]:
angle_degrees = angle_radians * 180/np.pi
round(angle_degrees, 2)

82.17

## Кластерный анализ текста

In [83]:
text = '''
Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from noisy, structured and unstructured data.
It applies knowledge and actionable insights from data across a broad range of application domains.
Data science is related to data mining, machine learning and big data.
The Bolshoi Theatre is a historic theatre in Moscow, Russia.
It was originally designed by architect Joseph Bové, which holds ballet and opera performances.
Before the October Revolution it was a part of the Imperial Theatres of the Russian Empire along with Maly Theatre in Moscow and a few theatres in Saint Petersburg.
Data science is a concept to unify statistics, data analysis, informatics, and their related methods in order to understand and analyze actual phenomena with data.
However, data science is different from computer science and information science.
The main building of the theatre, rebuilt and renovated several times during its history, is a landmark of Moscow and Russia.
On 28 October 2011, the Bolshoi re-opened after an extensive six-year renovation.
'''

In [85]:
corpus = []
for line in text.split('\n'):
    if line:
        line = line.lower()
        corpus.append(line)
corpus

['data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from noisy, structured and unstructured data.',
 'it applies knowledge and actionable insights from data across a broad range of application domains.',
 'data science is related to data mining, machine learning and big data.',
 'the bolshoi theatre is a historic theatre in moscow, russia.',
 'it was originally designed by architect joseph bové, which holds ballet and opera performances.',
 'before the october revolution it was a part of the imperial theatres of the russian empire along with maly theatre in moscow and a few theatres in saint petersburg.',
 'data science is a concept to unify statistics, data analysis, informatics, and their related methods in order to understand and analyze actual phenomena with data.',
 'however, data science is different from computer science and information science.',
 'the main building of the theatre, rebuil

In [86]:
tfIdfVectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
X = tfIdfVectorizer.fit_transform(corpus)

In [88]:
kmeans = KMeans(n_clusters=2).fit(X)



In [89]:
prediction = ['Many statisticians, including Nate Silver, have argued that data science is not a new field, but rather another name for statistics.',
              'Urusov set up the theatre in collaboration with English tightrope walker Michael Maddox.',
              'Until the mid-1990s, most foreign operas were sung in Russian, but Italian and other languages have been heard more frequently on the Bolshoi stage in recent years.']

In [90]:
tfidf_prediction = tfIdfVectorizer.transform(prediction)

In [94]:
tfidf_prediction

<3x76 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [95]:
kmeans.predict(tfidf_prediction)

array([1, 0, 0])