In [1]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Text processing in Python (continued)
- [NLTK](http://www.nltk.org/book/)
- gensim
- word2vec
- scikit-learn

In [2]:
import nltk
from nltk.corpus import inaugural

We have imported the `inaugural` corpus, now let's print all the files that are in it

In [3]:
inaugural.fileids()

['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1

We get the words (tokens) from the `1789-Washington.txt` file of the inaugural corpus

In [4]:
inaugural.words('1789-Washington.txt')

['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ...]

In [5]:
len(inaugural.words('1789-Washington.txt'))

1538

Now let's get the sentences from Washington's inaugural speech, immediately broken down into tokens 

In [6]:
inaugural.sents('1789-Washington.txt')

[['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':'], ['Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ...]

Let's break down the corpus of inaugural speeches from different years into objects. Each object will contain data about a single inaugural speech: the president's name, the year, and the text of the speech

In [7]:
# Build one record per inaugural address. File ids look like
# '<year>-<President>.txt', so both fields can be sliced out of the id.
texts = []
for fileid in inaugural.fileids():
    year, name = fileid[:4], fileid[5:].partition('.')[0]
    # Re-join the corpus tokens with single spaces to get one string per speech.
    text = ' '.join(inaugural.words(fileid))
    texts.append(dict(name=name, year=year, text=text))

In [8]:
texts[0:3]

[{'name': 'Washington',
  'year': '1789',
  'text': 'Fellow - Citizens of the Senate and of the House of Representatives : Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order , and received on the 14th day of the present month . On the one hand , I was summoned by my Country , whose voice I can never hear but with veneration and love , from a retreat which I had chosen with the fondest predilection , and , in my flattering hopes , with an immutable decision , as the asylum of my declining years -- a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination , and of frequent interruptions in my health to the gradual waste committed on it by time . On the other hand , the magnitude and difficulty of the trust to which the voice of my country called me , being sufficient to awaken in the wisest and most experienced of her c

## Tokenization

In [9]:
text = texts[0]['text']

We tokenize the text, breaking it down into sentences.

In [10]:
sentences = nltk.sent_tokenize(text)

Now all the sentences of the first inaugural speech are stored in the `sentences` list

In [11]:
sentences[:5]

['Fellow - Citizens of the Senate and of the House of Representatives : Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order , and received on the 14th day of the present month .',
 'On the one hand , I was summoned by my Country , whose voice I can never hear but with veneration and love , from a retreat which I had chosen with the fondest predilection , and , in my flattering hopes , with an immutable decision , as the asylum of my declining years -- a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination , and of frequent interruptions in my health to the gradual waste committed on it by time .',
 'On the other hand , the magnitude and difficulty of the trust to which the voice of my country called me , being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qua

In [12]:
sent = sentences[0]

We do the same now, breaking down the first sentence into tokens 

In [13]:
tokens = nltk.word_tokenize(sent)

In [14]:
tokens

['Fellow',
 '-',
 'Citizens',
 'of',
 'the',
 'Senate',
 'and',
 'of',
 'the',
 'House',
 'of',
 'Representatives',
 ':',
 'Among',
 'the',
 'vicissitudes',
 'incident',
 'to',
 'life',
 'no',
 'event',
 'could',
 'have',
 'filled',
 'me',
 'with',
 'greater',
 'anxieties',
 'than',
 'that',
 'of',
 'which',
 'the',
 'notification',
 'was',
 'transmitted',
 'by',
 'your',
 'order',
 ',',
 'and',
 'received',
 'on',
 'the',
 '14th',
 'day',
 'of',
 'the',
 'present',
 'month',
 '.']

## Stemming

Stemming reduces words to their stems by stripping suffixes; the result need not be a valid dictionary word. Example in Slovak: *ryba -> ryb*.

In [15]:
porter = nltk.PorterStemmer()

In [16]:
[porter.stem(token) for token in tokens]

['fellow',
 '-',
 'citizen',
 'of',
 'the',
 'senat',
 'and',
 'of',
 'the',
 'hous',
 'of',
 'repres',
 ':',
 'among',
 'the',
 'vicissitud',
 'incid',
 'to',
 'life',
 'no',
 'event',
 'could',
 'have',
 'fill',
 'me',
 'with',
 'greater',
 'anxieti',
 'than',
 'that',
 'of',
 'which',
 'the',
 'notif',
 'wa',
 'transmit',
 'by',
 'your',
 'order',
 ',',
 'and',
 'receiv',
 'on',
 'the',
 '14th',
 'day',
 'of',
 'the',
 'present',
 'month',
 '.']

## Lemmatization

Lemmatization converts words to their basic dictionary form. Example in Slovak: *rybe -> ryba*.

In [17]:
wnl = nltk.WordNetLemmatizer()

In [18]:
[wnl.lemmatize(token) for token in tokens]

['Fellow',
 '-',
 'Citizens',
 'of',
 'the',
 'Senate',
 'and',
 'of',
 'the',
 'House',
 'of',
 'Representatives',
 ':',
 'Among',
 'the',
 'vicissitude',
 'incident',
 'to',
 'life',
 'no',
 'event',
 'could',
 'have',
 'filled',
 'me',
 'with',
 'greater',
 'anxiety',
 'than',
 'that',
 'of',
 'which',
 'the',
 'notification',
 'wa',
 'transmitted',
 'by',
 'your',
 'order',
 ',',
 'and',
 'received',
 'on',
 'the',
 '14th',
 'day',
 'of',
 'the',
 'present',
 'month',
 '.']

## Part-of-Speech Tagging (POS)

Assign a grammatical category to each word

In [19]:
tagged = nltk.pos_tag(tokens)
tagged

[('Fellow', 'NNP'),
 ('-', ':'),
 ('Citizens', 'NNS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Senate', 'NNP'),
 ('and', 'CC'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('House', 'NNP'),
 ('of', 'IN'),
 ('Representatives', 'NNPS'),
 (':', ':'),
 ('Among', 'IN'),
 ('the', 'DT'),
 ('vicissitudes', 'NNS'),
 ('incident', 'NN'),
 ('to', 'TO'),
 ('life', 'NN'),
 ('no', 'DT'),
 ('event', 'NN'),
 ('could', 'MD'),
 ('have', 'VB'),
 ('filled', 'VBN'),
 ('me', 'PRP'),
 ('with', 'IN'),
 ('greater', 'JJR'),
 ('anxieties', 'NNS'),
 ('than', 'IN'),
 ('that', 'DT'),
 ('of', 'IN'),
 ('which', 'WDT'),
 ('the', 'DT'),
 ('notification', 'NN'),
 ('was', 'VBD'),
 ('transmitted', 'VBN'),
 ('by', 'IN'),
 ('your', 'PRP$'),
 ('order', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('received', 'VBD'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('14th', 'JJ'),
 ('day', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('present', 'JJ'),
 ('month', 'NN'),
 ('.', '.')]

In [20]:
nltk.help.upenn_tagset('IN')

IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...


In [21]:
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


## Named entities

Named Entity Recognition (NER) helps identify important entities like people, organizations, and locations in text. For this purpose we can use the `ne_chunk` function, which processes POS-tagged words to extract this information. This improves text analysis for tasks like categorization and information retrieval.

In [22]:
entities = nltk.chunk.ne_chunk(tagged)

We can build a tree that shows the hierarchy of the detected entities 

In [23]:
# Print the flat repr of the chunk tree; use the repr() builtin rather than
# calling the __repr__ dunder method directly.
print(repr(entities))

Tree('S', [Tree('GPE', [('Fellow', 'NNP')]), ('-', ':'), ('Citizens', 'NNS'), ('of', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('Senate', 'NNP')]), ('and', 'CC'), ('of', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('House', 'NNP')]), ('of', 'IN'), ('Representatives', 'NNPS'), (':', ':'), ('Among', 'IN'), ('the', 'DT'), ('vicissitudes', 'NNS'), ('incident', 'NN'), ('to', 'TO'), ('life', 'NN'), ('no', 'DT'), ('event', 'NN'), ('could', 'MD'), ('have', 'VB'), ('filled', 'VBN'), ('me', 'PRP'), ('with', 'IN'), ('greater', 'JJR'), ('anxieties', 'NNS'), ('than', 'IN'), ('that', 'DT'), ('of', 'IN'), ('which', 'WDT'), ('the', 'DT'), ('notification', 'NN'), ('was', 'VBD'), ('transmitted', 'VBN'), ('by', 'IN'), ('your', 'PRP$'), ('order', 'NN'), (',', ','), ('and', 'CC'), ('received', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('14th', 'JJ'), ('day', 'NN'), ('of', 'IN'), ('the', 'DT'), ('present', 'JJ'), ('month', 'NN'), ('.', '.')])


## N-grams

An n-gram is a contiguous sequence of n items from a given text or speech data. It is widely used in Natural Language Processing (NLP) to analyze patterns, predict text, and improve machine learning models

In [24]:
tokens = nltk.word_tokenize(text)

In [25]:
bigrams = list(nltk.bigrams(tokens))
bigrams[:5]

[('Fellow', '-'),
 ('-', 'Citizens'),
 ('Citizens', 'of'),
 ('of', 'the'),
 ('the', 'Senate')]

Let's look at the most frequent bigrams first. As the output below shows, the top of the list is dominated by stop words and punctuation, so we will remove them afterwards

In [26]:
nltk.FreqDist(bigrams).most_common(10)

[(('of', 'the'), 20),
 ((',', 'and'), 15),
 (('to', 'the'), 11),
 (('in', 'the'), 9),
 ((',', 'I'), 7),
 ((',', 'in'), 7),
 (('which', 'the'), 6),
 (('which', 'I'), 6),
 (('by', 'the'), 6),
 (('for', 'the'), 6)]

In [27]:
stopwords = nltk.corpus.stopwords.words('english')

In [28]:
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

Keep only the alphabetic tokens that are not in the `stopwords` list, and convert them to lowercase

In [29]:
tokens_cleared = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stopwords]

In [30]:
tokens_cleared[:10]

['fellow',
 'citizens',
 'senate',
 'house',
 'representatives',
 'among',
 'vicissitudes',
 'incident',
 'life',
 'event']

First, we analyse pairs of adjacent words - bigrams

In [31]:
nltk.FreqDist(nltk.bigrams(tokens_cleared)).most_common(10)

[(('fellow', 'citizens'), 3),
 (('house', 'representatives'), 2),
 (('united', 'states'), 2),
 (('good', 'assure'), 2),
 (('free', 'government'), 2),
 (('executive', 'department'), 2),
 (('american', 'people'), 2),
 (('public', 'good'), 2),
 (('citizens', 'senate'), 1),
 (('senate', 'house'), 1)]

Then, we analyse sequences of three adjacent words - trigrams

In [32]:
nltk.FreqDist(nltk.trigrams(tokens_cleared)).most_common(10)

[(('fellow', 'citizens', 'senate'), 1),
 (('citizens', 'senate', 'house'), 1),
 (('senate', 'house', 'representatives'), 1),
 (('house', 'representatives', 'among'), 1),
 (('representatives', 'among', 'vicissitudes'), 1),
 (('among', 'vicissitudes', 'incident'), 1),
 (('vicissitudes', 'incident', 'life'), 1),
 (('incident', 'life', 'event'), 1),
 (('life', 'event', 'could'), 1),
 (('event', 'could', 'filled'), 1)]

## WordNet

* Lexical database
* Contains synsets: nouns, verbs, adjectives, adverbs
* Connections between synsets: antonyms, hypernyms, hyponyms, holonyms, meronyms

A synset is a group of synonymous words that share the same meaning

In [33]:
from nltk.corpus import wordnet as wn

We fetch all the synsets (synonym sets) of the word "car" from WordNet. Each synset represents a different definition of the word "car"

In [34]:
wn.synsets('car')

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

We select the first synset, which is the most common meaning of "car"

In [35]:
car = wn.synset('car.n.01')

We can retrieve all the synonyms (lemmas) for the given synset (car.n.01)

In [36]:
car.lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

Also we can return the definition of the selected synset (car.n.01)

In [37]:
car.definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

We can get example sentences where this synset is used

In [38]:
car.examples()

['he needs a car to get to work']

Starting from the meaning in our synonym set, we can find hyponyms (more specific words that fall under "car")

In [39]:
car.hyponyms()

[Synset('minicar.n.01'),
 Synset('compact.n.03'),
 Synset('hot_rod.n.01'),
 Synset('cruiser.n.01'),
 Synset('hatchback.n.01'),
 Synset('sedan.n.01'),
 Synset('stock_car.n.01'),
 Synset('sports_car.n.01'),
 Synset('hardtop.n.01'),
 Synset('model_t.n.01'),
 Synset('cab.n.03'),
 Synset('minivan.n.01'),
 Synset('racer.n.02'),
 Synset('limousine.n.01'),
 Synset('used-car.n.01'),
 Synset('bus.n.04'),
 Synset('sport_utility.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('ambulance.n.01'),
 Synset('roadster.n.01'),
 Synset('convertible.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('coupe.n.01'),
 Synset('pace_car.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('jeep.n.01'),
 Synset('electric.n.01'),
 Synset('loaner.n.02')]

Conversely, we can find hypernyms: more general words that represent the "parent category" of "car" in the hierarchy.

In [40]:
car.hypernyms()

[Synset('motor_vehicle.n.01')]

We can return part meronyms of a given synset. Meronyms represent a "part-of" relationship, meaning they list things that are components or parts of the whole

In [41]:
car.part_meronyms()

[Synset('air_bag.n.01'),
 Synset('automobile_engine.n.01'),
 Synset('hood.n.09'),
 Synset('luggage_compartment.n.01'),
 Synset('roof.n.02'),
 Synset('gasoline_engine.n.01'),
 Synset('auto_accessory.n.01'),
 Synset('sunroof.n.01'),
 Synset('automobile_horn.n.01'),
 Synset('rear_window.n.01'),
 Synset('buffer.n.06'),
 Synset('fender.n.01'),
 Synset('glove_compartment.n.01'),
 Synset('floorboard.n.02'),
 Synset('car_window.n.01'),
 Synset('grille.n.02'),
 Synset('accelerator.n.01'),
 Synset('car_mirror.n.01'),
 Synset('first_gear.n.01'),
 Synset('stabilizer_bar.n.01'),
 Synset('bumper.n.02'),
 Synset('car_door.n.01'),
 Synset('reverse.n.02'),
 Synset('car_seat.n.01'),
 Synset('high_gear.n.01'),
 Synset('window.n.02'),
 Synset('tail_fin.n.02'),
 Synset('third_gear.n.01'),
 Synset('running_board.n.01')]

In [42]:
wn.synsets('black')[0].lemmas()[0].antonyms()

[Lemma('white.n.02.white')]

# Text Representation

A text document is usually represented using a bag-of-words (BoW) model, i.e. as a vector. The components of the vector correspond to individual words or n-grams from a dictionary (for the entire corpus/language). The values of the vector components can be:

* count
* frequency
* weighted frequency

Words with high frequency in a language (such as conjunctions) are referred to as *stop words* and are often removed during preprocessing.

## TF-IDF
* Term frequency * inverse document frequency
* `TF` – frequency of a word in the current document
* `IDF` – negative logarithm of the fraction of documents in the corpus that contain the word (for a given word, the value is the same in every document)
* Various variants (weighting schemes): https://en.wikipedia.org/wiki/Tf%E2%80%93idf

## Gensim
- A library for modeling topics in documents.
- Implements TF-IDF, LSA, pLSA, LDA, HDP, DTM, word2vec
- https://radimrehurek.com/gensim/tutorial.html

In [43]:
from gensim import corpora, models, similarities

In [44]:
def preprocess_text(text):
    """Tokenize ``text`` and return its lowercased, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.word_tokenize(text)`` that are purely
        alphabetic (``str.isalpha``) and whose lowercase form is not in NLTK's
        English stop-word list, lowercased and in their original order.
    """
    # A set gives constant-time membership tests; the raw stop-word list would
    # be scanned linearly once per token. The local name also no longer shadows
    # the notebook-level ``stopwords`` variable.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Lowercase each alphabetic token once, then filter on the lowercase form.
    lowered = (token.lower() for token in nltk.word_tokenize(text) if token.isalpha())
    return [token for token in lowered if token not in stop_words]

In [45]:
tokenized_docs = [preprocess_text(text['text']) for text in texts]

In [46]:
tokenized_docs[4][:10]

['proceeding',
 'fellow',
 'citizens',
 'qualification',
 'constitution',
 'requires',
 'entrance',
 'charge',
 'conferred',
 'duty']

Removing words that occur only once in the corpus

In [47]:
from collections import defaultdict

# Count how many times each token occurs across the whole corpus.
# The loop variable is named `doc` (not `text`) so that it does not rebind the
# notebook-level `text` string that earlier cells rely on.
frequency = defaultdict(int)
for doc in tokenized_docs:
    for token in doc:
        frequency[token] += 1

# Drop tokens that occur only once in the corpus; document order is preserved.
tokenized_docs = [[token for token in doc if frequency[token] > 1] for doc in tokenized_docs]

Creating a dictionary from a collection of tokenized documents. Each unique word in the tokenized_docs is assigned a unique ID. The dictionary is later used for converting text into a numerical format (e.g., bag-of-words representation)

In [48]:
dictionary = corpora.Dictionary(tokenized_docs)

In [49]:
# print(dictionary.token2id)

Converting tokenized text into a numerical representation using the Bag-of-Words (BoW) model

In [50]:
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

Training the TF-IDF model

In [51]:
tfidf_model = models.TfidfModel(corpus)

In [52]:
tfidf_corpus = tfidf_model[corpus]

In [53]:
tfidf_corpus[0][:10]

[(0, 0.02865772524274559),
 (1, 0.07082134338297177),
 (2, 0.047831914834149725),
 (3, 0.014862430544438632),
 (4, 0.041811914404857924),
 (5, 0.027307199915801005),
 (6, 0.05631662889391485),
 (7, 0.06233662932320665),
 (8, 0.02560497085348817),
 (9, 0.03714243930251071)]

Other models: LSI, LDA, ...

We can calculate the similarity of the resulting vectors:

In [54]:
index = similarities.MatrixSimilarity(tfidf_corpus)

In [55]:
index[tfidf_corpus[0]]

array([1.        , 0.05286995, 0.12033835, 0.10489108, 0.09024273,
       0.11507139, 0.08539192, 0.14273158, 0.11627034, 0.13620742,
       0.12669681, 0.10682183, 0.13267624, 0.16245481, 0.11774192,
       0.12089311, 0.12477458, 0.12077793, 0.09366583, 0.0424006 ,
       0.06105308, 0.06304477, 0.11607063, 0.0729711 , 0.10656489,
       0.0944657 , 0.08286408, 0.09186366, 0.06875983, 0.02735389,
       0.09153646, 0.06827688, 0.07756818, 0.0684045 , 0.0914682 ,
       0.06663257, 0.0597512 , 0.04235245, 0.05028749, 0.03209055,
       0.04543332, 0.07307167, 0.03767864, 0.06221583, 0.04202975,
       0.04385568, 0.04855515, 0.03685588, 0.04443101, 0.04047035,
       0.04416719, 0.04352387, 0.04177937, 0.04510492, 0.04515488,
       0.0661387 , 0.04182338, 0.0243195 , 0.04833949], dtype=float32)

## word2vec

Each word has a learned vector of real numbers that represents its various properties and captures several linguistic regularities. We can compute the similarity between words as the similarity of their vectors.

vector('Paris') - vector('France') + vector('Italy') ~= vector('Rome')

vector('king') - vector('man') + vector('woman') ~= vector('queen')

https://radimrehurek.com/gensim/models/word2vec.html

https://medium.com/@mishra.thedeepak/word2vec-in-minutes-gensim-nlp-python-6940f4e00980

In [56]:
from nltk.corpus import brown

In [57]:
# Train word vectors on the tokenized sentences of the Brown corpus.
# min_count=1 keeps every word in the vocabulary, including words seen only once.
sentences = brown.sents()
model = models.Word2Vec(sentences=sentences, min_count=1)

In [58]:
model.save('brown_model')

In [59]:
model = models.Word2Vec.load('brown_model')

Retrieving words that are most similar to "mother" based on the trained word vector space

In [60]:
print(model.wv.most_similar("mother"))

[('father', 0.980160653591156), ('husband', 0.9684260487556458), ('wife', 0.945469081401825), ('son', 0.9287609457969666), ('friend', 0.915546715259552), ('nickname', 0.9138074517250061), ('voice', 0.9070308804512024), ('brother', 0.8937575817108154), ('addiction', 0.885729968547821), ('patient', 0.8836824297904968)]


Finding the word that least belongs to a given set based on word vector similarity

In [61]:
print(model.wv.doesnt_match("breakfast cereal dinner lunch".split()))

cereal


In [62]:
print(model.wv.doesnt_match("pizza pasta garden fries".split()))

garden


Retrieving the vector representation of the word "human"

In [63]:
model.wv['human']

array([-0.507504  ,  0.33320343,  0.5284809 ,  0.52977157, -0.5455529 ,
       -0.48120877,  1.057019  ,  1.3028897 , -0.5378181 , -0.6858985 ,
       -0.01705402, -0.58223695,  0.507998  , -1.0840155 ,  0.18427342,
       -0.63021845,  0.20032336, -0.10306896, -0.7359981 , -1.0622157 ,
        0.4540688 ,  0.1313434 ,  0.7138239 ,  0.21376345, -0.2552546 ,
        0.08095742,  0.0123677 ,  0.02750621, -0.86284757,  0.06020001,
        0.4214964 , -0.41749412,  1.0922868 , -0.44855723,  0.09932699,
        0.37436983,  0.01289282, -1.0066997 , -0.24552816,  0.12656598,
        0.19874714, -0.42525837,  0.77346265, -0.00488648,  0.622342  ,
        0.25764716, -0.47221577, -0.16058224, -0.32392594,  0.24162896,
        0.22696379, -0.47384137, -0.67867714, -0.39819637, -0.81504816,
       -0.69508755,  1.2054706 , -0.10652367, -0.3895447 ,  0.23924072,
        0.17667039,  0.24526155, -0.01167543, -0.5120434 , -0.8495882 ,
        1.0171554 ,  0.32333955,  0.97309923, -1.1331737 ,  0.52

## Feature extraction using scikit-learn

https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [64]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

Extracting texts into the list docs

In [65]:
docs = [text['text'] for text in texts]

Learning the vocabulary from docs. Converting each document into a sparse matrix representation

In [66]:
vectorizer = CountVectorizer(stop_words='english')
tf = vectorizer.fit_transform(docs)

Converting the sparse matrix to an array. Each index represents a word from the vocabulary

In [67]:
tf.toarray()[0][:100]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

Retrieving the list of words (features) in the vocabulary

In [68]:
vectorizer.get_feature_names_out()[:100]

array(['000', '100', '108', '11', '120', '125', '13', '14th', '15th',
       '16', '1774', '1776', '1778', '1780', '1787', '1789', '1790',
       '1800', '1801', '1812', '1815', '1816', '1817', '1818', '1826',
       '1850', '1861', '1863', '1868', '1873', '1880', '1886', '1890',
       '1893', '1896', '1897', '1898', '1899', '18th', '1907', '1917',
       '1933', '1941', '1945', '1963', '1972', '1980', '1984', '19th',
       '20', '200', '200th', '2017', '20th', '21st', '225', '25', '30',
       '30th', '3d', '40', '400', '41', '48', '4th', '50', '50th', '60',
       '67', '6th', 'abandon', 'abandoned', 'abandonment', 'abate',
       'abdicated', 'abeyance', 'abhorring', 'abide', 'abiding',
       'abilities', 'ability', 'abject', 'able', 'ably', 'abnormal',
       'abode', 'abodes', 'abolish', 'abolished', 'abolishing',
       'aboriginal', 'aborigines', 'abound', 'abounding', 'abounds',
       'abraham', 'abreast', 'abridging', 'abroad', 'absence'],
      dtype=object)

Converting text data into a numerical format using the TF-IDF (Term Frequency-Inverse Document Frequency) representation. This is an improvement over the Bag-of-Words (BoW) model, as it assigns importance weights to words instead of just counting occurrences

In [69]:
transformer = TfidfVectorizer(stop_words='english')
tfidf = transformer.fit_transform(docs)

In [70]:
tfidf.toarray()[0][:100]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.05754421, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     