## Natural language processing

### Text preprocessing

In [12]:
pip install nltk



In [13]:
# take the source text for analysis
corpus = 'When we were in Paris we visited a lot of museums. We first went to the Louvre, the largest art museum in the world. I have always been interested in art so I spent many hours there. The museum is enourmous, so a week there would not be enough.'

In [14]:
# importing the main library for working with text
import nltk

# and other libraries already known to us
import pandas as pd
import numpy as np

#### Step 1. Splitting into sentences


In [15]:
# importing the sent_tokenize() function
from nltk.tokenize import sent_tokenize

# download the model that will divide the text into sentences
nltk.download('punkt')
print('')

# and apply the function to the text
sentences = sent_tokenize(corpus)
sentences




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


#### Step 2. Splitting into words

In [None]:
# importing the word_tokenize() function
from nltk.tokenize import word_tokenize

# and let's break down the first sentence into words
print(word_tokenize(sentences[0]))

In [None]:
# now let's do this with all the offers

# to do this, create an empty list
tokens = []

# in the for loop, we'll go through each sentence
for sentence in sentences:

    # creating lists of tokens
    t = word_tokenize(sentence)

    # and add the lists to each other
    tokens.extend(t)

print(tokens)

#### Step 3. Lowercase translation, removal of stop words and punctuation marks


In [None]:
# importing the stopword module
from nltk.corpus import stopwords

# download the dictionary of stop words
nltk.download('stopwords')

# we use set to leave only unique values
unique_stops = set(stopwords.words('english'))

# creating an empty list without stop words
no_stops = []

# going through all the tokens
for token in tokens:

    # we translate all words into lowercase
    token = token.lower()

    # если тоif the token is not in the list of stop words and is not a punctuation mark,
    if token not in unique_stops and token.isalpha():

        # adding it to the list
        no_stops.append(token)

print(no_stops)

#### Step 4. Lemmatization


In [None]:
# importing a class for lemmatization
from nltk.stem import WordNetLemmatizer

# importing the dictionary
nltk.download('wordnet')

# creating an object of this class
lemmatizer = WordNetLemmatizer()

# and an empty list for words after lemmatization
lemmatized = []

# going through all the tokens
for token in no_stops:

    # we apply lemmatization
    token = lemmatizer.lemmatize(token)

    # adding the word after lemmatization to the list
    lemmatized.append(token)

print(lemmatized)

#### Step 5. Stemming


In [None]:
# importing the Porter stemmer class and creating an object of this class
from nltk.stem import PorterStemmer
porter = PorterStemmer()

# we use list comprehension instead of the for loop for stemming and creating a new list
# this record is much shorter.
stemmed_p = [porter.stem(s) for s in lemmatized]
print(stemmed_p)

In [None]:
# similarly, we import the Lancaster class and create an object of this class
from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()

# We also use list_comprehension
stemmed_l = [lancaster.stem(s) for s in lemmatized]
print(stemmed_l)

### Bag of words (bag of words, bow)

Using Counter


In [None]:
# importing the Counter class from the collections module
from collections import Counter

# applying the Counter class to words after lemmatization
# the output returns the dictionary { word : its frequency in the text }
bow_counter = Counter(lemmatized)
# print(bow_counter)


# the most_common() function orders the dictionary by value
# look at the top 10 most frequent words
print(bow_counter.most_common(10))

Using CountVectorizer


In [None]:
# importing the CountVectorizer class from the Scikit-learn library
from sklearn.feature_extraction.text import CountVectorizer

# creating an object of this class and
# we indicate that we want to translate the words into lowercase, as well as
# filter out stop words using stop_words = 'english'
vectorizer = CountVectorizer(analyzer = "word",
                             lowercase = True,
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english',
                             max_features = 5000)

In [None]:
# we apply this object to sentences (they also say documents)
bow_cv = vectorizer.fit_transform(sentences)

# the output is a csr matrix
print(type(bow_cv))

In [None]:
# convert the csr matrix to the familiar Numpy array format
# to do this, you can use .toarray()
print(bow_cv.toarray())

In [None]:
# rows are sentences (documents), columns are words (tokens)
bow_cv.shape

In [None]:
# we can look at the tokens (words) used

# Here, numbers are not a frequency, but simply an ordinal number (index).
vocab = vectorizer.vocabulary_
print(vocab)

In [None]:
# you can output words without an index
tokens = vectorizer.get_feature_names_out()
print(tokens)

In [None]:
# for convenience, we will convert the matrix into a dataframe

# first, let's create an index of proposals (documents)
index_list = []

# in the loop, we will go through the elements of the matrix, denoting them by '_'
# the enumerate function will set an index for each element, starting from 0
for i, _ in enumerate(bow_cv):

    # let's add an index to the word Sentence
    index_list.append(f'Sentence_{i}')

# print(index_list)

# now you can use pd.DataFrame()
bow_cv_df = pd.DataFrame(data = bow_cv.toarray(),
                         index = index_list,
                         columns = tokens)
bow_cv_df

### TF-IDF

#### Method 1. CountVectorizer + TfidfTransformer


1) Calculation of TF, term frequency, word frequency


In [None]:
# We have already completed this step above
bow_cv

2) Now you need to calculate the IDF

In [None]:
# importing TfidfTransformer (CountVectorizer has already been imported)
from sklearn.feature_extraction.text import TfidfTransformer

# creating an object of the Tf idf Transformer class
tfidf_trans = TfidfTransformer(smooth_idf = True, use_idf = True)

# and calculate the IDF of words
tfidf_trans.fit(bow_cv)

# putting the result in a dataframe
df_idf = pd.DataFrame(tfidf_trans.idf_, index = tokens, columns = ["idf_weights"])
#df_idf

3) That leaves TFxIDF

In [None]:
# calculate TF-IDF (in fact, multiply TF by IDF)
tf_idf_vector = tfidf_trans.transform(bow_cv)
tf_idf_vector

In [None]:
# Now we can look at the TF-IDF score for a specific word in a specific document

# to do this, we will convert the csr matrix into a regular Numpy array
df_tfidf = pd.DataFrame(tf_idf_vector.toarray(), columns = vectorizer.get_feature_names_out())

# and transpose it (write the columns as rows)
print(df_tfidf.T)

In [None]:
# let's see how many words this method has left after processing
df_tfidf.T.shape

#### Method 2. TfidfVectorizer

In [None]:
# importing the class TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# creating an object of the TfidfVectorizer class
tfIdfVectorizer = TfidfVectorizer(use_idf = True, stop_words= 'english')

# we immediately calculate the TF-IDF of words
tfIdf = tfIdfVectorizer.fit_transform(sentences)
tfIdf

In [None]:
# you can see which words are left after filtering
print(tfIdfVectorizer.get_feature_names_out())

In [None]:
# You can also view the IDF of the words
tfIdfVectorizer.idf_

In [None]:
# through a dataframe, we can link words and their IDF

df_idf = pd.DataFrame(tfIdfVectorizer.idf_, index = tfIdfVectorizer.get_feature_names_out(), columns = ['idf_weights'])
# df_idf.sort_values(by = 'idf_weights', ascending = False)

In [None]:
# number of sentences (documents) x number of words
tfIdf.shape

Calculating the TF-IDF value for each word in each text


In [None]:
# and finally, the very meaning of TF-IDF for a specific word in a specific document
# the more unique it is for a particular document, the higher the indicator
df_tfidf = pd.DataFrame(tfIdf.toarray(), columns = tfIdfVectorizer.get_feature_names_out())
print(df_tfidf.T)

Calculating the average TF-IDF value for each word across all texts

In [None]:
# calculate the arithmetic mean of the rows (axis = 0)
tfIdf.mean(axis = 0)

In [None]:
# converting the matrix to a Numpy array
np.asarray(tfIdf.mean(axis = 0))

In [None]:
# see how many dimensions there are
np.asarray(tfIdf.mean(axis = 0)).shape

In [None]:
# remove the second dimension
np.asarray(tfIdf.mean(axis = 0)).ravel()

In [None]:
# we look at the dimension again
np.asarray(tfIdf.mean(axis = 0)).ravel().shape

In [None]:
# convert it to a list
mean_weights = np.asarray(tfIdf.mean(axis = 0)).ravel().tolist()
mean_weights

In [None]:
# creating a dataframe from the dictionary
mean_weights_df = pd.DataFrame({'term': tfIdfVectorizer.get_feature_names_out(), 'mean_weights': mean_weights})

# we sort 10 words in descending order with the maximum average TF-IDF
mean_weights_df.sort_values(by = 'mean_weights', ascending = False).reset_index(drop = True).head(10)

The cosine distance between text vectors


In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from scipy.sparse.csr import csr_matrix
# import numpy as np
# import pandas as pd

In [None]:
# let's take two texts (sentences) for simplicity
text1 = 'all the world’s a stage, and all the men and women merely players'
text2 = 'you must be the change you wish to see in the world'

# combine them into a case
corpus = [text1, text2]

# creating an object of the TfidfVectorizer class
tfIdfVectorizer = TfidfVectorizer(use_idf = True, stop_words = 'english')

# at the output, we get two vectors, where each value is the weight (tf-idf indicator) of the word
X = tfIdfVectorizer.fit_transform(corpus)

# converting the data to a Numpy array
print(X.toarray())

In [None]:
# for convenience, we can look at the weights in the dataframe format
vectors_df = pd.DataFrame(data = X.toarray(),
                          index = ['vector1', 'vector2'],
                          columns = tfIdfVectorizer.get_feature_names_out())
vectors_df

Let me remind you of the cosine distance formula
:

$$ \cos \theta ={\mathbf {a} \cdot \mathbf {b} \over \|\mathbf {a} \|\|\mathbf {b} \|} $$

In [None]:
# take the vectors separately
vector1 = X.toarray()[0]
vector2 = X.toarray()[1]

In [None]:
# first, we perform the operations in the numerator of the formula
numerator = np.dot(vector1, vector2)

In [None]:
# now let's take the denominator and
# (1) calculate the lengths (by and large, this is the Pythagorean theorem)
vector1Len = np.linalg.norm(vector1)
vector2Len = np.linalg.norm(vector2)

# (2) multiply them
denominator = vector1Len * vector2Len

In [None]:
# see what the cosine of the angle between the vectors is
cosine = numerator/denominator
cosine

In [None]:
# find the angle in degrees by its cosine
# to do this, first calculate the angle in radians
angle_radians = np.arccos(cosine)

# then in degrees
angle_degrees = angle_radians * 360/2/np.pi
round(angle_degrees, 2)

#### Cluster text analysis

In [None]:
# There are two topics in the text below: data science and the Bolshoi Theater (source: Wikipedia)
text = '''
Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from noisy, structured and unstructured data.
It applies knowledge and actionable insights from data across a broad range of application domains.
Data science is related to data mining, machine learning and big data.
The Bolshoi Theatre is a historic theatre in Moscow, Russia.
It was originally designed by architect Joseph Bové, which holds ballet and opera performances.
Before the October Revolution it was a part of the Imperial Theatres of the Russian Empire along with Maly Theatre in Moscow and a few theatres in Saint Petersburg.
Data science is a concept to unify statistics, data analysis, informatics, and their related methods in order to understand and analyze actual phenomena with data.
However, data science is different from computer science and information science.
The main building of the theatre, rebuilt and renovated several times during its history, is a landmark of Moscow and Russia.
On 28 October 2011, the Bolshoi re-opened after an extensive six-year renovation.
'''

In [None]:
# creating a list of suggestions
corpus = []

# to do this, in the for loop, we will go through the text, dividing it by the newline \n
for line in text.split('\n'):

  # if the string is not empty (i.e. True)
  if line:

    # we translate it to lowercase
    line = line.lower()
    # and add it to the list
    corpus.append(line)

In [None]:
corpus

In [None]:
# TfidfVectorizer example
tfIdfVectorizer = TfidfVectorizer(use_idf = True, stop_words= 'english')

# at the output, we get sentence vectors
X = tfIdfVectorizer.fit_transform(corpus)
# print(X.toarray())

In [None]:
# importing the k-means algorithm from the sklearn library
from sklearn.cluster import KMeans

# since we know that there are two topics, we use the hyperparameter k = 2
kmeans = KMeans(n_clusters = 2, n_init = 10, random_state = 42).fit(X)

In [None]:
# take the new proposals, one from the field of Data Science and two about the Bolshoi Theater
prediction = ['Many statisticians, including Nate Silver, have argued that data science is not a new field, but rather another name for statistics.',
              'Urusov set up the theatre in collaboration with English tightrope walker Michael Maddox.',
              'Until the mid-1990s, most foreign operas were sung in Russian, but Italian and other languages have been heard more frequently on the Bolshoi stage in recent years.']

# let's apply two models, first we will create vectors of new sentences (TfidfVectorizer.transform),
# then we assign them to one of the clusters (kmeans.predict)
kmeans.predict(tfIdfVectorizer.transform(prediction))