In [1]:
import sys
import re
import numpy as np
import unicodedata
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Raw titles carrying stray leading/trailing whitespace and periods to clean up.
text_data = [
    "  Interrobang. By Aishwarya Henriette    ",
    "Parking And Going. By Karl Gautier",
    "  Today Is the night. By Jarek Prakash   ",
]

In [3]:
# Trim leading and trailing whitespace from every entry.
# NOTE: the misspelled variable name is kept because the next cell reads it.
strip_whitespase = list(map(str.strip, text_data))

In [4]:
# Drop every "." from the whitespace-stripped strings, then display the result.
remove_periods = [cleaned.replace(".", "") for cleaned in strip_whitespase]
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is the night By Jarek Prakash']

In [5]:
def capitalizer(string: str) -> str:
    """Return ``string`` with every cased character converted to upper case."""
    upper_cased = string.upper()
    return upper_cased

In [6]:
# Upper-case each cleaned title.
list(map(capitalizer, remove_periods))

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [7]:
def replace_letters_with_X(string: str) -> str:
    """Return ``string`` with each ASCII letter (a-z, A-Z) replaced by "X".

    All other characters (digits, whitespace, punctuation, non-ASCII letters)
    are left unchanged.
    """
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub("X", string)

In [8]:
# Mask the letters of each cleaned title.
list(map(replace_letters_with_X, remove_periods))

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

In [9]:
# Sample HTML fragment: a <div class='full_name'> whose text is split between a bold <span> and bare text.
html = """
       <div class='full_name'><span style='font-weight:bold'>
       Masego</span> Azra</div>
       """

In [10]:
# Parse the fragment using the "html.parser" backend.
soup = BeautifulSoup(markup=html, features="html.parser")

In [11]:
# Locate the <div class="full_name"> element and show its text without surrounding whitespace.
full_name_div = soup.find("div", attrs={"class": "full_name"})
full_name_div.get_text().strip()

'Masego Azra'

In [12]:
# Short social-media style messages with heavy punctuation.
text_data = [
    "Hi!!! I. Love. This. Song...",
    "10000% Agree!!! #LoveIT",
    "Right?!?!",
]

In [13]:
# Translation table mapping every Unicode punctuation code point (category "P*")
# to None, so that str.translate() deletes those characters.
# sys.maxunicode is itself the largest valid code point, so the range has to end
# at sys.maxunicode + 1 to cover the whole code space.
punctuation = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith("P")
)

In [14]:
# Delete every punctuation character from each message via the translation table.
[message.translate(punctuation) for message in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

In [15]:
# Whitespace-tokenize a lower-case sample sentence.
sentence = "i am going to go to the store and park"
tokenized_words = sentence.split()

In [16]:
# Make sure the NLTK stop-word corpus is available locally, then load the English list.
nltk.download("stopwords")
stop_words = stopwords.words("english")  # pass a different language name to load that language's stop-word list

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# The NLTK stop-word list is lower case, so compare the lower-cased token;
# otherwise capitalized stop words such as "The" or "I" would slip through.
# Tokens that are kept retain their original casing.
[word for word in tokenized_words if word.lower() not in stop_words]

['going', 'go', 'store', 'park']

In [18]:
# Inflected verb forms used as lemmatization input.
tokenized_words = [
    "go", "went", "gone",
    "am", "are", "is", "was", "were",
]

In [19]:
# Make sure the WordNet data is available, then lemmatize each token as a verb (pos="v").
nltk.download("wordnet")
lemmatizer = nltk.WordNetLemmatizer()
[lemmatizer.lemmatize(token, pos="v") for token in tokenized_words]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['go', 'go', 'go', 'be', 'be', 'be', 'be', 'be']

In [20]:
# Sample sentence used as input for tokenization and part-of-speech tagging.
text_data = "Chris loved outdoor running"

In [21]:
# Download the tokenizer and tagger resources, tokenize the sentence, then tag each token.
for resource in ("punkt", "averaged_perceptron_tagger"):
    nltk.download(resource)
tokens = nltk.word_tokenize(text_data)
text_tagged = nltk.pos_tag(tokens)
text_tagged

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [22]:
# Three short Russian-language documents used as the vectorizer corpus.
documents = [
    "Бразилия - моя любовь. Бразилия!",
    "Швеция - лучше",
    "Германия бьёт обоих",
]
text_data = np.array(documents)

In [23]:
# Create a CountVectorizer with default settings, then fit it on the documents
# and build the bag-of-words matrix in a single fit_transform call.
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

In [24]:
# Display the sparse-matrix summary (shape, dtype, number of stored elements).
bag_of_words

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [25]:
# Convert the sparse matrix to a dense array to inspect the individual values.
bag_of_words.toarray()

array([[2, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 0, 0, 0, 1, 0]], dtype=int64)

In [26]:
# Create a TfidfVectorizer with default settings, fit it on the same documents
# with fit_transform, and display the resulting matrix summary.
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
feature_matrix

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [27]:
# Convert the sparse feature matrix to a dense array to inspect the individual weights.
feature_matrix.toarray()

array([[0.81649658, 0.        , 0.        , 0.        , 0.40824829,
        0.40824829, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.70710678],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.        ,
        0.        , 0.57735027, 0.        ]])

In [28]:
# Show the fitted vocabulary: a mapping from each term to its integer index.
tfidf.vocabulary_

{'бразилия': 0,
 'моя': 5,
 'любовь': 4,
 'швеция': 7,
 'лучше': 3,
 'германия': 2,
 'бьёт': 1,
 'обоих': 6}