[Reference](https://medium.com/@chyun55555/text-analytics-in-python-text-preprocessing-and-feature-vectorization-e04a3e89aefc)

In [1]:
import nltk
from nltk import sent_tokenize
# Tokenizer models used by sent_tokenize / word_tokenize.
# Newer NLTK releases (3.9+) load the 'punkt_tab' resource instead of the
# pickled 'punkt' models, so fetch both to keep the notebook runnable on old
# and new NLTK versions. 'punkt' stays last so the cell's displayed value is
# still the result of that call.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Three-sentence news excerpt, assembled from adjacent string literals so the
# resulting text is a single line with one space between sentences.
sample_text = (
    "A gallery of Lionel Messi celebrating Argentina's World Cup win has become the most-liked Instagram post ever. "
    "Hours after posting it, the footballer received more than 65 million likes - and the number is constantly rising. "
    "Argentina defeated France on penalties in Sunday's final in Qatar - their first World Cup triumph in 36 years."
)

print('-----sample text-----\n', sample_text, '\n')

# Break the paragraph into individual sentences.
sentences = sent_tokenize(sample_text)

print('-----tokenized sentences-----\n', sentences)
# Show the container type and how many sentences were found.
print(type(sentences), len(sentences))

-----sample text-----
 A gallery of Lionel Messi celebrating Argentina's World Cup win has become the most-liked Instagram post ever. Hours after posting it, the footballer received more than 65 million likes - and the number is constantly rising. Argentina defeated France on penalties in Sunday's final in Qatar - their first World Cup triumph in 36 years. 

-----tokenized sentences-----
 ["A gallery of Lionel Messi celebrating Argentina's World Cup win has become the most-liked Instagram post ever.", 'Hours after posting it, the footballer received more than 65 million likes - and the number is constantly rising.', "Argentina defeated France on penalties in Sunday's final in Qatar - their first World Cup triumph in 36 years."]
<class 'list'> 3


In [3]:
from nltk import word_tokenize

# Word-level tokenization of one sentence. The printed list shows that the
# possessive "'s" and the final period come out as separate tokens.
sentence = "A gallery of Lionel Messi celebrating Argentina's World Cup win has become the most-liked Instagram post ever."
words = word_tokenize(text=sentence)
print(words)

['A', 'gallery', 'of', 'Lionel', 'Messi', 'celebrating', 'Argentina', "'s", 'World', 'Cup', 'win', 'has', 'become', 'the', 'most-liked', 'Instagram', 'post', 'ever', '.']


In [4]:
def tokenize(document):
    """Split ``document`` into sentences, then each sentence into word tokens.

    Args:
        document: Text to tokenize.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``; each
        entry is the list of tokens ``word_tokenize`` produced for it.
    """
    tokens_per_sentence = []
    for sentence_text in sent_tokenize(document):
        tokens_per_sentence.append(word_tokenize(sentence_text))
    return tokens_per_sentence

print(tokenize(sample_text))

[['A', 'gallery', 'of', 'Lionel', 'Messi', 'celebrating', 'Argentina', "'s", 'World', 'Cup', 'win', 'has', 'become', 'the', 'most-liked', 'Instagram', 'post', 'ever', '.'], ['Hours', 'after', 'posting', 'it', ',', 'the', 'footballer', 'received', 'more', 'than', '65', 'million', 'likes', '-', 'and', 'the', 'number', 'is', 'constantly', 'rising', '.'], ['Argentina', 'defeated', 'France', 'on', 'penalties', 'in', 'Sunday', "'s", 'final', 'in', 'Qatar', '-', 'their', 'first', 'World', 'Cup', 'triumph', 'in', '36', 'years', '.']]


In [5]:
"""Download stopwords from NLTK"""
import nltk
nltk.download('stopwords')

# Load each language's list once and reuse it for both the preview and the count.
english_stopwords = nltk.corpus.stopwords.words('english')
spanish_stopwords = nltk.corpus.stopwords.words('spanish')

print('-----Examples of Stopwords in English-----\n', english_stopwords[:10], '\n')
print(f"There are {len(english_stopwords)} number of stopwords in English\n\n")

# The header names the language of the list printed with it (Spanish here).
print('-----Examples of Stopwords in Spanish-----\n', spanish_stopwords[:10], '\n')
print(f"There are {len(spanish_stopwords)} number of stopwords in Spanish")

-----Examples of Stopwords in English-----
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"] 

There are 179 number of stopwords in English


-----Examples of Stopwords in English-----
 ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se'] 

There are 313 number of stopwords in Spanish


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
text = "Its invasion in February managed to startle in every way. To those who thought Moscow was sane enough to not attempt such a massive and foolhardy undertaking. To those who felt the Russian military would waltz across a land of 40 million people and switch to clean-up operations within 10 days. And to those who felt they had the technical and intelligence prowess to do more than just randomly bombard civilian areas with ageing artillery; that the Kremlin’s military had evolved from the 90s levelling of Grozny in Chechnya."
stopwords = nltk.corpus.stopwords.words('english') #stopwords
# Set copy for membership tests: average constant-time lookup instead of
# scanning the whole list for every token.
stopword_set = set(stopwords)

words = tokenize(text) #tokenize() is the function we created previously

# Lowercase each token once (the stop-word list is lowercase), then keep the
# tokens that are not stop words. Punctuation tokens are not filtered here.
# Generator/comprehension scopes keep the iteration variables local, so they
# do not overwrite notebook-level names.
lowercased_tokens = (word.lower() for sentence_tokens in words for word in sentence_tokens)
all_tokens = [token for token in lowercased_tokens if token not in stopword_set]

print(all_tokens)

['invasion', 'february', 'managed', 'startle', 'every', 'way', '.', 'thought', 'moscow', 'sane', 'enough', 'attempt', 'massive', 'foolhardy', 'undertaking', '.', 'felt', 'russian', 'military', 'would', 'waltz', 'across', 'land', '40', 'million', 'people', 'switch', 'clean-up', 'operations', 'within', '10', 'days', '.', 'felt', 'technical', 'intelligence', 'prowess', 'randomly', 'bombard', 'civilian', 'areas', 'ageing', 'artillery', ';', 'kremlin', '’', 'military', 'evolved', '90s', 'levelling', 'grozny', 'chechnya', '.']


In [7]:
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

# Each tuple is one family of related word forms; the stems of a family are
# printed together on a single line, separated by spaces.
word_families = [
    ('printing', 'printer', 'printed'),
    ('debating', 'debates', 'debated'),
    ('happier', 'happiest'),
    ('earlier', 'earliest'),
]
for family in word_families:
    print(*[stemmer.stem(form) for form in family])

print print print
deb deb deb
happy happiest
ear earliest


In [8]:
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the 'wordnet' corpus; presumably the data the WordNetLemmatizer
# imported above reads — confirm against the NLTK docs.
nltk.download('wordnet')
# NOTE(review): 'omw-1.4' looks like a companion resource for wordnet; confirm
# it is still needed with the NLTK version in use.
# As the last expression in the cell, this call's return value is what the
# notebook displays.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
lem = WordNetLemmatizer()

# (word, part-of-speech tag) pairs; the lemmas of each inner list are printed
# together on one line.
lemma_examples = [
    [('debating', 'v'), ('debating', 'n')],
    [('earliest', 'a'), ('earlier', 'a')],
]
for pairs in lemma_examples:
    print(*[lem.lemmatize(token, tag) for token, tag in pairs])

debate debating
early early


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Toy corpus: five short documents.
text = [
    'hello world, welcome to the world of python',
    'python is world',
    'python is difficult',
    'python is not difficult at all',
    'i do not agree',
]

# Fit the vectorizer on the corpus and get the per-document term counts.
cv = CountVectorizer()
count_matrix = cv.fit_transform(text)

# Convert the counts to an array and label each column with its term.
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data=count_array, columns=cv.get_feature_names_out())

count_df

Unnamed: 0,agree,all,at,difficult,do,hello,is,not,of,python,the,to,welcome,world
0,0,0,0,0,0,1,0,0,1,1,1,1,1,2
1,0,0,0,0,0,0,1,0,0,1,0,0,0,1
2,0,0,0,1,0,0,1,0,0,1,0,0,0,0
3,0,1,1,1,0,0,1,1,0,1,0,0,0,0
4,1,0,0,0,1,0,0,1,0,0,0,0,0,0


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Same five-document toy corpus, this time weighted with TF-IDF.
text = [
    'hello world, welcome to the world of python',
    'python is world',
    'python is difficult',
    'python is not difficult at all',
    'i do not agree',
]

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(text)

# Convert the weights to an array and label each column with its term.
tfidf_array = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(data=tfidf_array, columns=tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,agree,all,at,difficult,do,hello,is,not,of,python,the,to,welcome,world
0,0.0,0.0,0.0,0.0,0.0,0.355311,0.0,0.0,0.355311,0.200176,0.355311,0.355311,0.355311,0.573325
1,0.0,0.0,0.0,0.0,0.0,0.0,0.562638,0.0,0.0,0.473309,0.0,0.0,0.0,0.677803
2,0.0,0.0,0.0,0.677803,0.0,0.0,0.562638,0.0,0.0,0.473309,0.0,0.0,0.0,0.0
3,0.0,0.495819,0.495819,0.400024,0.0,0.0,0.332056,0.400024,0.0,0.279336,0.0,0.0,0.0,0.0
4,0.614189,0.0,0.0,0.0,0.614189,0.0,0.0,0.495524,0.0,0.0,0.0,0.0,0.0,0.0
