# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [18]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: c:\Users\diego\OneDrive\Documentos\GitHub\data_analytics_zhaw\Week_11


## Defining documents

In [None]:
# Defining documents (=sentences)
d1 = 'The squirrel runs up the tree.'
d2 = 'Today i had an apple for breakfast.'
d3 = 'Both squirrels and apples live in trees.'
d4 = 'Squirrels dont eat apples as breakfast, but rather nuts'

corpus_01 = d1 + ' ' + d2 + ' ' + d3 + ' ' + d4
corpus_01

'The squirrel runs up the tree. Today i had an apple for breakfast. Both squirrels and apples live in trees. Squirrels dont eat apples as breakfast, but rather nuts'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [20]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the squirrel runs up the tree. today i had an apple for breakfast. both squirrels and apples live in trees. squirrels dont eat apples as breakfast, but rather nuts'

### Removing punctuation

In [21]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the squirrel runs up the tree today i had an apple for breakfast both squirrels and apples live in trees squirrels dont eat apples as breakfast but rather nuts'

### Tokenize text & removal of stopwords

In [22]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'for', 'mustn', 'that', 'to', 'this', 'or', 'no', 'hers', "you're", "you'll", 'how', 'weren', 'over', 'ourselves', 'more', 'mightn', 'being', 'too', 'now', "hadn't", 'own', 'her', 'any', "mustn't", 'him', 'below', 'few', 'am', 'about', 'not', 'd', 'hadn', "wasn't", 'again', "you'd", 'does', 'won', "wouldn't", 'are', 'from', 'when', 'where', 'and', 'of', 'in', 'very', "needn't", 'ain', 'yours', 'couldn', 'shan', 'on', 'during', 'an', 'hasn', 'my', 'm', 'why', 'before', 'aren', "isn't", 'he', 'those', 'these', 'their', "doesn't", 'doing', 'most', "weren't", 'between', 'we', "mightn't", 'off', 'who', 'themselves', 'be', 'yourself', 'into', 'by', 'ours', 'been', 've', 'were', 'such', 'each', 'further', 'myself', 'had', 'up', 'having', 'have', 'same', 'some', 'yourselves', "couldn't", 'until', 'do', "aren't", 'can', 'if', 'under', "should've", 'both', "hasn't", 'after', 's', 'while', 'only', "she's", 'its', 'me', 'so', 'a', 'with', 'them', 're', "you've", 'then',

In [23]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['squirrel', 'runs', 'tree', 'today', 'apple', 'breakfast', 'squirrels', 'apples', 'live', 'trees', 'squirrels', 'dont', 'eat', 'apples', 'breakfast', 'rather', 'nuts']

### Lemmatization

In [24]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['squirrel', 'runs', 'tree', 'today', 'apple', 'breakfast', 'squirrels', 'apples', 'live', 'trees', 'squirrels', 'dont', 'eat', 'apples', 'breakfast', 'rather', 'nuts'] 

After lemmatization:
['squirrel', 'run', 'tree', 'today', 'apple', 'breakfast', 'squirrels', 'apples', 'live', 'tree', 'squirrels', 'dont', 'eat', 'apples', 'breakfast', 'rather', 'nut']

## Redefine the text corpus (pre-processed)

In [33]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['squirrel run tree', 
          'today apple breakfast', 
          'squirrels apples live tree',
          'squirrels dont eat apples breakfast rather nut']

## Document-term matrix with ngram_range=(1,1)

In [34]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   apple  apples  breakfast  dont  eat  live  nut  rather  run  squirrel  \
0      0       0          0     0    0     0    0       0    1         1   
1      1       0          1     0    0     0    0       0    0         0   
2      0       1          0     0    0     1    0       0    0         0   
3      0       1          1     1    1     0    1       1    0         0   

   squirrels  today  tree  
0          0      0     1  
1          0      1     0  
2          1      0     1  
3          1      0     0  


## Document-term matrix with ngram_range=(2,2)

In [35]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   apple breakfast  apples breakfast  apples live  breakfast rather  dont eat  \
0                0                 0            0                 0         0   
1                1                 0            0                 0         0   
2                0                 0            1                 0         0   
3                0                 1            0                 1         1   

   eat apples  live tree  rather nut  run tree  squirrel run  \
0           0          0           0         1             1   
1           0          0           0         0             0   
2           0          1           0         0             0   
3           1          0           1         0             0   

   squirrels apples  squirrels dont  today apple  
0                 0               0            0  
1                 0               0            1  
2                 1               0            0  
3                 0               1            0

## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [36]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 13 

The words in the corpus: 
 {'apple', 'nut', 'run', 'tree', 'squirrel', 'eat', 'live', 'today', 'dont', 'squirrels', 'breakfast', 'apples', 'rather'}

Term Frequency (TF):
    apple     nut     run    tree  squirrel     eat  live   today    dont  \
0  0.0000  0.0000  0.3333  0.3333    0.3333  0.0000  0.00  0.0000  0.0000   
1  0.3333  0.0000  0.0000  0.0000    0.0000  0.0000  0.00  0.3333  0.0000   
2  0.0000  0.0000  0.0000  0.2500    0.0000  0.0000  0.25  0.0000  0.0000   
3  0.0000  0.1429  0.0000  0.0000    0.0000  0.1429  0.00  0.0000  0.1429   

   squirrels  breakfast  apples  rather  
0     0.0000     0.0000  0.0000  0.0000  
1     0.0000     0.3333  0.0000  0.0000  
2     0.2500     0.0000  0.2500  0.0000  
3     0.1429     0.1429  0.1429  0.1429  


### Inverse Document Frequency (IDF)

In [37]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
          apple:     0.6021
            nut:     0.6021
            run:     0.6021
           tree:      0.301
       squirrel:     0.6021
            eat:     0.6021
           live:     0.6021
          today:     0.6021
           dont:     0.6021
      squirrels:      0.301
      breakfast:      0.301
         apples:      0.301
         rather:     0.6021


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [38]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
    apple    nut     run    tree  squirrel    eat    live   today   dont  \
0  0.0000  0.000  0.2007  0.1003    0.2007  0.000  0.0000  0.0000  0.000   
1  0.2007  0.000  0.0000  0.0000    0.0000  0.000  0.0000  0.2007  0.000   
2  0.0000  0.000  0.0000  0.0752    0.0000  0.000  0.1505  0.0000  0.000   
3  0.0000  0.086  0.0000  0.0000    0.0000  0.086  0.0000  0.0000  0.086   

   squirrels  breakfast  apples  rather  
0     0.0000     0.0000  0.0000   0.000  
1     0.0000     0.1003  0.0000   0.000  
2     0.0752     0.0000  0.0752   0.000  
3     0.0430     0.0430  0.0430   0.086  


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [41]:
text = '''Political forces keep avoiding the subject when questioned about
the severity of the incident and how it may affect GDP and other 
financial indicators in the near future.'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('Political', 'JJ', 'O'),
 ('forces', 'NNS', 'O'),
 ('keep', 'VB', 'O'),
 ('avoiding', 'VBG', 'O'),
 ('the', 'DT', 'B-NP'),
 ('subject', 'NN', 'I-NP'),
 ('when', 'WRB', 'O'),
 ('questioned', 'VBN', 'O'),
 ('about', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('severity', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('incident', 'NN', 'I-NP'),
 ('and', 'CC', 'O'),
 ('how', 'WRB', 'O'),
 ('it', 'PRP', 'O'),
 ('may', 'MD', 'O'),
 ('affect', 'VB', 'O'),
 ('GDP', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('other', 'JJ', 'O'),
 ('financial', 'JJ', 'O'),
 ('indicators', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('near', 'JJ', 'I-NP'),
 ('future', 'NN', 'I-NP'),
 ('.', '.', 'O')]


The POS tags results for the first five words are as follows:
Political -> JJ, which means this word is classified as an adjective
forces -> NNS, which classifies this word as a plural noun
keep -> VB, thus the word is considered to be a verb in base form 
avoiding -> VBG, this represents a verb in gerund form (present continuous)
the -> DT, meaning the word 'the' is called a determiner (article the)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [40]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
NT
Windows | 10
Datetime: 2024-12-10 07:39:37
Python Version: 3.10.5
-----------------------------------
