In [55]:
import re
import nltk
from nltk import pos_tag
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Asif
[nltk_data]     Sayyed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Asif
[nltk_data]     Sayyed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Asif
[nltk_data]     Sayyed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [56]:
# defining the corpus
corpus = ("""In statistics, a rank correlation is any of several statistics that measure an 
          ordinal association—the relationship between rankings of different ordinal variables 
          or different rankings of the same variable, where a "ranking" is the assignment of the 
          ordering labels "first", "second", "third", etc. to different observations of a particular variable. 
          A rank correlation coefficient measures the degree of similarity between two rankings, and can be used to 
          assess the significance of the relation between them. For example, two common nonparametric methods of 
          significance that use rank correlation are the Mann–Whitney U test and the Wilcoxon signed-rank test.""")

# Text Cleaning
- converting text to lowercase
- removing whitespace and non-textual characters
- removing digits

In [57]:
# converting text to lowercase
corpus = corpus.lower()
# df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x) # for dataframe
print(corpus)

in statistics, a rank correlation is any of several statistics that measure an 
          ordinal association—the relationship between rankings of different ordinal variables 
          or different rankings of the same variable, where a "ranking" is the assignment of the 
          ordering labels "first", "second", "third", etc. to different observations of a particular variable. 
          a rank correlation coefficient measures the degree of similarity between two rankings, and can be used to 
          assess the significance of the relation between them. for example, two common nonparametric methods of 
          significance that use rank correlation are the mann–whitney u test and the wilcoxon signed-rank test.


In [58]:
# removing https
pattern = re.compile(r'https?://\S+')
corpus = pattern.sub('', corpus)
print(corpus)

in statistics, a rank correlation is any of several statistics that measure an 
          ordinal association—the relationship between rankings of different ordinal variables 
          or different rankings of the same variable, where a "ranking" is the assignment of the 
          ordering labels "first", "second", "third", etc. to different observations of a particular variable. 
          a rank correlation coefficient measures the degree of similarity between two rankings, and can be used to 
          assess the significance of the relation between them. for example, two common nonparametric methods of 
          significance that use rank correlation are the mann–whitney u test and the wilcoxon signed-rank test.


In [59]:
# removing non-textual characters
pattern = re.compile(r'[^\w\s]')
corpus = pattern.sub('', corpus)
print(corpus)

in statistics a rank correlation is any of several statistics that measure an 
          ordinal associationthe relationship between rankings of different ordinal variables 
          or different rankings of the same variable where a ranking is the assignment of the 
          ordering labels first second third etc to different observations of a particular variable 
          a rank correlation coefficient measures the degree of similarity between two rankings and can be used to 
          assess the significance of the relation between them for example two common nonparametric methods of 
          significance that use rank correlation are the mannwhitney u test and the wilcoxon signedrank test


In [60]:
# removing digits
pattern = re.compile(r'\d')
corpus = pattern.sub('', corpus)
print(corpus)

in statistics a rank correlation is any of several statistics that measure an 
          ordinal associationthe relationship between rankings of different ordinal variables 
          or different rankings of the same variable where a ranking is the assignment of the 
          ordering labels first second third etc to different observations of a particular variable 
          a rank correlation coefficient measures the degree of similarity between two rankings and can be used to 
          assess the significance of the relation between them for example two common nonparametric methods of 
          significance that use rank correlation are the mannwhitney u test and the wilcoxon signedrank test


# Tokenization
Tokenization is the process of breaking down large blocks of text such as paragraphs and sentences into smaller, more manageable units.

`'I see a cup of coffee'` -> `'I'`, `'see'`, `'a'`, `'cup'`, `'of'`, `'coffee'`

In [61]:
tokens = corpus.split()
print(tokens)

['in', 'statistics', 'a', 'rank', 'correlation', 'is', 'any', 'of', 'several', 'statistics', 'that', 'measure', 'an', 'ordinal', 'associationthe', 'relationship', 'between', 'rankings', 'of', 'different', 'ordinal', 'variables', 'or', 'different', 'rankings', 'of', 'the', 'same', 'variable', 'where', 'a', 'ranking', 'is', 'the', 'assignment', 'of', 'the', 'ordering', 'labels', 'first', 'second', 'third', 'etc', 'to', 'different', 'observations', 'of', 'a', 'particular', 'variable', 'a', 'rank', 'correlation', 'coefficient', 'measures', 'the', 'degree', 'of', 'similarity', 'between', 'two', 'rankings', 'and', 'can', 'be', 'used', 'to', 'assess', 'the', 'significance', 'of', 'the', 'relation', 'between', 'them', 'for', 'example', 'two', 'common', 'nonparametric', 'methods', 'of', 'significance', 'that', 'use', 'rank', 'correlation', 'are', 'the', 'mannwhitney', 'u', 'test', 'and', 'the', 'wilcoxon', 'signedrank', 'test']


# Stop-word removal
Stopwords refer to the most commonly occurring words in any natural language.

For the purpose of analyzing text data and building NLP models, these stopwords might not add much value to the meaning of the document. Therefore, removing stopwords can help us to focus on the most important information in the text and improve the accuracy of our analysis.

__Before:__ `[This', 'is', 'an', 'example', 'for', 'stop', 'word', 'removal']` <br>
<br>
__After:__ `['This', 'example', 'stop', 'word', 'removal']`

In [62]:
stop_words = set(stopwords.words('english'))
words = [word for word in tokens if word not in stop_words]
print(words)

['statistics', 'rank', 'correlation', 'several', 'statistics', 'measure', 'ordinal', 'associationthe', 'relationship', 'rankings', 'different', 'ordinal', 'variables', 'different', 'rankings', 'variable', 'ranking', 'assignment', 'ordering', 'labels', 'first', 'second', 'third', 'etc', 'different', 'observations', 'particular', 'variable', 'rank', 'correlation', 'coefficient', 'measures', 'degree', 'similarity', 'two', 'rankings', 'used', 'assess', 'significance', 'relation', 'example', 'two', 'common', 'nonparametric', 'methods', 'significance', 'use', 'rank', 'correlation', 'mannwhitney', 'u', 'test', 'wilcoxon', 'signedrank', 'test']


# Stemming and Lemmatization
![Stemming vs Lemmatization](1_OYRDkAUMOWQ1Qx5T_4p8hQ.webp)

There are various algorithms that can be used for stemming,

- Porter Stemmer algorithm
- Snowball Stemmer algorithm
- Lovins Stemmer algorithm