### Tokenization ###

In [33]:
corpus= """To generate a paragraph on nature, you can use one of many AI-powered paragraph generator tools available online. 
These tools use artificial intelligence and natural language processing to create original content based on your specifications, such as topic, tone, and length.
The transformer architecture is a deep learning model that uses the self-attention mechanism to process sequential data, like text, without relying on recurrence. 
This architecture, introduced in the 2017 paper "Attention Is All You Need", is structured around an encoder and a decoder, which efficiently process an input sequence and generate an output sequence by capturing relationships between elements regardless of their distance. """

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

sentences = sent_tokenize(corpus)
words = word_tokenize(corpus)
print(sentences)
print(words)
type(sentences)
type(words)

['To generate a paragraph on nature, you can use one of many AI-powered paragraph generator tools available online.', 'These tools use artificial intelligence and natural language processing to create original content based on your specifications, such as topic, tone, and length.']
['To', 'generate', 'a', 'paragraph', 'on', 'nature', ',', 'you', 'can', 'use', 'one', 'of', 'many', 'AI-powered', 'paragraph', 'generator', 'tools', 'available', 'online', '.', 'These', 'tools', 'use', 'artificial', 'intelligence', 'and', 'natural', 'language', 'processing', 'to', 'create', 'original', 'content', 'based', 'on', 'your', 'specifications', ',', 'such', 'as', 'topic', ',', 'tone', ',', 'and', 'length', '.']


list

In [3]:
from nltk.tokenize import wordpunct_tokenize

word_punct = wordpunct_tokenize(corpus)
print(word_punct)

['To', 'generate', 'a', 'paragraph', 'on', 'nature', ',', 'you', 'can', 'use', 'one', 'of', 'many', 'AI', '-', 'powered', 'paragraph', 'generator', 'tools', 'available', 'online', '.', 'These', 'tools', 'use', 'artificial', 'intelligence', 'and', 'natural', 'language', 'processing', 'to', 'create', 'original', 'content', 'based', 'on', 'your', 'specifications', ',', 'such', 'as', 'topic', ',', 'tone', ',', 'and', 'length', '.']


### Stemming ###

In [4]:
words=['eat','eating','eaten','eats','ran','running','runs','runner','ate','bats','batted','battering','dance','danced','dancing']

from nltk.stem import PorterStemmer
ps = PorterStemmer()

for w in words:
    print(f"{w} --> {ps.stem(w)}")

eat --> eat
eating --> eat
eaten --> eaten
eats --> eat
ran --> ran
running --> run
runs --> run
runner --> runner
ate --> ate
bats --> bat
batted --> bat
battering --> batter
dance --> danc
danced --> danc
dancing --> danc


In [5]:
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|ed$|s$|able$|er$')

for w in words:
    print(f"{w} --> {rs.stem(w)}")

eat --> eat
eating --> eat
eaten --> eaten
eats --> eat
ran --> ran
running --> runn
runs --> run
runner --> runn
ate --> ate
bats --> bat
batted --> batt
battering --> batter
dance --> dance
danced --> danc
dancing --> danc


In [6]:
from nltk.stem import SnowballStemmer
ss = SnowballStemmer('english')

for w in words:
    print(f"{w} --> {ss.stem(w)}")

eat --> eat
eating --> eat
eaten --> eaten
eats --> eat
ran --> ran
running --> run
runs --> run
runner --> runner
ate --> ate
bats --> bat
batted --> bat
battering --> batter
dance --> danc
danced --> danc
dancing --> danc


### Lemmatization ###

In [10]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

for w in words:
    print(f"{w} --> {wnl.lemmatize(w, pos='v')}")

eat --> eat
eating --> eat
eaten --> eat
eats --> eat
ran --> run
running --> run
runs --> run
runner --> runner
ate --> eat
bats --> bat
batted --> bat
battering --> batter
dance --> dance
danced --> dance
dancing --> dance


In [11]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

for w in words:
    print(f"{w} --> {wnl.lemmatize(w, pos='n')}")

eat --> eat
eating --> eating
eaten --> eaten
eats --> eats
ran --> ran
running --> running
runs --> run
runner --> runner
ate --> ate
bats --> bat
batted --> batted
battering --> battering
dance --> dance
danced --> danced
dancing --> dancing


In [12]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

for w in words:
    print(f"{w} --> {wnl.lemmatize(w, pos='a')}")

eat --> eat
eating --> eating
eaten --> eaten
eats --> eats
ran --> ran
running --> running
runs --> runs
runner --> runner
ate --> ate
bats --> bats
batted --> batted
battering --> battering
dance --> dance
danced --> danced
dancing --> dancing


### Text Preprocessing - Filtering Stop words , Stemming and Lemmatization ###

In [None]:
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk

stemmer = SnowballStemmer('english')


# stopwords.words('english')
sentences= nltk.tokenize.sent_tokenize(corpus)

#Apply Stopwords Removal and Stemming

for i in range(len(sentences)):
    words =nltk.word_tokenize(sentences[i])
    words=[stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i]=' '.join(words) # Convert list of words back to sentence

In [28]:
print(f"Snowball Stemming and Removal of Stopwords:",sentences)

Snowball Stemming and Removal of Stopwords: ['to generat paragraph natur , use one mani ai-pow paragraph generat tool avail onlin .', 'these tool use artifici intellig natur languag process creat origin content base specif , topic , tone , length .']


In [34]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

lemmatizer = WordNetLemmatizer()


# stopwords.words('english')
sentences= nltk.tokenize.sent_tokenize(corpus)

#Apply Stopwords Removal and Stemming

for i in range(len(sentences)):
    words =nltk.word_tokenize(sentences[i])
    words=[lemmatizer.lemmatize(word.lower(),pos='v') for word in words if word not in set(stopwords.words('english'))]
    sentences[i]=' '.join(words) # Convert list of words back to sentence

In [35]:
print(f"WordNet Lemmatizing and Removal of Stopwords:",sentences)

WordNet Lemmatizing and Removal of Stopwords: ['to generate paragraph nature , use one many ai-powered paragraph generator tool available online .', 'these tool use artificial intelligence natural language process create original content base specifications , topic , tone , length .', 'the transformer architecture deep learn model use self-attention mechanism process sequential data , like text , without rely recurrence .', "this architecture , introduce 2017 paper `` attention be all you need '' , structure around encoder decoder , efficiently process input sequence generate output sequence capture relationships elements regardless distance ."]
