[Reference](https://medium.com/@abdallahashraf90x/text-pre-processing-for-nlp-95cef3ad6bab)

# 1. Preliminaries
## a) Sentence segmentation

In [2]:
import nltk
# Fetch the Punkt tokenizer data (tokenizers/punkt) used by the NLTK
# tokenizers in the cells below.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample passage used as input for the segmentation and tokenization demos.
mytext = """ GPT-4 image analysis goes beyond describing the picture.
In the same demonstration Vee watched, an OpenAI representative sketched an
image of a simple website and fed the drawing to GPT-4. Next the model was
 asked to write the code required to produce such a website—and it did.
 “It looked basically like what the image is. It was very, very simple,
 but it worked pretty well,” says Jonathan May, a research associate professor
 at the University of Southern California. “So that was cool.” """

# Split the passage into a list of sentence strings; the next cell iterates over it.
my_sentences = sent_tokenize(mytext)

## b) Word tokenization

In [4]:
# Show every sentence followed by its word-level tokens on the next line.
for sentence in my_sentences:
    print(sentence, word_tokenize(sentence), sep="\n")

 GPT-4 image analysis goes beyond describing the picture.
['GPT-4', 'image', 'analysis', 'goes', 'beyond', 'describing', 'the', 'picture', '.']
In the same demonstration Vee watched, an OpenAI representative sketched an
image of a simple website and fed the drawing to GPT-4.
['In', 'the', 'same', 'demonstration', 'Vee', 'watched', ',', 'an', 'OpenAI', 'representative', 'sketched', 'an', 'image', 'of', 'a', 'simple', 'website', 'and', 'fed', 'the', 'drawing', 'to', 'GPT-4', '.']
Next the model was
 asked to write the code required to produce such a website—and it did.
['Next', 'the', 'model', 'was', 'asked', 'to', 'write', 'the', 'code', 'required', 'to', 'produce', 'such', 'a', 'website—and', 'it', 'did', '.']
“It looked basically like what the image is.
['“', 'It', 'looked', 'basically', 'like', 'what', 'the', 'image', 'is', '.']
It was very, very simple,
 but it worked pretty well,” says Jonathan May, a research associate professor
 at the University of Southern California.
['It', 'w

# 2. Frequent Steps

In [7]:
from nltk.corpus import stopwords
from string import punctuation

def preprocess_corpus(texts):
    """Tokenize each text and drop stop words, digit-only tokens and punctuation.

    Args:
        texts: Iterable of strings; each one is tokenized with ``word_tokenize``.

    Returns:
        A list with one entry per input text. Each entry is the list of
        lower-cased tokens that survived the filtering.

    Note:
        ``stopwords.words("english")`` needs the NLTK ``stopwords`` corpus to
        be available locally.
    """
    mystopwords = set(stopwords.words("english"))
    punct_chars = set(punctuation)

    def remove_stops_digits(tokens):
        """Lower-case ``tokens`` and keep only the informative ones."""
        kept = []
        for token in tokens:
            lowered = token.lower()
            # The stop-word list is lower-case, so compare the lower-cased
            # form; otherwise sentence-initial words like "The" slip through.
            if lowered in mystopwords or token.isdigit():
                continue
            # `punctuation` is a str, so `token in punctuation` is a substring
            # test that misses tokens such as "..." or "--". Check per character.
            if all(ch in punct_chars for ch in token):
                continue
            kept.append(lowered)
        return kept

    return [remove_stops_digits(word_tokenize(text)) for text in texts]

## Stemming and lemmatization

In [8]:
from nltk.stem.porter import PorterStemmer
# Porter stemming strips suffixes by rule, so a stem need not be a dictionary word.
stemmer = PorterStemmer()
word1 = "cars"
word2 = "revolution"
stems = [stemmer.stem(word) for word in (word1, word2)]
print(*stems)

car revolut


In [11]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data before the first lemma lookup.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# pos="a" tells the lemmatizer to treat the word as an adjective.
better_lemma = lemmatizer.lemmatize("better", pos="a")
print(better_lemma)

[nltk_data] Downloading package wordnet to /root/nltk_data...


good


In [12]:
import spacy
# Load the small English pipeline and run it over a single word.
sp = spacy.load('en_core_web_sm')
token = sp('better')  # the processed result; iterating it yields one item per token
for word in token:
    # Print each token's surface text next to its lemma.
    print(f"{word.text} {word.lemma_}")

better well
