## Text Preprocessing

### 0. Create Data Sets

In [1]:
import pandas as pd

In [2]:
# create a list of sentences
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "Does Maven Market carry Eureka lemons or Meyer lemons?",
    "An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",
    "iced tea is my favorite"
]

In [3]:
data

['When life gives you lemons, make lemonade! 🙂',
 'She bought 2 lemons for $1 at Maven Market.',
 'A dozen lemons will make a gallon of lemonade. [AllRecipes]',
 'lemon, lemon, lemons, lemon, lemon, lemons',
 "He's running to the market to get a lemon — there's a great sale today.",
 'Does Maven Market carry Eureka lemons or Meyer lemons?',
 'An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]',
 'iced tea is my favorite']

In [4]:
data_df = pd.DataFrame(data, columns=['sentence'])
data_df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade....
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — th...
5,Does Maven Market carry Eureka lemons or Meyer...
6,"An Arnold Palmer is half lemonade, half iced t..."
7,iced tea is my favorite


In [5]:
# Change pandas options to display the full sentence in the data franme columns.
pd.set_option('display.max_colwidth', None)
data_df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


In [6]:
# create a test series of sentences for later on
test = [
    "We're going to start this course with traditional NLP applications.",
    "Then we'll move on to modern NLP theory.",
    "Finally, we'll wrap things up with modern NLP applications."
]

test_series = pd.Series(test)
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

### 1. Text Preprocessing with Pandas

In [7]:
data_df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


In [8]:
# Make copy to keep regular original data
df = data_df.copy()
df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


In [9]:
# lowercase
df['sentence_clean'] = df['sentence'].str.lower()
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂","when life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade. [allrecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.,he's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea. [wikipedia]"
7,iced tea is my favorite,iced tea is my favorite


In [10]:
# remove text inside []
df['sentence_clean'] = df['sentence_clean'].str.replace(r'\[.*?\]', '', regex=True)
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂","when life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade.
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.,he's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea."
7,iced tea is my favorite,iced tea is my favorite


In [11]:
# remove punctuation
df['sentence_clean'] = df['sentence_clean'].str.replace(r'[^\w\s]', '', regex=True)
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [12]:
# Create a Function to apply the 3 steps we did to preprocess text
def lower_replace(series):
    output = series.str.lower()
    output = output.str.replace(r'\[.*?\]', '', regex=True)
    output = output.str.replace(r'[^\w\s]', '', regex=True)
    return output

In [13]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [14]:
lower_replace(test_series)

0    were going to start this course with traditional nlp applications
1                               then well move on to modern nlp theory
2             finally well wrap things up with modern nlp applications
dtype: object

In [15]:
df.sentence

0                               When life gives you lemons, make lemonade! 🙂
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon — there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [16]:
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

### 2. Text Preprocessing with spaCy

In [17]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [21]:
import spacy # specifically have to download this outside of jupyter notebook. https://www.udemy.com/course/nlp-in-python-2025/learn/lecture/50347285#questions

'''
conda activate env_name
python -m spacy download en_core_web_sm
'''

'\nconda activate env_name\npython -m spacy download en_core_web_sm\n'

In [22]:
nlp = spacy.load('en_core_web_sm') # import english model.

In [25]:
phrase = df.sentence_clean[0]
phrase

'when life gives you lemons make lemonade '

In [27]:
doc = nlp(phrase)
doc

when life gives you lemons make lemonade 

In [29]:
# Tokenize
[token.text for token in doc]

['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']

In [36]:
# lemmatize
[token.lemma_ for token in doc]

['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']

In [37]:
# stop words
norm = [token.lemma_ for token in doc if not token.is_stop]
norm

['life', 'give', 'lemon', 'lemonade']

In [38]:
list(nlp.Defaults.stop_words)[:10]

['their',
 'cannot',
 'go',
 'several',
 'there',
 'per',
 'otherwise',
 'even',
 'also',
 'anything']

In [39]:
' '.join(norm) # join strings with a space in between

'life give lemon lemonade'

In [40]:
def token_lemma_nonstop(text):
    doc = nlp(text)
    output = [token.lemma_ for token in doc if not token.is_stop]
    output = ' '.join(output) # join strings with a space in between
    return output

In [41]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [42]:
test_series.apply(token_lemma_nonstop) # apply means that we can apply a function to every row of data.

0    go start course traditional NLP application .
1                              modern NLP theory .
2    finally , wrap thing modern NLP application .
dtype: object

In [44]:
df.sentence

0                               When life gives you lemons, make lemonade! 🙂
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon — there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [45]:
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

In [46]:
lower_replace(df.sentence).apply(token_lemma_nonstop) # apply means that we can apply a function to every row of data.

0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2                    dozen lemon gallon lemonade
3            lemon lemon lemon lemon lemon lemon
4        s run market lemon   s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: sentence, dtype: object

# Parts of Speech Tagging

In [47]:
phrase2 = lower_replace(df.sentence).apply(token_lemma_nonstop)[0]
phrase2

'life give lemon lemonade'

In [48]:
doc2 = nlp(phrase2)
doc2

life give lemon lemonade

In [50]:
[(token.text, token.pos_) for token in doc2]

[('life', 'NOUN'), ('give', 'VERB'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [51]:
[(token.text, token.pos_) for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]

[('life', 'NOUN'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [53]:
nouns = [token.text for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]
nouns

['life', 'lemon', 'lemonade']

In [54]:
' '.join(nouns)

'life lemon lemonade'

In [60]:
def filter_pos(text, pos_list=['NOUN', 'PROPN']):
    doc = nlp(text)
    output = [token.text for token in doc if token.pos_ in pos_list]
    output = ' '.join(output)
    return output

In [61]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [62]:
test_series.apply(filter_pos)

0    course NLP applications
1                 NLP theory
2    things NLP applications
dtype: object

In [63]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [64]:
df.sentence

0                               When life gives you lemons, make lemonade! 🙂
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon — there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [65]:
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

In [66]:
lower_replace(df.sentence).apply(token_lemma_nonstop)

0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2                    dozen lemon gallon lemonade
3            lemon lemon lemon lemon lemon lemon
4        s run market lemon   s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: sentence, dtype: object

In [67]:
lower_replace(df.sentence).apply(token_lemma_nonstop).apply(filter_pos)

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object

In [68]:
# Test. Change order of functions
lower_replace(df.sentence).apply(filter_pos).apply(token_lemma_nonstop)

0              life lemon lemonade
1               lemon maven market
2      dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon
4          market lemon sale today
5        market eureka lemon lemon
6              palmer lemonade tea
7                              tea
Name: sentence, dtype: object

### 3. NLP Pipeline

In [69]:
# Create a Function to apply the 3 steps we did to preprocess text
def lower_replace(series):
    output = series.str.lower()
    output = output.str.replace(r'\[.*?\]', '', regex=True)
    output = output.str.replace(r'[^\w\s]', '', regex=True)
    return output

def token_lemma_nonstop(text):
    doc = nlp(text)
    output = [token.lemma_ for token in doc if not token.is_stop]
    output = ' '.join(output) # join strings with a space in between
    return output

def filter_pos(text, pos_list=['NOUN', 'PROPN']):
    doc = nlp(text)
    output = [token.text for token in doc if token.pos_ in pos_list]
    output = ' '.join(output)
    return output

In [70]:
def nlp_pipeline(series):
    output = lower_replace(series)
    output = output.apply(token_lemma_nonstop)
    output = output.apply(filter_pos)
    return output

In [71]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [73]:
df.sentence

0                               When life gives you lemons, make lemonade! 🙂
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon — there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [74]:
nlp_pipeline(test_series)

0          nlp application
1               nlp theory
2    thing nlp application
dtype: object

In [76]:
text_clean = nlp_pipeline(df.sentence)
text_clean

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object

In [None]:
pd.to_pickle(text_clean, 'Data/text_clean.pkl')