 **Code Snippets for Text Preprocessing Techniques in Natural Language Processing**

In [1]:
def convert_to_lower_case(text):
    """Return a copy of ``text`` in which every cased character is lower-cased."""
    lowered = text.lower()
    return lowered


# Demo input: a quote containing capitalised words.
sample_text = "Sometimes it pays to stay in bed on Monday, rather than spending the rest of the week debugging Monday’s code. – Dan Salomon"

convert_to_lower_case(text=sample_text)

'sometimes it pays to stay in bed on monday, rather than spending the rest of the week debugging monday’s code. – dan salomon'

In [2]:
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    # split() with no argument discards leading, trailing and repeated whitespace.
    pieces = text.split()
    return " ".join(pieces)


# Demo input: a quote with runs of extra spaces between words.
sample_text = "Talk is cheap. Show me the code.     ―     Linus  Torvalds"
remove_whitespace(text=sample_text)

'Talk is cheap. Show me the code. ― Linus Torvalds'

In [3]:
import re
def remove_punctuations(text):
    """Delete every character of ``text`` that is neither a word character nor whitespace.

    Word characters are letters, digits and the underscore, so an underscore
    is kept. Whitespace is left untouched, which means removing a
    free-standing symbol can leave two adjacent spaces behind.

    Args:
        text: The string to clean.

    Returns:
        A new string with the punctuation characters removed.
    """
    # The pattern must be a raw string: backslash-w and backslash-s are not
    # valid Python string escapes, and a non-raw literal triggers an
    # invalid-escape-sequence warning on current Python versions.
    return re.sub(r'[^\w\s]', '', text)


sample_text = 'Programming is about managing complexity: the complexity of the problem, laid upon the complexity of the machine. Because of this complexity, most of our programming projects fail.'
remove_punctuations(sample_text)

'Programming is about managing complexity the complexity of the problem laid upon the complexity of the machine Because of this complexity most of our programming projects fail'

In [4]:
!pip install contractions



In [5]:
import contractions

def expand_contractions(text):
    """Expand contractions in ``text`` (e.g. "I'm" -> "I am") using ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded


# Demo input: a quote containing the contraction "I'm" twice.
sample_text = "I'm not a great programmer; I'm just a good programmer with great habits. - Kent Beck"
expand_contractions(sample_text)

'I am not a great programmer; I am just a good programmer with great habits. - Kent Beck'

In [6]:
!pip install TextBlob



In [7]:
# Run TextBlob's corpus downloader as a shell command. Per the recorded output
# it fetches/refreshes NLTK packages such as brown, punkt and wordnet.
# NOTE(review): `python` here is whatever the shell resolves on PATH --
# confirm it is the same interpreter the kernel is running.
!python -m textblob.download_corpora

Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [8]:
from textblob import TextBlob

def correct_spelling(text):
    """Run TextBlob's spelling correction over ``text``.

    Returns whatever ``TextBlob(text).correct()`` returns; in the recorded
    output this displays as a ``TextBlob(...)`` object rather than a plain str.
    """
    blob = TextBlob(text)
    return blob.correct()


# Demo input: a sentence with three deliberately scrambled words.
sample_text = "Truht can only be fonud in one place: the cdoe."
correct_spelling(sample_text)

TextBlob("Truth can only be found in one place: the code.")

In [9]:
!pip install nltk



In [10]:
from nltk import word_tokenize

def tokenize_word(text):
    """Split ``text`` into a list of word and punctuation tokens.

    Thin wrapper around NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize(text)``.
    """
    return word_tokenize(text)


# Backward-compatible alias: this helper was originally defined under the
# misspelled name, so keep that name working for existing callers.
tokenzie_word = tokenize_word

sample_text = "Object-oriented programming offers a sustainable way to write spaghetti code. It lets you accrete programs as a series of patches. ― Paul Graham"
tokenize_word(sample_text)

['Object-oriented',
 'programming',
 'offers',
 'a',
 'sustainable',
 'way',
 'to',
 'write',
 'spaghetti',
 'code',
 '.',
 'It',
 'lets',
 'you',
 'accrete',
 'programs',
 'as',
 'a',
 'series',
 'of',
 'patches',
 '.',
 '―',
 'Paul',
 'Graham']

In [11]:
from nltk import sent_tokenize

def tokenize_sent(text):
    """Split ``text`` into a list of sentences.

    Thin wrapper around NLTK's ``sent_tokenize``.

    Args:
        text: The string to split.

    Returns:
        The list of sentence strings produced by ``sent_tokenize(text)``.
    """
    return sent_tokenize(text)


# Backward-compatible alias: this helper was originally defined under the
# misspelled name, so keep that name working for existing callers.
tokenzie_sent = tokenize_sent

sample_text = "Object-oriented programming offers a sustainable way to write spaghetti code. It lets you accrete programs as a series of patches. ― Paul Graham"
tokenize_sent(sample_text)

['Object-oriented programming offers a sustainable way to write spaghetti code.',
 'It lets you accrete programs as a series of patches.',
 '― Paul Graham']

In [12]:
import nltk
# Fetch the NLTK "stopwords" package, which the next cell loads through
# stopwords.words('english'). The call's return value (True in the recorded
# output) is displayed because it is the cell's last expression.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords
from nltk import word_tokenize
stopwords_list = stopwords.words('english')


def eliminate_stopwords(text):
    """Remove English stopwords from ``text``.

    The text is tokenized with ``word_tokenize``; every token whose
    lower-cased form appears in ``stopwords_list`` is dropped, and the
    remaining tokens are joined with single spaces. Punctuation tokens are
    kept, so they end up space-separated in the result.

    Matching is case-insensitive: the NLTK English stopword list is
    lower-case, so comparing tokens verbatim would let capitalised
    stopwords such as "The" or "And" slip through.

    Args:
        text: The string to filter.

    Returns:
        A string of the surviving tokens separated by single spaces.
    """
    # Build the set per call so later edits to stopwords_list are still
    # honoured, while each membership test is O(1) instead of a list scan.
    stopword_set = set(stopwords_list)
    tokens = word_tokenize(text)
    kept_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return " ".join(kept_tokens)


sample_text = "Software and cathedrals are much the same — first we build them, then we pray."
eliminate_stopwords(sample_text)

'Software cathedrals much — first build , pray .'

In [14]:
from nltk.stem import PorterStemmer
from nltk import word_tokenize

stemmer = PorterStemmer()


def stem_words(text):
    """Tokenize ``text``, stem each token with the shared Porter stemmer and
    join the stems back together with single spaces."""
    tokens = word_tokenize(text)
    stems = map(stemmer.stem, tokens)
    return " ".join(stems)


# Demo input: a quote with plural and -ing/-ly word forms.
sample_text = "One: Demonstrations always crash. And two: The probability of them crashing goes up exponentially with the number of people watching."
stem_words(sample_text)

'one : demonstr alway crash . and two : the probabl of them crash goe up exponenti with the number of peopl watch .'

In [15]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

lemmatizer = WordNetLemmatizer()


def lemmatize_words(text, pos="v"):
    """Lemmatize every token of ``text`` with the shared WordNet lemmatizer.

    The text is tokenized with ``word_tokenize``, each token is passed to
    ``lemmatizer.lemmatize`` with the same part-of-speech tag, and the
    results are joined with single spaces.

    Args:
        text: The string to lemmatize.
        pos: Part-of-speech tag forwarded to ``lemmatize`` for every token.
            Defaults to "v" (verb), the value that used to be hard-coded,
            so existing calls behave exactly as before.

    Returns:
        A string of the lemmatized tokens separated by single spaces.
    """
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token, pos=pos) for token in tokens]
    return " ".join(lemmas)


sample_text = "Software undergoes beta testing shortly before it’s released. Beta is Latin for still doesn’t work."
lemmatize_words(sample_text)

'Software undergo beta test shortly before it ’ s release . Beta be Latin for still doesn ’ t work .'

In [16]:
# Lemmatize a single word without passing ``pos`` (contrast with the
# pos="v" call inside lemmatize_words); the recorded output is 'demonstration'.
# Relies on WordNetLemmatizer having been imported in the previous cell.
lemma = WordNetLemmatizer()
lemma.lemmatize("demonstrations")

'demonstration'