[Reference](https://blog.ekbana.com/pre-processing-text-in-python-ad13ea544dae)

# Convert text to lower case

In [3]:
import nltk
nltk.download('punkt')
def to_lower(text):
    """
    Converting text to lower case as in, converting "Hello" to  "hello" or "HELLO" to "hello".
    """
    return ' '.join([w.lower() for w in nltk.word_tokenize(text)])
text = """Harry Potter is the most miserable, lonely boy you can imagine. He's shunned by his relatives, the Dursley's, that have raised him since he was an infant. He's forced to live in the cupboard under the stairs, forced to wear his cousin Dudley's hand-me-down clothes, and forced to go to his neighbour's house when the rest of the family is doing something fun. Yes, he's just about as miserable as you can get."""
print (to_lower(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
harry potter is the most miserable , lonely boy you can imagine . he 's shunned by his relatives , the dursley 's , that have raised him since he was an infant . he 's forced to live in the cupboard under the stairs , forced to wear his cousin dudley 's hand-me-down clothes , and forced to go to his neighbour 's house when the rest of the family is doing something fun . yes , he 's just about as miserable as you can get .


# Remove Tags

In [4]:
import re
text = """<head><body>hello world!</body></head>"""
cleaned_text = re.sub('<[^<]+?>','', text)
print (cleaned_text)

hello world!


# Remove Numbers

In [5]:
text = "There was 200 people standing right next to me at 2pm."
output = ''.join(c for c in text if not c.isdigit())
print(output)

There was  people standing right next to me at pm.


# Remove punctuation

In [6]:
from string import punctuation
def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)
text = "Hello! how are you doing?"
print (strip_punctuation(text))

Hello how are you doing


# Lemmatize

In [11]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
#is based on The Porter Stemming Algorithm
stopword = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()
text = 'the functions of this fan is great'
word_tokens = nltk.word_tokenize(text)
lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
print (lemmatized_word)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
['the', 'function', 'of', 'this', 'fan', 'is', 'great']


# Stemming

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
#is based on The Porter Stemming Algorithm
stopword = stopwords.words('english')
snowball_stemmer = SnowballStemmer('english')
text = 'This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit'
word_tokens = nltk.word_tokenize(text)
stemmed_word = [snowball_stemmer.stem(word) for word in word_tokens]
print (stemmed_word)

['this', 'is', 'a', 'demo', 'text', 'for', 'nlp', 'use', 'nltk', '.', 'full', 'form', 'of', 'nltk', 'is', 'natur', 'languag', 'toolkit']


# word tokenize

In [14]:
text = 'This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit'
word_tokens = nltk.word_tokenize(text)
print (word_tokens)

['This', 'is', 'a', 'Demo', 'Text', 'for', 'NLP', 'using', 'NLTK', '.', 'Full', 'form', 'of', 'NLTK', 'is', 'Natural', 'Language', 'Toolkit']


# sentence tokenize

In [15]:
text = 'This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit'
sent_token = nltk.sent_tokenize(text)
print (sent_token)

['This is a Demo Text for NLP using NLTK.', 'Full form of NLTK is Natural Language Toolkit']


# Stop words removal

In [17]:
import nltk
from nltk.corpus import stopwords
stopword = stopwords.words('english')
text = 'This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit'
word_tokens = nltk.word_tokenize(text)
removing_stopwords = [word for word in word_tokens if word not in stopword]
print (removing_stopwords)

['This', 'Demo', 'Text', 'NLP', 'using', 'NLTK', '.', 'Full', 'form', 'NLTK', 'Natural', 'Language', 'Toolkit']


# Contraction

In [20]:
# coding: utf-8
import re
import nltk
!pip install contractions
from contractions import contractions_dict

def expand_contractions(text, contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) \
            if contractions_dict.get(match) \
            else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        return expanded_contraction
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
def main():
    text = """I ain't going there. You'll have to go alone."""
    
    text=expand_contractions(text,contractions_dict)
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    
    print (tokenized_sentences)
if __name__ == '__main__':
    main()

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/00/92/a05b76a692ac08d470ae5c23873cf1c9a041532f1ee065e74b374f218306/contractions-0.0.25-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 3.9MB/s 
[?25hCollecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 17.0MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  

# Spell Check

In [22]:
!pip install autocorrect
from autocorrect import spell
text = "This is a wrld of hope"
spells = [spell(w) for w in (nltk.word_tokenize(text))]
print (spells)

Collecting autocorrect
[?25l  Downloading https://files.pythonhosted.org/packages/aa/5b/6510d8370201fc96cbb773232c2362079389ed3285b0b1c6a297ef6eadc0/autocorrect-2.0.0.tar.gz (1.8MB)
[K     |████████████████████████████████| 1.8MB 2.8MB/s 
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.0.0-cp36-none-any.whl size=1811640 sha256=bf4b52fc6a7945ab1858e28200f528bbc90ef105b067764f902abb06cfe93ce9
  Stored in directory: /root/.cache/pip/wheels/0b/06/bc/e66f28d72bed29591eadc79cebb2e7964ad0282804ab233da3
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.0.0
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is