<center><h1>Text Preprocessing</h1></center>

## Text preprocessing is one of the most important steps when working with text.
## It is worth spending extra time on cleaning the data.

# Import libraries

In [97]:
import re
import nltk
import string
import itertools
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [98]:
# Module-level NLTK helpers, instantiated once and reused by
# stem_words() and lemmatize_words().
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

## 1. Remove URLs from the text.

In [99]:
def remove_url_from_text(text):
    """Return *text* with every URL removed.

    A URL is a run of non-whitespace characters that starts, on a word
    boundary, with ``http://``, ``https://`` or ``www.`` (matched
    case-insensitively). Ordinary words that merely begin with "http"
    (for example ``httpd``) are left untouched.

    Args:
        text: The raw input string.

    Returns:
        The string with each URL replaced by the empty string. Punctuation
        glued to the end of a URL (such as a trailing full stop) is removed
        together with it.
    """
    # Require an explicit scheme or a leading "www." so that plain words
    # starting with "http" are not deleted by accident.
    return re.sub(r"\b(?:https?://|www\.)\S+", "", text, flags=re.IGNORECASE)

## 2. Remove email addresses from the text.

In [100]:
def remove_email_from_text(text):
    """Return *text* with every email-like token replaced by one space.

    A token counts as an email address when it has an ``@`` with at least
    one word character, dot or hyphen on each side. Dots adjacent to the
    address are part of the match, so a trailing full stop is consumed too.
    """
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+', flags=re.MULTILINE)
    return email_pattern.sub(' ', text)

## 3. Remove extra characters from the text.
`Example: sooooo -> soo` (runs of a repeated character are capped at two, so words like `good` stay intact)

In [101]:
def remove_extra_char_from_words(text):
    """Cap every run of an identical character at two occurrences.

    Works on all characters, not only letters: ``"sooooo"`` becomes
    ``"soo"`` and three consecutive spaces become two. Runs of one or two
    characters are left as they are.
    """
    kept = []
    for _, run in itertools.groupby(text):
        # Keep at most the first two characters of each run.
        kept.extend(itertools.islice(run, 2))
    return ''.join(kept)

## 4. Remove symbols from the text.

In [102]:
def remove_symbols(text):
    """Strip everything except ASCII letters and whitespace from *text*.

    Digits, punctuation and other symbols are deleted outright. Newlines
    are turned into single spaces so the last word of one line is not
    fused with the first word of the next.

    Args:
        text: The input string.

    Returns:
        A string made up only of ASCII letters and whitespace, with no
        newline characters.
    """
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Replace (rather than delete) line breaks: "hello\nworld" must stay
    # two words, not become "helloworld".
    text = text.replace('\n', ' ')
    return text

## 5. Tokenize the text.

In [103]:
def tokenize_text(text):
    """Split *text* into tokens by delegating to ``nltk.word_tokenize``."""
    return nltk.word_tokenize(text)

## 6. Convert text to lower case.

In [104]:
def text_to_lower_case(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

## 7. Remove punctuation from the text.

In [105]:
def remove_punctuation(tokens):
    """Strip punctuation from each token and keep only alphabetic results.

    Every character listed in ``string.punctuation`` is deleted from each
    token. A token is then dropped if what remains is empty or contains
    anything that is not a letter (digits, for example).
    """
    # A mapping of character -> None tells str.translate to delete it.
    delete_punct = str.maketrans(dict.fromkeys(string.punctuation))
    cleaned = (token.translate(delete_punct) for token in tokens)
    return [candidate for candidate in cleaned if candidate.isalpha()]

## 8. Remove stop words from the text.

In [106]:
def remove_stopwords(tokens, stop_words=None):
    """Return the tokens that are not stop words, in their original order.

    Args:
        tokens: Iterable of word strings.
        stop_words: Optional iterable of words to discard. When omitted,
            NLTK's English stop word list
            (``stopwords.words('english')``) is used.

    Returns:
        A new list with every token found in the stop word collection
        removed. Matching is an exact, case-sensitive string comparison.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build a set once so each membership test is O(1) rather than a
    # linear scan over the whole stop word list.
    blocked = frozenset(stop_words)
    return [word for word in tokens if word not in blocked]

## 9. Stem the words.

In [107]:
def stem_words(tokens):
    """Return a list with each token passed through the module-level ``stemmer``."""
    stemmed = []
    for word in tokens:
        stemmed.append(stemmer.stem(word))
    return stemmed

## 10. Lemmatize words.

In [108]:
def lemmatize_words(tokens):
    """Return a list with each token lemmatized by the module-level ``lemmatizer``.

    Every token is passed with ``pos='v'``, i.e. it is lemmatized as a verb.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word, pos='v'))
    return lemmas

# Use the above functions to preprocess text.

In [112]:
def preprocess_text(text):
    """Run the complete cleaning pipeline on *text* and return its tokens.

    String-level cleanup runs first (URLs, email addresses, over-long
    character runs, symbols). The cleaned string is then tokenized and the
    tokens are lower-cased, stripped of punctuation, filtered for stop
    words, stemmed and finally lemmatized. The order of the steps matters.
    """
    string_steps = (
        remove_url_from_text,
        remove_email_from_text,
        remove_extra_char_from_words,
        remove_symbols,
    )
    token_steps = (
        text_to_lower_case,
        remove_punctuation,
        remove_stopwords,
        stem_words,
        lemmatize_words,
    )

    cleaned = text
    for step in string_steps:
        cleaned = step(cleaned)

    result = tokenize_text(cleaned)
    for step in token_steps:
        result = step(result)
    return result

## Test sentence

In [113]:
# Sample run: push a sentence containing an email address and a URL
# through the whole pipeline and print the resulting tokens.
input_text = 'My gmail id is google@gmail.com. The url of my website is https://google.com.'
tokens = preprocess_text(input_text)
print(tokens)

['gmail', 'id', 'url', 'websit']
