# NLP: Text Processing In Data Science Projects
- [Reference](https://medium.com/fintechexplained/nlp-text-processing-in-data-science-projects-f083009d78fc)

In [1]:
!pip install nltk



In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
import string
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## 1. Convert Text To Lowercase

In [3]:
text = 'This is an NLP article of FinTechExplained'

lower_case_text = lowercase(text)
print(lower_case_text)

## 2. Tokenise Paragraphs To Sentences

In [4]:
text = 'FinTechExplained aims to explain how text processing works.  Once we have gathered the text, the next stage is about cleaning and consolidating the text. It is important to ensure the text is standardised and the noise is removed so that efficient analysis can be performed on the text to derive meaningful insights.'
list = sent_tokenize(text)
print(list)

['FinTechExplained aims to explain how text processing works.', 'Once we have gathered the text, the next stage is about cleaning and consolidating the text.', 'It is important to ensure the text is standardised and the noise is removed so that efficient analysis can be performed on the text to derive meaningful insights.']


## 3. Tokenise Sentences To Words

In [5]:
tokenizer = TreebankWordTokenizer()
text = 'FinTechExplained aims to explain how text processing works.  Once we have gathered the text, the next stage is about cleaning and consolidating the text. It is important to ensure the text is standardised and the noise is removed so that efficient analysis can be performed on the text to derive meaningful insights.'
print(tokenizer.tokenize(text))

['FinTechExplained', 'aims', 'to', 'explain', 'how', 'text', 'processing', 'works.', 'Once', 'we', 'have', 'gathered', 'the', 'text', ',', 'the', 'next', 'stage', 'is', 'about', 'cleaning', 'and', 'consolidating', 'the', 'text.', 'It', 'is', 'important', 'to', 'ensure', 'the', 'text', 'is', 'standardised', 'and', 'the', 'noise', 'is', 'removed', 'so', 'that', 'efficient', 'analysis', 'can', 'be', 'performed', 'on', 'the', 'text', 'to', 'derive', 'meaningful', 'insights', '.']


## 4. Remove Numbers

In [6]:
import re
result = re.sub(r'\d+', '', '909FinTechExplained9876')
print(result)

FinTechExplained


## 5. Remove Punctuation

In [7]:
punctuation = string.punctuation
words = ['You','Are','Reading','FinTechExplained', '!', 'NLP', '.']
clean_words = [w for w in words if w not in punctuation]
clean_words = ['You','Are','Reading','FinTechExplained', 'NLP']

## 6. Remove Stop words

In [8]:
text = 'FinTechExplained is an important publication'
words = nltk.word_tokenize(text)
stopwords = stopwords.words('english')
clean = [w for w in words if w not in stopwords]
print(clean)

['FinTechExplained', 'important', 'publication']


## 7. Remove Whitespaces

In [9]:
' '.join('FinTechExplained Is A      Publication. \n This is about NLP'.split())

'FinTechExplained Is A Publication. This is about NLP'