## Examples of preprocessing text

### Importing libraries: 

In [10]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

import numpy as np
import pandas as pd
import re

In [11]:
dataset = ['I am enjoying my time at this meeting in Virginia. Virginia is a beautiful city but I dont like the traffic!']

dataset = pd.DataFrame(dataset)

dataset.head()

Unnamed: 0,0
0,I am enjoying my time at this meeting in Virgi...


### Inspecting data

* some of the nltk builtin methods that we could use to prepare data for inspecting include:

    * nltk.tokenizer.word_tokenize()
    * nltk.PorterStemmer or SnowballStemmer class to recude words into their stems
    * nltk. tokenize.WhitespaceTokenizer() extracts tokens from string of words or sentences without whitespaces, new line, or tabs

### Stemming Example: 

In [12]:
# Stemming Example: 
import nltk
# nltk.download('punkt')

stemmer = PorterStemmer()

sentence = "Sentiment analysis is one of the subfields of Natural Language Processing"
words = word_tokenize(sentence)

for w in words:
    print(w, " : ", stemmer.stem(w))

Sentiment  :  sentiment
analysis  :  analysi
is  :  is
one  :  one
of  :  of
the  :  the
subfields  :  subfield
of  :  of
Natural  :  natur
Language  :  languag
Processing  :  process


[nltk_data] Downloading package punkt to /Users/axa4/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Tokenizer Example:

In [14]:
# tokenizer example

sentencce = "Sentiment analysis is one of the subfields of Natural Language Processing."

words = word_tokenize(sentence)

print(words)


['Sentiment', 'analysis', 'is', 'one', 'of', 'the', 'subfields', 'of', 'Natural', 'Language', 'Processing']


### Whitespace Tokenizer Example: 

In [15]:
# Example of Whitespace tokenizer

from nltk.tokenize import WhitespaceTokenizer

# Create an instance of WhitespaceTokenizer
white_tokenizer = WhitespaceTokenizer( )

# Example input
sentence = " This  is  a \nsentence with \n endlines \n and a tab\t\n "
print(sentence)

# Tokenize the sentence 
sentence = white_tokenizer.tokenize(sentence)

print(sentence)


 This  is  a 
sentence with 
 endlines 
 and a tab	
 
['This', 'is', 'a', 'sentence', 'with', 'endlines', 'and', 'a', 'tab']


### Regex Tokenizer Example:

In [5]:
# Example of RegexpTokenizer() is another useful nltk tokenizer that is used to remove punctuation from text.

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize ('This is a first-hand experience of nlp!!! Keep cleaning the data! ')

['This',
 'is',
 'a',
 'first',
 'hand',
 'experience',
 'of',
 'nlp',
 'Keep',
 'cleaning',
 'the',
 'data']

### Continue Preprocessing data : 

* Preprocessing text consists of normalizing it with various methods, such as cleaning up punctuation, stop words, stemming, etc. 

In [16]:
import nltk
from nltk.corpus import stopwords

set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r