# Cleaning Text

In [2]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline

In [3]:
# Sample "title. By author" strings; two entries carry a leading space
# that the following cell strips off.
text_data = [" Interrobang. By Aishwarya Henriette",
             "Parking And Going. By Karl Gautier", 
             " Today Is The night. By Jarek Prakash"]

In [4]:
# Trim leading and trailing whitespace from every entry.
strip_data = list(map(str.strip, text_data))

In [5]:
# Display the whitespace-trimmed strings.
strip_data

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [6]:
# Remove every period from the trimmed strings.
new_data = [title.replace('.', '') for title in strip_data]

In [7]:
# Display the strings with periods removed.
new_data

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [14]:
def capitalize(x):
    """Return ``x`` converted to title case.

    Delegates to ``x.title()``: the first letter of each word is
    upper-cased and the remaining letters are lower-cased. The sibling
    string methods ``capitalize``, ``upper`` and ``lower`` could be
    swapped in here for other casings.
    """
    titled = x.title()
    return titled

In [15]:
# Title-case every cleaned string.
capitalized = list(map(capitalize, new_data))

In [16]:
# Display the title-cased strings.
capitalized

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The Night By Jarek Prakash']

In [17]:
import re

In [21]:
def sub_with_X(x):
    """Return ``x`` with each ASCII letter (a-z, A-Z) replaced by 'X'.

    Every other character (digits, spaces, punctuation, non-ASCII
    letters) is left untouched.
    """
    ascii_letter = re.compile(r'[a-zA-Z]')
    return ascii_letter.sub('X', x)

In [22]:
# Mask the letters of every title-cased string.
replaced = list(map(sub_with_X, capitalized))

In [23]:
# Display the letter-masked strings.
replaced

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

# Removing Punctuation

In [31]:
import unicodedata

In [32]:
import sys

In [33]:
# Punctuation-heavy sample strings (rebinds text_data for this section).
text_data = ['Hi!!!! I. Love. This. Song....', '10000% Agree!!!! #LoveIT', 'Right?!?!']

In [35]:
# Translation table for str.translate: every Unicode code point whose general
# category starts with 'P' (punctuation) maps to None, i.e. is deleted.
# range() excludes its stop value, so "+ 1" is needed to include
# sys.maxunicode itself and scan the full code-point range.
punctuation = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

In [36]:
# Delete punctuation from each string using the translation table.
[text.translate(punctuation) for text in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

# Tokenizing Text

In [37]:
from nltk.tokenize import word_tokenize

In [41]:
import nltk

In [42]:
# Fetch the 'punkt' tokenizer data through the NLTK downloader.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/david/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [57]:
# Sample sentence for word tokenization.
string ='the science of today is the technology of tomorrow'

In [58]:
# Split the sample sentence into word tokens.
word_tokenize(string)

['the', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [59]:
from nltk.tokenize import sent_tokenize

In [60]:
# Two-sentence sample for sentence tokenization (rebinds string).
string = "The science of today is the technology of tomorrow. Tomorrow is today."

In [63]:
# Split the sample text into sentences, then display the result.
two_ = sent_tokenize(string)
two_

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

In [64]:
# Remove the periods from each tokenized sentence.
[sentence.replace('.', '') for sentence in two_]

['The science of today is the technology of tomorrow', 'Tomorrow is today']

# Removing Stop Words

In [65]:
from nltk.corpus import stopwords

In [69]:
# Fetch the stop-word corpus through the NLTK downloader.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/david/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [70]:
# Pre-tokenized sample sentence used for stop-word removal.
tokenized_words = ['i','am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']

In [71]:
# English stop words from the NLTK stopwords corpus.
stop = stopwords.words('english')

In [74]:
# Keep only the tokens that are not stop words.
[token for token in tokenized_words if token not in stop]

['going', 'go', 'store', 'park']

# Stemming Words

In [76]:
from nltk.stem.porter import PorterStemmer

In [77]:
# Pre-tokenized sample sentence used for stemming (rebinds tokenized_words).
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

In [78]:
# Porter stemmer instance used to stem the tokens below.
model = PorterStemmer()

In [79]:
# Reduce every token to its stem.
list(map(model.stem, tokenized_words))

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

# Tagging Parts of Speech

In [80]:
from nltk import pos_tag

In [81]:
from nltk import word_tokenize

In [84]:
# Fetch the averaged-perceptron POS tagger data through the NLTK downloader.
# NOTE(review): newer NLTK releases may look up
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/david/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [85]:
# Sample sentence for part-of-speech tagging (rebinds text_data as a single string).
text_data = "Chris loved outdoor running"

In [87]:
# Tokenize the sentence, then tag each token; the next cell unpacks the
# result as (word, tag) pairs.
tagged_text = pos_tag(word_tokenize(text_data))

In [88]:
# Keep only the words whose tag is one of the noun tags.
[token for token, pos in tagged_text if pos in ('NN', 'NNS', 'NNP', 'NNPS')]

['Chris']

In [90]:
# Sample sentences whose part-of-speech tags become features below.
tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field", 
          "San Francisco is an awesome city"]

In [92]:
# Accumulator for the per-tweet tag collections built by the following loop.
tagged_tweets = []

In [93]:
# Collect the part-of-speech tags of every tweet, one list per tweet.
# The accumulator is reset here so that re-running this cell does not append
# duplicate rows to a list left over from a previous execution.
tagged_tweets = []
for tweet in tweets:
    tweet_tag = pos_tag(word_tokenize(tweet))
    # Store a concrete list, not a generator expression: a generator can be
    # consumed only once, so any second pass over tagged_tweets (for example
    # re-running the binarizer cell) would see empty tag collections.
    tagged_tweets.append([tag for word, tag in tweet_tag])

In [96]:
from sklearn.preprocessing import MultiLabelBinarizer

In [97]:
# Binarizer that turns each tweet's tag collection into a row of 0/1 indicators.
one_hot = MultiLabelBinarizer()

In [98]:
# Learn the set of tags and encode every tweet as one indicator row.
one_hot.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [99]:
# Tag labels learned by the binarizer.
one_hot.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

In [100]:
from nltk.corpus import brown 
from nltk.tag import UnigramTagger 
from nltk.tag import BigramTagger 
from nltk.tag import TrigramTagger

In [102]:
# Fetch the Brown corpus through the NLTK downloader.
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/david/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [103]:
# Tagged sentences from the 'news' category of the Brown corpus.
sentences = brown.tagged_sents(categories='news')

In [104]:
# Positional split: the first 4000 sentences train the taggers, the
# remainder is held out for evaluation.
train = sentences[:4000]
test = sentences[4000:]

In [105]:
# Chain of n-gram taggers trained on the same data: the trigram tagger is
# given the bigram tagger as its backoff, which in turn is given the
# unigram tagger.
unigram = UnigramTagger(train) 
bigram = BigramTagger(train, backoff=unigram) 
trigram = TrigramTagger(train, backoff=bigram)

In [106]:
# Score the backoff tagger chain on the held-out sentences.
# NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour
# of accuracy() — confirm against the installed version.
trigram.evaluate(test)

0.8174734002697437

# Encoding Text as a Bag of Words

In [108]:
from sklearn.feature_extraction.text import CountVectorizer

In [109]:
# Three short documents for the bag-of-words examples (rebinds text_data as an array).
text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])

In [110]:
# Count vectorizer with default settings (rebinds model, previously the stemmer).
model = CountVectorizer()

In [114]:
# Learn the vocabulary and build the document-term count matrix.
trans = model.fit_transform(text_data)
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() (available since 1.0) returns an ndarray, so
# .tolist() keeps the plain-list display.
model.get_feature_names_out().tolist()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [116]:
# Dense view of the count matrix.
trans.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]])

In [118]:
# Print the matrix in its sparse form: (row, column) coordinates with their counts.
print(trans)

  (0, 6)	1
  (0, 3)	2
  (1, 7)	1
  (1, 5)	1
  (1, 1)	1
  (2, 4)	1
  (2, 0)	1
  (2, 2)	1


In [137]:
# Count vectorizer over unigrams and bigrams with English stop words removed.
model2 = CountVectorizer(ngram_range=(1,2),
                       stop_words='english')

In [138]:
# Fit the n-gram vectorizer and build its count matrix.
bag = model2.fit_transform(text_data)

In [139]:
# Dense view of the n-gram count matrix.
bag.toarray()

array([[0, 0, 2, 1, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
       [1, 0, 0, 0, 1, 1, 0, 0, 0, 0]])

In [140]:
# List the learned unigram/bigram features.
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() (available since 1.0) returns an ndarray, so
# .tolist() keeps the plain-list display.
model2.get_feature_names_out().tolist()

['beats',
 'best',
 'brazil',
 'brazil brazil',
 'germany',
 'germany beats',
 'love',
 'love brazil',
 'sweden',
 'sweden best']

In [144]:
# Same n-gram settings, but with the vocabulary restricted to the single term 'brazil'.
model3 = CountVectorizer(ngram_range=(1,2),
                       stop_words='english',
                        vocabulary=['brazil'])

In [149]:
# Count occurrences of the fixed vocabulary in each document.
final = model3.fit_transform(text_data)
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() (available since 1.0) returns an ndarray, so
# .tolist() keeps the plain-list display.
model3.get_feature_names_out().tolist()

['brazil']

In [150]:
# Dense view of the restricted-vocabulary counts, one row per document.
final.toarray()

array([[2],
       [0],
       [0]])

In [152]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [153]:
# TF-IDF vectorizer with default settings.
model4 = TfidfVectorizer()

In [156]:
# Fit the TF-IDF vectorizer and show the weighted matrix as a dense array.
model4.fit_transform(text_data).toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [158]:
# List the features learned by the TF-IDF vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() (available since 1.0) returns an ndarray, so
# .tolist() keeps the plain-list display.
model4.get_feature_names_out().tolist()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [166]:
# Mapping from each term to its column index in the TF-IDF matrix.
model4.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}