# Handling Texture  

In [1]:
import numpy as np
import pandas as pd

## Cleaning Text
clean the blank space, replace meaningless char, split and generate useful information

In [2]:
text_data = [" Interrobang. By Aishwarya Henriette ",
             "Parking And Going. By Karl Gautier",
             " Today Is The night. By Jarek Prakash "]
strip_whitespace = [string.strip() for string in text_data]
strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [3]:
remove_periods = [string.replace(".", "") for string in strip_whitespace]
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

### Create and Apply a custom transformation function

In [4]:
def capitalizer(string: str) -> str:
    return string.upper()


[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

### Use regular expression to make powerful string operation

In [5]:
import re


def replace_letters_with_X(string: str) -> str:
    return re.sub(r"[a-zA-Z]", "X", string)


[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

for more information about regular expression, see [Beginner Tutorial for Regular Expression in Python](https://docs.python.org/zh-cn/3/library/re.html)

## Parsing and Cleaning HTML

In [6]:
from bs4 import BeautifulSoup

html = """
<div class='full_name'><span style='font-weight:bold'>
Masego</span> Azra</div>"
"""
soup = BeautifulSoup(html, 'lxml')
soup.find("div", {"class": "full_name"}).text

'\nMasego Azra'

## Removing Punctuation

In [7]:
import unicodedata
import sys


text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']

punctuation = dict.fromkeys(i for i in range(
    sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

while remove punctuation, Be careful that it is often a necessary evil to create features while it should be careful to remove punctuation because punctuation can sometime contain some meaning("Right?" versus "Right!")

## Tokenizing Text

In [8]:
from nltk.tokenize import word_tokenize
string = "The science of today is the technology of tomorrow"
word_tokenize(string)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\AdamSmith/nltk_data'
    - 'D:\\software\\anaconda3\\nltk_data'
    - 'D:\\software\\anaconda3\\share\\nltk_data'
    - 'D:\\software\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\AdamSmith\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


## Removing Stop Words

In [9]:
from nltk.corpus import stopwords
tokenized_words = ['i',
                   'am',
                   'going',
                   'to',
                   'go',
                   'to',
                   'the',
                   'store',
                   'and',
                   'park']
stop_words = stopwords.words('english')
[word for word in tokenized_words if word not in stop_words]

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\AdamSmith/nltk_data'
    - 'D:\\software\\anaconda3\\nltk_data'
    - 'D:\\software\\anaconda3\\share\\nltk_data'
    - 'D:\\software\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\AdamSmith\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


## Stemming Words

In [10]:
# Load library
from nltk.stem.porter import PorterStemmer
# Create word tokens
tokenized_words = ['i', 'am', 'humbled', 'by', 'this',
                   'traditional', 'meeting']
# Create stemmer
porter = PorterStemmer()
# Apply stemmer
[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

## Tagging Parts of Speech

In [11]:
from nltk import pos_tag
from nltk import word_tokenize
# Create text
text_data = "Chris loved outdoor running"
# Use pre-trained part of speech tagger
text_tagged = pos_tag(word_tokenize(text_data))
# Show parts of speech
text_tagged

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\AdamSmith/nltk_data'
    - 'D:\\software\\anaconda3\\nltk_data'
    - 'D:\\software\\anaconda3\\share\\nltk_data'
    - 'D:\\software\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\AdamSmith\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


the meaning of tag:  

Tag|Part of Speech
:-:|:-:
NNP | Proper noun, singular
NN| Noun, singular or mass
RB | Adverb
VBD | Verb, past tense
VBG | Verb, gerund or present participle
JJ | Adjective
PRP | Personal pronoun

------

In [15]:
from sklearn.preprocessing import MultiLabelBinarizer


tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field",
          "San Francisco is an awesome city"]
# Create list
tagged_tweets = []
# Tag each word and each tweet
for tweet in tweets:
    tweet_tag = nltk.pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])
# Use one-hot encoding to convert the tags into features
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

NameError: name 'nltk' is not defined

In [16]:
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger
# Get some text from the Brown Corpus, broken into sentences
sentences = brown.tagged_sents(categories='news')
# Split into 4000 sentences for training and 623 for testing
train = sentences[:4000]
test = sentences[4000:]
# Create backoff tagger
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)
# Show accuracy
trigram.evaluate(test)

LookupError: 
**********************************************************************
  Resource [93mbrown[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('brown')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/brown[0m

  Searched in:
    - 'C:\\Users\\AdamSmith/nltk_data'
    - 'D:\\software\\anaconda3\\nltk_data'
    - 'D:\\software\\anaconda3\\share\\nltk_data'
    - 'D:\\software\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\AdamSmith\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


## Encoding Text as a Bag of Words

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
bag_of_words, bag_of_words.toarray()

(<3x8 sparse matrix of type '<class 'numpy.int64'>'
 	with 8 stored elements in Compressed Sparse Row format>,
 array([[0, 0, 0, 2, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64))

In [19]:
count.get_feature_names()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [21]:
count_2gram = CountVectorizer(ngram_range=(1, 2),
                              stop_words="english",
                              vocabulary=['brazil'])
bag = count_2gram.fit_transform(text_data)
# View feature matrix
bag.toarray(),count_2gram.vocabulary_

(array([[2],
        [0],
        [0]], dtype=int64),
 {'brazil': 0})

## 6.9 Weighting Word Importance

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])
# Create the tf-idf feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# Show tf-idf feature matrix
feature_matrix, feature_matrix.toarray()

(<3x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 8 stored elements in Compressed Sparse Row format>,
 array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
         0.        , 0.4472136 , 0.        ],
        [0.        , 0.57735027, 0.        , 0.        , 0.        ,
         0.57735027, 0.        , 0.57735027],
        [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
         0.        , 0.        , 0.        ]]))

In [23]:
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}