# Different fundamental concepts for analysing strings and texts

In [1]:
# splitting strings on any of multiple delimiters: use of re.split()
import re
sample_string = 'hello money; transformers, etc;  money,   research, hahahaha'
result1 = re.split(r'[;,\s]\s*', sample_string) # separator is either a comma (,), semicolon(;)
# or whitespace followed by any amount of extra whitespace
# whenever the pattern is found, the entire match becomes the delimiter between whatever fields lie 
# on either side of the match 
# print(result1)

result2 = re.split(r'(;|,|\s)\s*', sample_string)
# print(result2)

print(result2[::1]) # start, stop, step. In this case, it is from beginning to end, counting by 1
delimiters = result2[1::1] + ['']
# print(delimiters)

print(result2[::2])
delimiters = result2[1::2] + ['']
# print(delimiters)

print(result2[::3])
delimiters = result2[1::3] + ['']
# print(delimiters)

['hello', ' ', 'money', ';', 'transformers', ',', 'etc', ';', 'money', ',', 'research', ',', 'hahahaha']
['hello', 'money', 'transformers', 'etc', 'money', 'research', 'hahahaha']
['hello', ';', 'etc', ',', 'hahahaha']


In [2]:
# matching text at the start or end of a string
import os
filenames = os.listdir('.')
result1 = [name for name in filenames if name.endswith(('.ipynb','ipynb_checkpoints'))] 
# use of a tuple iterable
print(result1)
print(any(name.endswith(('.ipynb','ipynb_checkpoints')) for name in filenames))
# use of any function and a tuple iterable

sample_file = 'samplefile.txt'
print(sample_file[-4:])
print(sample_file[:4])
print(sample_file[:])

if sample_file[-4:] == '.txt':
    print(str(True))
else: 
    print(str(False))

['.ipynb_checkpoints', 'Bag_of_Words.ipynb', 'Fundamentals on strings and texts (cleaning etc.).ipynb', 'Fundamentals on strings and texts.ipynb', 'Processing_text_data.ipynb', 'Text analysis- master_copy.ipynb', 'Text analysis- student_copy.ipynb', 'word2vec.ipynb']
True
.txt
samp
samplefile.txt
True


In [3]:
# matching and searching for text patterns
import string
import random
import re
letters = string.ascii_letters
value = random.randint(1,10e4)

new_string = ''
for i in range(value):
    new_string += random.choice(letters) 
    new_string += " "
# print(new_string)

a = random.choice(letters)
b = random.choice(letters)
# print(a)
result1 = new_string.find(a) # find() function only finds the FIRST OCCURRENCE
print(result1)

if new_string.startswith(b):
    print(str(True))
elif new_string.endswith(b):
    print(str(True))
else:
    print(str(False))
    
sample1 = '01/01/2019'
sample2 = 'Jan 1, 2019'
test_sample = []
test_sample.append(sample1)
test_sample.append(sample2)
print(test_sample)
clean_test = re.compile(r'\d+/\d+/\d+') #non-capture groups
clean_test2 = re.compile((r'\d+/\d+/\d+')) #capture groups - simplify subsequent processing of the 
#matched text as the contents of eqch group can be extracted individually

for i in range(len(test_sample)):
    if clean_test.match(test_sample[i]):
        print(str(True))
    else:
        print(str(False))
        
for j in range(len(test_sample)):
    if clean_test2.match(sample1 or sample2):
        print(str(True))
    else:
        print(str(False))

result2 = clean_test2.match(sample1)
print(result2.group(0))

80
False
['01/01/2019', 'Jan 1, 2019']
True
False
True
True
01/01/2019


In [4]:
# searching and replacing text
import re
import string 
import random 
letters = string.ascii_letters 
value = random.randint(1,10e4)

test_string = ''
for i in range(value):
    test_string += random.choice(letters)
    test_string += " "

modified_string = test_string.replace('A' or 'a', 'Alvin')
modified_string = modified_string.replace('m' or 'M', '$$$')
print(modified_string)

sample1 = '24/12/2019'
clean_test = re.compile(r'(\d+)/(\d+)/(\d+)')
result1 = clean_test.sub(r'\1-\2-\3', sample1)
print(result1)

J K J $$$ Q I P i j f X R r r q u $$$ U C U f o y O P h O Q v L G x q E W M E S I F U J p b R f $$$ j E Q h o w I J s d w q t g b h r N G c g n B j J b s D k w g R C g e g p n J k z n M W k v Q N z S x P I e K x Z Y r M D w G q F f J J p t E z D j r w n f Q J s Alvin M W O D k S Z h Alvin K E Z u O y B v P v S X x g D K v w N j S Y M I B q g S U N H L B p U E K J M s z C I Alvin M M e P j u r g P b g h a o a w k l V z p E X z B i c L t x n X i O h u t e c p a R e K G J h v u L y X o w e Q g B g C e o G n i O M v Y h $$$ $$$ N X W B g e S b P T Q Z J l T p u G E c w w v H w p n r y K N d w D O C Alvin B B U T V Z w o H H D U W G G v o W I g W i S S r K $$$ v a j E r T Y N U Y g o w C w d W k h h z P k $$$ z Alvin q I f n h s j u D D b Q p q e d G H G a y s H X q G D h V k e J W y T d j y r $$$ c x p q Y F j p N n r S n b J T k f P k O k N M W N u a L L u h F B j n L u v p b $$$ K q L P M x i U P x W p g $$$ a i D Y l M v n P Alvin S y x p b X r U P M g l O p p J a y o k v I V Y M T O S 

In [5]:
# searching and replacing case-insensitive text
import re
sample1 = 'smart python, cunning PyThOn, dumb PYTHON'
result1 =  re.findall('python', sample1, flags = re.IGNORECASE)
print(result1)
result2 = re.sub('python', 'elephant', sample1, flags = re.IGNORECASE)
print(result2)

# creating support function 

def support1 (word):
    def support2 (m):
        text = m.group()
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        elif text[0].isupper():
            return word.capitalize()
        else:
            return word
    return support2

result3 = re.sub('python', support1('elephant'), sample1, flags = re.IGNORECASE)
print(result3)

['python', 'PyThOn', 'PYTHON']
smart elephant, cunning elephant, dumb elephant
smart elephant, cunning Elephant, dumb ELEPHANT


In [6]:
# specifying a regular expression for the shortest match
import re
clean_test = re.compile(r'\"(.*?)\"') # adding ? modifier after the * operator in the pattern foces the matching 
# algorithm to look for the possible shortest match 
test_sample = 'Python "is" very "fun" hahaha'
result1 = clean_test.findall(test_sample)
print(result1)

['is', 'fun']


In [7]:
# stripping unwanted characters from strings using strip()
import re
import random
import string
letters = string.ascii_letters
value = random.randint(1,10e4)

new_string = ''
for i in range(value):
    dummy = random.randint(1,2)
    if dummy == 1:
        new_string += " "
    elif dummy == 2:
        new_string += random.choice(letters)
result1 = new_string.strip() # remove whitespaces from the beginning or end of new_string
# print(result1)
result2 = new_string.lstrip() # remove whitespaces from the left side of new_string
# print(result2)
result3 = new_string.rstrip() # remove whitespaces from the right side of new_string
# print(result3)

result4 = new_string.replace(' ', '')
print(result4)
result5 = ((re.sub('\s+', ' ',new_string)).lstrip()).rstrip()
print(result5)

# Can you observe the differences? :)

zSuPWfpoqxrsgIeyUfBnXEMSdpIslgsFrYGWWlYJdtGdimFgFObhyPwcUnLoioXXYPAkEKUbsFyarBeGitKaLFhfxnyPPsErBwDpuQDxWgswvQAKzpvyMKdFeUcEwzgbPHZIFsWQNdkBCidyqrzrlKrzEkwfKxgAYIXdORyASxImOrBaizpPbbXUtfhnMMhJHqOyDDsYJvuAZXsKCynphrrmyNrJXXnUWOOotMFkVsNYQXmoUkAmwbXWOsACduBjyOCOpvtSeDIbqioVKvYisaARhaYfyunuvXpqWwsIywWnRvwYtSsENabeflKgfcUEfGjqyuYDuiLuGhhZCvkoyjjWGTLWltjSNYvFGsnUyBukhNzsxEPAvViHEovPmQgKxwqjCbwevNxbaClolzBQHFVvMuOEexejyfYGkCXvYnrIUkUARPmEwauOukidmNIHvKxBnRbiokImiViKalronmQECqStmNjAhpsxlatupHsfLJxIbUAKhtQzzmOHFHRWvBuASyCgUheNFLOpJdejHdEIwMNgDXnZFZwSkysZECJxSxmvOtCzWqcvhvMQZgGKOamYuRIzVfOgzuIwrHuBGYmuToaXHywideIyrHTZIHqPxczGdtpgENxkFSglspMCauNjifgHaheixkraaIaXgNEMHVZixLftNaqZVfCCpAlLoHnqtwVdXwrsHGtxmXHmlmZmUggyBGSghWMLveBsyRBOJqhglpuMBlUlAhctgHnHudDYqVRkXDtNQkQLEcvlkpTrcrZgYALfkWyCvdnksCxKOntMgnYULSyZtNMZUGDZwelTazEJtNuuuApWmZyjIZGKUCWGZRwbcFWmRWrOkACWSNvfIoSnvPehnmZLFsqRTPpxqkEyHpXVcIctAYQvJtdMacawOfijhcTxalQCfrlSEJCnsdxSjUQAkjaJqjVvWkPNwALJzzuVQQoBcpRzcaSXzQRWATsjfaEIdvrnRVZpuLNIUarDoGzdVfLZ

In [8]:
# sanitizing and cleaning up text using translate()
import re
import random
import string
letters = string.ascii_letters
value = random.randint(1,10e4)

new_string = ''
for i in range(value):
    dummy = random.randint(1,3)
    if dummy == 1:
        new_string += " "
    elif dummy == 2:
        new_string += random.choice(letters)
    else:
        new_string += str(random.randint(1,value))

remap = {
    ord('a'): 'alvin',
    ord('p'): 'python',
    ord('1'): '$'
}
result1 = new_string.translate(remap)
print(result1)

gvr57289Zqof e$900$YF W 579884$620b X56848 2$353K  2389$H268$2 M 524$346873D546932$405 pythonk H 46370mW4564533899 27585$790038280TU      2$879u A Fq 6536224$8OF vYfdi  pythonVOalvin 30598q  $9396z   44966f$827WK  55985 58290  98334$694x52023900745996L5337o$39203643338278$79354$272l 23870oX $3884 2678926$7   z5693548582905724490L 3$003h7994  rN 48$0229294 978044$40f 2475bBx He VS 99382759223348 45264 2090$35582x  lK   XRuBB 58553M   48575PM32095  I54094 53528  dR K  Z495$042955  b$004924638$3366oC g202$3C$7$79 W Z58$$5 A59675e  4$97326587 5408$ $24505322425724  4323825679$4406qg59673 v48$8tQf$$2666202O8697python Z3076050735x$7776Ll  30895G$$844$4$89 python$908$34240 uM  H2$84553962 sX44$25C3673$Gf 26$$2OkP  U 40$23v583$8382329458W22900mEbR   G qWe38237G$240320523h k39209 $7834  rL$3358ve22659 5070$2935$  u32464 4$02$9668    42624  38982 x 53233K5657Z6432 r T 40995R54$50 489$8B 54835 $$9264$7$3 2$$04563$447$03  python 45774 40980Wd3544$  u43530 36882  63$327400lk3824$f2555547067G g  708

In [9]:
# aligning text strings using ljust(), rjust(), center() etc.
new_string = 'alvin enjoys doing python codes'
result1 = new_string.center(100)
print(result1)
result2 = new_string.ljust(100)
print(result2)
result3 = new_string.rjust(100)
print(result3)

result4 = new_string.center(100,"$")
print(result4)
result5 = new_string.ljust(100,"$")
print(result5)
result6 = new_string.rjust(100,"$")
print(result6)

result7 = format(new_string, '>100')
print(result7)
result8 = format(new_string, '<100')
print(result8)
result9 = format(new_string, '^100')
print(result9)

result10 = format(new_string, '$>100')
print(result10)
result11 = format(new_string, '%<100')
print(result11)
result12 = format(new_string, '!^100')
print(result12)

result13 = '{:<50s} {:>50s}'.format('alvin','python')
print(result13)
dummy = 888.88888
result14 = format(dummy, '>10')
print(result14)
result15 = format(dummy, '>50.4f')
print(result15)


                                  alvin enjoys doing python codes                                   
alvin enjoys doing python codes                                                                     
                                                                     alvin enjoys doing python codes
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$alvin enjoys doing python codes$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
alvin enjoys doing python codes$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$alvin enjoys doing python codes
                                                                     alvin enjoys doing python codes
alvin enjoys doing python codes                                                                     
                                  alvin enjoys doing python codes                                   
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$alvin enjoys doing pyt

In [10]:
# combining and concatenating strings using join()
sample_list = ['alvin', 'loves', 'python']
result1 = ' '.join(sample_list)
print(result1)
result2 = ''.join(sample_list)
print(result2)
result3 = ','.join(sample_list)
print(result3)
result4 = ','.join(str(item) for item in sample_list)
print(result4)

sample_text1 = 'alvin'
sample_text2 = 'love python'
result5 = sample_text1 + ' ' + sample_text2
print(result5)
print(sample_text1, sample_text2, sep=' ')

alvin loves python
alvinlovespython
alvin,loves,python
alvin,loves,python
alvin love python
alvin love python


In [11]:
# interpolating variables in strings
import sys
new_string = '{name1} enjoys doing {name2} for {n} days'

class info:
    def __init__(self,name1,name2,n):
        self.name1 = name1
        self.name2 = name2
        self.n = n
        
class safesub(dict):
    def __missing__(self,key):
        return '{' + key + '}'

result1  = info('Alvin','python', 21)
result2 = new_string.format_map(vars(result1))
print(result2)


def sub(text):
    return text.format_map(safesub(sys._getframe(1).f_locals))
name1 = 'Alvin'
# name2 = 'C++'
# n = 21
print(sub('{name1} enjoys doing {name2} for {n} days'))

Alvin enjoys doing python for 21 days
Alvin enjoys doing {name2} for {n} days


In [12]:
# reformatting text to a fixed number of columns using textwrap
import random
import re 
import string
import textwrap
letters = string.ascii_letters
value = random.randint(1,10e4)

new_string = ''
for i in range(value):
    new_string += random.choice(letters)

column_size = 50
print(textwrap.fill(new_string,column_size))
print(textwrap.fill(new_string,column_size, initial_indent = ' '))
print(textwrap.fill(new_string,column_size, subsequent_indent = ' '))

COaHyiBjVwPOYeXxBEGHIkawNrsEkwjgNEftAWlFYKHytMINqD
TkiLhkaDaGgdmDZqGaXbJvgzevDNQmuxVhfGZcJKvnylImlKoz
GwTZVuKhnthZshzkrOnbXwiMUZkQhPCEAMYjKqVuWQMUQlRPia
sAKGSxAOWyqRpDYtkmEDcjOqCBuObEfUMixPEmrvhZGNkNxUYv
jvURppqSvLmbxZACqrsduguiayeWTzRHMZdDBGlAKKcDQgzDwg
dnImKAcGdHTcYXSJKkIYFDOxywHPIUjKVAmgyNPBOLEMmoerRC
dcGPlIabopdKTSrqpkvFNFDnZRXcbxrGkTPMhGWhriPGSqzQJB
zepjxWeDEVVIvozYTnMBcTgESyFXluNqGdzUoWjyUAoutTxxkU
XLHomEKUnQRfFqMYyhzEEyzplZmyOYtoIomnrBtFGZiIkcRQmq
TysRWfjWkQifTqfrSYmsReNQiUSEIQoUueWxORaiszvKfJEFOB
xUPaocnatuAXGmHytzsViHFoBmQcwyunQLRWtYeVRKWfnCbwzD
cuefKWAyZUcyYxjLRaCPzyNCZvqzIahaZEqfYvwnChOfZGDSWC
yegKyaUOkwKmYafWUkeLjDrmpbxmNjBZCAWZFMCrhPMimIQNIN
nHQLdYdixwyzzbPOjAIeruadXRmyZCXrpLCVbGTvkLiaTtOADc
PrYkjRYOhShjpkWORsEHNFRqqzrvPDxtiYkCzFWWTojejbhJsv
TDlKAUAGFfEpZtwfiqiNikeQrGFMtdtkpKIjbrUmyIMlWsCiuG
NEvBjxUkFmoYuVontqGASYugZNEnQAVXGGWlSwgijEauwyCbEc
OyBJooOIcxaiTGoNFCLXphILAdWzfRFevtsLuacSADrrNVPpkg
RCMvvundsJVHHPxrhFlbiVRlMOuPJmIBDWdduLtqOPFYeFbfkm
IlqdJbCVIbviyrudaysgbxrliAUSXRP

In [13]:
# tokenizing text (parse text from left to right into stream of tokens)
import re
from collections import namedtuple

sample_text = 'Money = infinity_$ + happiness2 + python ** 10'
OBJECT = r'(?P<OBJECT>[a-zA-Z_][a-zA-Z_0-9]*)'
NUMBER = r'(?P<NUMBER>\d+)'
SYMBOLS1 = r'(?P<SYMBOLS>\_,)'
SYMBOLS2 = r'(?P<SYMBOLS2>\+,)'
SYMBOLS3 = r'(?P<SYMBOLS3>\*,)'
SYMBOLS4 = r'(?P<SYMBOLS4>\**,)'

token_text = re.compile('|'.join([OBJECT,NUMBER,SYMBOLS1,SYMBOLS2,SYMBOLS3,SYMBOLS4]))
token = namedtuple('token',['type','value'])

def generate_token(pat, sample_text):
    scanner = pat.scanner(sample_text)
    for m in iter(scanner.match,None):
        yield token(m.lastgroup,m.group())
    
for tok in generate_token(token_text, 'happiness2'):
    print(tok)

token(type='OBJECT', value='happiness2')


In [14]:
# performing text operations on Byte strings (support most of the same built-in operations for text strings)
import random
import re 
import string
import textwrap
letters = string.ascii_letters
value = random.randint(1,10e4)

new_string = ''
for i in range(value):
    new_string += random.choice(letters)
    new_string += ' '
    new_string += random.choice('!@#$%^&*()_')
byte_string = str.encode(new_string,'utf-8')
# print(byte_string)
print(byte_string[0]) # note that byte strings produce integers when calling their index. unlike text strings
nicer_byte_string = byte_string.decode('ascii') # remove the byte symbol
# print(nicer_byte_string) 

result1 = byte_string.replace(b'a', b'$$$')
# print(result1)
result2 = byte_string.split()
# print(result2)
result3 = re.split(b'[!@%^]',byte_string) # removing !@&* 
# print(result3)

104


In [15]:
# Removing punctuations (period, question mark, exclamation point, comma, semicolon, colon, dash, hyphen, 
# parentheses, brackets, braces, apostrophe, quotation marks, and ellipsis)
import unicodedata
import sys

sample_text = ['!!!hello_money_.....','alvin_money ok !!!']
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

result = [item.translate(punctuation) for item in sample_text]
print(result)

['hellomoney', 'alvinmoney ok ']


In [16]:
# Tokenizing text -> break up into individual words
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize # nltk stands for Natural Language Toolkit for Python
import random
import re 
import string
import textwrap
letters = string.ascii_letters
value = random.randint(1,10e4)

# tokenize into individual words
new_string = ''
for i in range(value):
    dummy = random.randint(0,1)
    if dummy == 0:
        new_string += random.choice(letters) 
    else:
        new_string += ' '
modified_string = word_tokenize(new_string)
print(modified_string)

# tokenize into individual sentences
from nltk.tokenize import sent_tokenize
new_string = ''
for i in range(value):
    dummy = random.randint(0,2)
    if dummy == 0:
        new_string += random.choice(letters) 
    elif dummy == 1:
        new_string += ' '
    else:
        new_string += '.'
modified_string2 = sent_tokenize(new_string)
print(modified_string2)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alvin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['Ih', 'P', 'y', 'Cx', 'PFPa', 'w', 'X', 'yL', 'N', 'oS', 'bjJf', 'j', 'nUhn', 'uEtBr', 'W', 'l', 'kdaI', 'l', 'Bc', 'k', 'K', 'uksGJENiFIuSz', 'c', 'hc', 'x', 'i', 'X', 'pYmRjk', 'hs', 'Q', 'KW', 'O', 'B', 'I', 'W', 'm', 'j', 'D', 'TPatQRs', 'F', 'f', 'CMs', 'i', 'H', 'uFwWFHsUY', 'Ir', 'tK', 'sE', 'CIH', 'sZkJao', 'j', 'V', 'Wxxe', 'dKL', 'Nfd', 'wqt', 'g', 'fDh', 'Slu', 'MPjIi', 'WK', 'BOj', 'i', 'qSdW', 'Mc', 'hD', 'c', 'yg', 'u', 'IM', 'M', 'aGP', 'OW', 'q', 'ak', 'v', 'YI', 'jAmT', 'g', 'ZVz', 'E', 'x', 'oBp', 'blw', 'DG', 'Sc', 'NIWEB', 'j', 'U', 'Hns', 'TJ', 'z', 'SOGAL', 'I', 'VX', 'Qt', 'dkP', 'b', 'kUpM', 'UO', 'tnZ', 'JnsC', 'W', 'U', 'z', 'o', 'o', 'LX', 'g', 'P', 'Qr', 'EM', 'U', 'VF', 'Y', 'ZT', 'C', 'fz', 'JX', 'G', 'j', 'X', 'A', 'Vb', 'Z', 'nqc', 'i', 'U', 'r', 'vSr', 's', 'q', 'Sn', 'dqR', 'NhN', 'XU', 'ag', 'e', 'S',

['L      .', 'T WQ.', '.rgQ  g ...  .', 'Wb.', '.j  qTp.FI i.  .', '.n   nxxt  Fv.....Dv y c.K  a ..RJC.. t .. pg .a..Z.f.K .P ..  S. .', 'h O .a.', '.', '.', 'o F.. z.Q .', '...  .', '..vO s vX..  gt .', 'iu.. .. E...Rj A.Gy .J.', 'TZ  ..  .', 'r.i .', '..j.WD..   h. .R..e .e KNk.. .', 'olMf .w.', '.. A.qN.. Y  IF...k ...R j  .', '... .. a..Q.', '.iRv..QE.', '.', '.', 'N ..N. .', 'BK...A .g..C..M ..   ...syk i R.J.  u.hr .', '.', '.RY..e  ..D .', 'S .. .', 'l.. IEA  R..r..vkp.E.j..II .r  zcft.', 'D F    ...PE.W.GPk.', '.. Oh  .p....z X PZ.', '.', '..b. MMJ .O..x. vGSr    ..E  R  .', 'y.x  .y ..Hl .', 'Bh.', '.zzc.', '.', '.....Za ..   .P mX.cpZ.', '.', '.Q..... .', '.', '.j.fKWs..l.  W  .a  h S .', '.Q .ZlT .', '.W    .', 'F.Y.L lkpZ   .sES .', 'oK .m  h j.', '.... F. ..R..I.PT  zKS..  .zLi.', '.m  .. ..dS .. .', '.', '.', '.C.', '.', 'p  jw B.LV Sv.Hk .', 'Q    J ...  .', 'lS     M...h..f... O..y.. hk.', 'dFuu.U..S hZCIc   wDqIj.', '.. mp .', '.', 'K l .', 'E Qv..xc  .. .... h  .', '

In [17]:
# Removing stop words -> remove extremely common words ('a','is' etc.) which contain little informational value
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('English')
#print(stop_words)

sample_words = ['alvin', 'is', 'very', 'happy']
result = [word for word in sample_words if word not in stop_words]
print(result)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alvin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['alvin', 'happy']


In [18]:
# Stemming words -> converting tokenized words into their root forms, i.e. removing their affixes while retaining their meaning
from nltk.stem.porter import PorterStemmer
tokenzied_words = ['i','are','singing','traditional','dancing']
porter = PorterStemmer()
result = [porter.stem(word) for word in tokenzied_words]
print(result)

['i', 'are', 'sing', 'tradit', 'danc']


In [19]:
# Encoding text as a bag of words -> create a set of features from text data which indicate the number of times an
# observation's text contains a particular word
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

sample_text = np.array(['Alvin likes python coding',
                        'Research is fun and exciting',
                        'Asia rocks',
                        'Games rocks'])

count = CountVectorizer()
bag_of_words = count.fit_transform(sample_text)
print(bag_of_words.toarray())
print(count.get_feature_names())

# By default, every feature is a word. We can instead set every feature to be a combination of two words (2-gram) 
# or even three words (3-gram)

count_2gram = CountVectorizer(ngram_range=(1,2),
                             stop_words = "english",
                             vocabulary = ["python","fun and","rocks"])
result = count_2gram.fit_transform(sample_text)
print(result.toarray())
print(count_2gram.vocabulary_)

[[1 0 0 1 0 0 0 0 1 1 0 0]
 [0 1 0 0 1 1 0 1 0 0 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 0 0 0 1]]
['alvin', 'and', 'asia', 'coding', 'exciting', 'fun', 'games', 'is', 'likes', 'python', 'research', 'rocks']
[[1 0 0]
 [0 0 0]
 [0 0 1]
 [0 0 1]]
{'python': 0, 'fun and': 1, 'rocks': 2}


In [20]:
# Weighting word importance -> to create a bag of words which words are weighted by their importance to an observation
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer

sample_text = np.array(['Alvin likes python coding',
                        'Research is fun and exciting',
                        'Asia rocks',
                        'Games rocks'])

# term frequencey (tf) -> a frequently appearing word in a particular document which indicates its general importance
# document frequency (df) -> a frequently appearing word in many documents which indicates its general unimportance
# tf-idf (t,d) = tf(t,d) x idf(t); t is a word and d is a document
# scikit-learn normalizes the tf-idf vectors using the Euclidean norm (L2 norm)

# tf = number of times a word appears in the document
# idf = log [(1 + n_d)/(1 + df(d,t))] +1 ; n_d is the number of documents and df(d,t) is the term's document frequency 
# (number of documents where the term appears)

tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(sample_text)
print(feature_matrix.toarray())
print(tfidf.vocabulary_)

[[0.5        0.         0.         0.5        0.         0.
  0.         0.         0.5        0.5        0.         0.        ]
 [0.         0.4472136  0.         0.         0.4472136  0.4472136
  0.         0.4472136  0.         0.         0.4472136  0.        ]
 [0.         0.         0.78528828 0.         0.         0.
  0.         0.         0.         0.         0.         0.6191303 ]
 [0.         0.         0.         0.         0.         0.
  0.78528828 0.         0.         0.         0.         0.6191303 ]]
{'alvin': 0, 'likes': 8, 'python': 9, 'coding': 3, 'research': 10, 'is': 7, 'fun': 5, 'and': 1, 'exciting': 4, 'asia': 2, 'rocks': 11, 'games': 6}
