In [1]:
import re

In [2]:
# Sample menu text used by the regex examples in the following cells.
# The text contains no backslashes, so the r-prefix does not change its value.
# Note that item 6 is wrapped: its price sits on a line of its own.
corpus = r"""Breakfast options:
1. Egg and baaacon - £10.0 ($12.83);
2. Egg, sausage and baaaaacon - £11.0 ($12.83);
3. Egg and beans - £12.0 ($15.40);
4. Egg, baaacon and beans - £13.0 ($16.68);
5. Beans, sausage, beans - £14.0 ($17.97);
6. Beans, baaaaacon, beans, tomato and beans -
£15.0 ($19.25); """


In [3]:
# Print the menu with every lowercase "beans" swapped for "spam".
# The substitution is case-sensitive, so capitalised "Beans" is left as is.
for menu_line in corpus.splitlines():
    print(re.sub('beans', 'spam', menu_line.rstrip()))

Breakfast options:
1. Egg and baaacon - £10.0 ($12.83);
2. Egg, sausage and baaaaacon - £11.0 ($12.83);
3. Egg and spam - £12.0 ($15.40);
4. Egg, baaacon and spam - £13.0 ($16.68);
5. Beans, sausage, spam - £14.0 ($17.97);
6. Beans, baaaaacon, spam, tomato and spam -
£15.0 ($19.25);


In [4]:
# Print only the menu lines that start with a digit (the numbered items).
# The pattern is a raw string: in a plain literal '\d' is an invalid escape
# sequence, which newer Python versions warn about.
numlines = re.compile(r'^\d')
for line in corpus.splitlines():
    if numlines.search(line):
        print(line)

1. Egg and baaacon - £10.0 ($12.83);
2. Egg, sausage and baaaaacon - £11.0 ($12.83);
3. Egg and beans - £12.0 ($15.40);
4. Egg, baaacon and beans - £13.0 ($16.68);
5. Beans, sausage, beans - £14.0 ($17.97);
6. Beans, baaaaacon, beans, tomato and beans -


In [5]:
# Matches a closing parenthesis at the very end of the searched text.
# Raw string so the backslash reaches the regex engine untouched ('\)' in a
# plain literal is an invalid escape sequence).
endparaen = re.compile(r'\)$')

In [7]:
# Display the compiled pattern object.
endparaen

re.compile(r'\)$', re.UNICODE)

In [8]:
# Words that start with "B" or "b": the initial letter followed by one or more
# word characters. Raw string avoids the invalid '\w' escape of a plain literal.
bword = re.compile(r"[Bb]\w+")

In [10]:
# Collect every b-word in the menu, in order of appearance.
re.findall(bword, corpus)

['Breakfast',
 'baaacon',
 'baaaaacon',
 'beans',
 'baaacon',
 'beans',
 'Beans',
 'beans',
 'Beans',
 'baaaaacon',
 'beans',
 'beans']

In [12]:
# Misspelled "bacon" variants: "b", one or more "a", then more word characters.
# Raw string so '\w' is passed to the regex engine without an invalid-escape warning.
re.findall(r'ba+\w+', corpus)

['baaacon', 'baaaaacon', 'baaacon', 'baaaaacon']

In [13]:
# Stricter variant: "b", between three and five "a"s, then the literal "con".
re.compile('ba{3,5}con').findall(corpus)

['baaacon', 'baaaaacon', 'baaacon', 'baaaaacon']

In [14]:
# Capture the amount that follows each pound sign.
# Raw string avoids the invalid '\d' and '\.' escapes of a plain literal.
# '\.\d+' accepts any decimal part; a hard-coded '\.0' would cut "£10.05"
# down to "10.0" and miss "£10.50" entirely. Same shape as the dollar pattern.
re.findall(r'£(\d+\.\d+)', corpus)

['10.0', '11.0', '12.0', '13.0', '14.0', '15.0']

In [18]:
# Capture the dollar amounts written as "($12.83)": literal parentheses and
# dollar sign around a captured decimal number.
# Raw string keeps the backslashes intact and avoids invalid-escape warnings.
re.findall(r'\(\$(\d+\.\d+)\)', corpus)

['12.83', '12.83', '15.40', '16.68', '17.97', '19.25']

In [None]:
# Unicode: text (str) versus raw bytes, and converting between them with encode/decode

In [19]:
# A plain string literal is a str.
type("hello world")

str

In [21]:
# A b-prefixed literal is a bytes object.
type(b"hello!")

bytes

In [22]:
# A bytes literal may only contain ASCII characters, so b"ありがとう" is rejected
# before any code runs. Compile the expression from a string instead, so the
# SyntaxError can be caught and shown without aborting the cell or a "Run All".
bytes_literal_error = None
try:
    compile('type(b"ありがとう")', '<bytes-literal-demo>', 'eval')
except SyntaxError as exc:
    bytes_literal_error = f"SyntaxError: {exc.msg}"
    print(bytes_literal_error)

SyntaxError: bytes can only contain ASCII literal characters (3055162717.py, line 1)

In [23]:
# Non-ASCII text lives in a str; turn it into bytes by naming an encoding.
arigato = "ありがとう"
bytes(arigato, encoding='utf-8')

b'\xe3\x81\x82\xe3\x82\x8a\xe3\x81\x8c\xe3\x81\xa8\xe3\x81\x86'

In [24]:
# The same text in UTF-32: a different encoding gives a different byte sequence.
bytes(arigato, encoding='utf-32')

b'\xff\xfe\x00\x00B0\x00\x00\x8a0\x00\x00L0\x00\x00h0\x00\x00F0\x00\x00'

In [25]:
# str.encode() with no argument uses UTF-8, so this gives the same bytes as
# bytes(arigato, 'utf-8').
arigato.encode()

b'\xe3\x81\x82\xe3\x82\x8a\xe3\x81\x8c\xe3\x81\xa8\xe3\x81\x86'

In [28]:
# UTF-8 bytes of "ありがとう", three bytes per character.
japon = (
    b"\xe3\x81\x82"  # あ
    b"\xe3\x82\x8a"  # り
    b"\xe3\x81\x8c"  # が
    b"\xe3\x81\xa8"  # と
    b"\xe3\x81\x86"  # う
)

In [29]:
# Decode the bytes back into text (decode() with no argument uses UTF-8).
japon.decode()

'ありがとう'

In [30]:
# Tokenisation: splitting text into words and sentences

In [71]:
# Sample text for the tokenisation examples below.
# Note: there is no space after the full stops ("person.I", "purse.I").
sentence="I am a really good person.I hav a pink purse.I am happy to inform u about my grade"

In [33]:
# Split on whitespace only; words joined by a full stop stay fused.
sentence.split()

['I',
 'am',
 'a',
 'really',
 'good',
 'person.I',
 'hav',
 'a',
 'pink',
 'purse.I',
 'am',
 'happy',
 'to',
 'inform',
 'u',
 'about',
 'my',
 'grade']

In [34]:
# Split on full stops to get one string per sentence.
sentence.split('.')

['I am a really good person',
 'I hav a pink purse',
 'I am happy to inform u about my grade']

In [35]:
import re

In [36]:
# Split on any single comma, exclamation mark, full stop, hyphen or whitespace
# character. Raw string so '\.', '\-' and '\s' are not treated as (invalid)
# string escapes before the regex engine sees them.
re.split(r'[,!\.\-\s]', sentence)

['I',
 'am',
 'a',
 'really',
 'good',
 'person',
 'I',
 'hav',
 'a',
 'pink',
 'purse',
 'I',
 'am',
 'happy',
 'to',
 'inform',
 'u',
 'about',
 'my',
 'grade']

In [37]:
import nltk

In [40]:
# Fetch the 'punkt' tokenizer data through NLTK's downloader.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91821\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [42]:
# Tokenise the sample text into words with NLTK.
nltk.word_tokenize(sentence)

['I',
 'am',
 'a',
 'really',
 'good',
 'person.I',
 'hav',
 'a',
 'pink',
 'purse.I',
 'am',
 'happy',
 'to',
 'inform',
 'u',
 'about',
 'my',
 'grade']

In [44]:
# Split the sample text into sentences with NLTK.
# NOTE(review): the recorded output is a single item, presumably because the
# full stops in `sentence` are not followed by whitespace — confirm.
nltk.sent_tokenize(sentence)

['I am a really good person.I hav a pink purse.I am happy to inform u about my grade']

In [46]:
# Lowercase the text before tokenising; `words` feeds the stemming and
# lemmatisation cells below.
words=nltk.word_tokenize(sentence.lower())

In [47]:
# Show the lowercased token list.
words

['i',
 'am',
 'a',
 'really',
 'good',
 'person.i',
 'hav',
 'a',
 'pink',
 'purse.i',
 'am',
 'happy',
 'to',
 'inform',
 'u',
 'about',
 'my',
 'grade']

In [48]:
# Comma-separated sample words.
# NOTE(review): `w` is not referenced by any later cell visible here — confirm
# whether it was meant to be stemmed instead of `words`.
w="working,making,made,soap"

In [49]:
# Porter stemmer instance used by the stemming cell below.
porter=nltk.PorterStemmer()

In [60]:
# Stem every token with the Porter stemmer.
list(map(porter.stem, words))

['i',
 'am',
 'a',
 'realli',
 'good',
 'person.i',
 'hav',
 'a',
 'pink',
 'purse.i',
 'am',
 'happi',
 'to',
 'inform',
 'u',
 'about',
 'my',
 'grade']

In [61]:
# The WordNet lemmatiser needs the 'wordnet' corpus. Request it in this cell so
# lemmatisation also works when the notebook is run top to bottom on a fresh
# environment; quiet=True keeps the downloader's log out of the cell output.
nltk.download('wordnet', quiet=True)
lemmatiser = nltk.WordNetLemmatizer()
[lemmatiser.lemmatize(word) for word in words]

['i',
 'am',
 'a',
 'really',
 'good',
 'person.i',
 'hav',
 'a',
 'pink',
 'purse.i',
 'am',
 'happy',
 'to',
 'inform',
 'u',
 'about',
 'my',
 'grade']

In [57]:
# Fetch the 'wordnet' corpus through NLTK's downloader.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91821\AppData\Roaming\nltk_data...


True

In [62]:
# Sample tweet-style text mixing a mention, repeated punctuation, an emoticon,
# a hashtag, dots and a URL.
tweet="@oroio This is new parrpot!!!:(#sad .... https://jorel.com)"

In [63]:
# The general-purpose word tokenizer applied to the tweet, for comparison with
# the tweet-aware approaches below.
nltk.word_tokenize(tweet)

['@',
 'oroio',
 'This',
 'is',
 'new',
 'parrpot',
 '!',
 '!',
 '!',
 ':',
 '(',
 '#',
 'sad',
 '....',
 'https',
 ':',
 '//jorel.com',
 ')']

In [65]:
# Verbose-mode tokenising regex; the alternatives are tried in this order:
#   1. numbers with an optional leading $ or £, an optional '.', ':' or '%'
#      separator, further digits and an optional trailing '%'
#   2. runs of "capital letter + full stop"
#   3. URL-like text: optional http:// or https://, a word and a dot, then
#      more word characters, optionally followed by word characters or slashes
#   4. words with an optional leading @ or #, allowing internal '-' or '’'
#   5. a literal "..."
#   6. runs of '!' and '?'
#   7. the two-character sequences ":(" and ":)"
pattern = r'''(?x) # A verbose regex
[\$£]?\d+[\.:%]?\d*%?
|(?:[A-Z]\.)+
|(?:https?://)?(?:\w+\.)(?:\w{2,})+(?:[\w/]+)?
|[@\#]?\w+(?:[-’]\w+)*
|\.\.\.
|[!?]+
|:[()]'''


In [66]:
# Tokenise the tweet with the custom pattern.
nltk.regexp_tokenize(tweet,pattern)

['@oroio',
 'This',
 'is',
 'new',
 'parrpot',
 '!!!',
 ':(',
 '#sad',
 '...',
 'https://jorel.com']

In [67]:
# NLTK also ships a ready-made tokenizer for tweets.
from nltk.tokenize.casual import TweetTokenizer

tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(tweet)

['@oroio',
 'This',
 'is',
 'new',
 'parrpot',
 '!',
 '!',
 '!',
 ':(',
 '#sad',
 '...',
 'https://jorel.com',
 ')']

In [74]:
# Sentence-split the sample text, then word-tokenise the first sentence;
# `s1` is the input to the part-of-speech tagging cell.
s=nltk.sent_tokenize(sentence)
s1=nltk.word_tokenize(s[0])

In [77]:
# pos_tag needs the 'averaged_perceptron_tagger' model. Request it in this cell
# so tagging also works when the notebook is run top to bottom on a fresh
# environment; quiet=True keeps the downloader's log out of the cell output.
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.pos_tag(s1)

[('I', 'PRP'),
 ('am', 'VBP'),
 ('a', 'DT'),
 ('really', 'RB'),
 ('good', 'JJ'),
 ('person.I', 'NN'),
 ('hav', 'VBZ'),
 ('a', 'DT'),
 ('pink', 'NN'),
 ('purse.I', 'NN'),
 ('am', 'VBP'),
 ('happy', 'JJ'),
 ('to', 'TO'),
 ('inform', 'VB'),
 ('u', 'JJ'),
 ('about', 'IN'),
 ('my', 'PRP$'),
 ('grade', 'NN')]

In [76]:
# Fetch the 'averaged_perceptron_tagger' model through NLTK's downloader.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\91821\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True