In [28]:
sentence="Thomas Jefferson began building Monticello at the age of 26"
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [29]:
import numpy as np
token_sequence=str.split(sentence)
vocab=sorted(set(token_sequence))
', '.join(vocab)

'26, Jefferson, Monticello, Thomas, age, at, began, building, of, the'

In [30]:
num_tokens=len(token_sequence)
vocab_size=len(vocab)
onehot_vectors=np.zeros((num_tokens,vocab_size),int)
for i,word in enumerate(token_sequence):
    onehot_vectors[i,vocab.index(word)]=1
", ".join(vocab)


'26, Jefferson, Monticello, Thomas, age, at, began, building, of, the'

In [31]:
onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [32]:
import pandas as pd
pd.DataFrame(onehot_vectors,columns=vocab)

Unnamed: 0,26,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [33]:
sentence_bow={}
for token in sentence.split():
    sentence_bow[token]=1
sorted(sentence_bow.items())

[('26', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [34]:
df=pd.DataFrame(pd.Series(dict([(token,1) for token in sentence.split()])),columns=['sent']).T
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26
sent,1,1,1,1,1,1,1,1,1,1


## 标点符号的处理

In [35]:
import re
tokens=re.split(r'[-\s.,;!?]+',sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [36]:
pattern=re.compile(r"([-\s,.!?])+")
tokens=pattern.split(sentence)

[x for x in tokens if x and x not in '- \t\n,.;!?']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [37]:
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [38]:
from nltk.tokenize import TreebankWordTokenizer
sentence="Monticello wasn't designated as UNESCO World Heritage Site until 1987."
tokenizer=TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987',
 '.']

In [39]:
tokens=[x for x in tokens if x and x not in '- \t\n,.;!?']

In [40]:
from nltk.util import ngrams
list(ngrams(tokens,2))

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [41]:
two_grams=list(ngrams(tokens,2))
[" ".join(x) for x in two_grams]

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [42]:
import nltk
nltk.download('stopwords')
stop_words=nltk.corpus.stopwords.words('english')
len(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [43]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
len(sklearn_stop_words)

318

## 情感 VADER一个基于规则的情感分析器


In [44]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa=SentimentIntensityAnalyzer()
sa.lexicon

{'$:': -1.5,
 '%)': -0.4,
 '%-)': -1.5,
 '&-:': -0.4,
 '&:': -0.7,
 "( '}{' )": 1.6,
 '(%': -0.9,
 "('-:": 2.2,
 "(':": 2.3,
 '((-:': 2.1,
 '(*': 1.1,
 '(-%': -0.7,
 '(-*': 1.3,
 '(-:': 1.6,
 '(-:0': 2.8,
 '(-:<': -0.4,
 '(-:o': 1.5,
 '(-:O': 1.5,
 '(-:{': -0.1,
 '(-:|>*': 1.9,
 '(-;': 1.3,
 '(-;|': 2.1,
 '(8': 2.6,
 '(:': 2.2,
 '(:0': 2.4,
 '(:<': -0.2,
 '(:o': 2.5,
 '(:O': 2.5,
 '(;': 1.1,
 '(;<': 0.3,
 '(=': 2.2,
 '(?:': 2.1,
 '(^:': 1.5,
 '(^;': 1.5,
 '(^;0': 2.0,
 '(^;o': 1.9,
 '(o:': 1.6,
 ")':": -2.0,
 ")-':": -2.1,
 ')-:': -2.1,
 ')-:<': -2.2,
 ')-:{': -2.1,
 '):': -1.8,
 '):<': -1.9,
 '):{': -2.3,
 ');<': -2.6,
 '*)': 0.6,
 '*-)': 0.3,
 '*-:': 2.1,
 '*-;': 2.4,
 '*:': 1.9,
 '*<|:-)': 1.6,
 '*\\0/*': 2.3,
 '*^:': 1.6,
 ',-:': 1.2,
 "---'-;-{@": 2.3,
 '--<--<@': 2.2,
 '.-:': -1.2,
 '..###-:': -1.7,
 '..###:': -1.9,
 '/-:': -1.3,
 '/:': -1.3,
 '/:<': -1.4,
 '/=': -0.9,
 '/^:': -1.0,
 '/o:': -1.4,
 '0-8': 0.1,
 '0-|': -1.2,
 '0:)': 1.9,
 '0:-)': 1.4,
 '0:-3': 1.5,
 '0:03': 1.9,
 '

In [45]:
[(tok,score) for tok,score in sa.lexicon.items() if " " in tok]

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

In [46]:
sa.polarity_scores(text="Python is very readable and it's great for nlp.")

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}

In [47]:
sa.polarity_scores(text='Pytho is not a bad choice for most applications.')

{'neg': 0.0, 'neu': 0.737, 'pos': 0.263, 'compound': 0.431}

In [48]:
corpus={'Absolutely perfect! Love it! :-) :-) :-)','Horrible! Completely useless. :(','It was OK. Some good and some bad things.'}
for doc in corpus:
    scores=sa.polarity_scores(text=doc)
    print('{:+}: {}'.format(scores['compound'],doc))

-0.1531: It was OK. Some good and some bad things.
-0.8768: Horrible! Completely useless. :(
+0.9428: Absolutely perfect! Love it! :-) :-) :-)


In [53]:
from nltk.tokenize import TreebankWordTokenizer
sentence="""The fast Harry got to the store, the faster Harry, the fast, would get home."""
tokenizer=TreebankWordTokenizer()
tokens=tokenizer.tokenize(sentence.lower())

In [54]:
from collections import Counter
bag_of_words=Counter(tokens)
bag_of_words

Counter({'the': 4,
         'fast': 2,
         'harry': 2,
         'got': 1,
         'to': 1,
         'store': 1,
         ',': 3,
         'faster': 1,
         'would': 1,
         'get': 1,
         'home': 1,
         '.': 1})

In [56]:
bag_of_words.most_common(4)

[('the', 4), (',', 3), ('fast', 2), ('harry', 2)]

In [57]:
times_harry_appears=bag_of_words['harry']
num_unique_words=len(bag_of_words)
tf=times_harry_appears/num_unique_words
round(tf,4)

0.1667

In [58]:
kite_text=""" kite is traditionally a tethered heavier-than-air craft with wing surfaces that react against the air to create lift and drag. A kite consists of wings, tethers, and anchors. Kites often have a bridle to guide the face of the kite at the correct angle so the wind can lift it. A kite's wing also may be so designed so a bridle is not needed; when kiting a sailplane for launch, the tether meets the wing at a single point. A kite may have fixed or moving anchors. Untraditionally in technical kiting, a kite consists of tether-set-coupled wing sets; even in technical kiting, though, a wing in the system is still often called the kite.
The lift that sustains the kite in flight is generated when air flows around the kite's surface, producing low pressure above and high pressure below the wings. The interaction with the wind also generates horizontal drag along the direction of the wind. The resultant force vector from the lift and drag force components is opposed by the tension of one or more of the lines or tethers to which the kite is attached. The anchor point of the kite line may be static or moving (e.g., the towing of a kite by a running person, boat, free-falling anchors as in paragliders and fugitive parakites or vehicle).The same principles of fluid flow apply in liquids and kites are also used under water.
A hybrid tethered craft comprising both a lighter-than-air balloon as well as a kite lifting surface is called a kytoon.Kites have a long and varied history and many different types are flown individually and at festivals worldwide. Kites may be flown for recreation, art or other practical uses. Sport kites can be flown in aerial ballet, sometimes as part of a competition. Power kites are multi-line steerable kites designed to generate large forces which can be used to power activities such as kite surfing, kite landboarding, kite fishing, kite buggying and a new trend snow kiting. Even Man-lifting kites have been made."""
tokens=tokenizer.tokenize(kite_text.lower())
token_counts=Counter(tokens)
token_counts

Counter({'kite': 16,
         'is': 7,
         'traditionally': 1,
         'a': 19,
         'tethered': 2,
         'heavier-than-air': 1,
         'craft': 2,
         'with': 2,
         'wing': 5,
         'surfaces': 1,
         'that': 2,
         'react': 1,
         'against': 1,
         'the': 25,
         'air': 2,
         'to': 5,
         'create': 1,
         'lift': 4,
         'and': 10,
         'drag.': 1,
         'consists': 2,
         'of': 10,
         'wings': 1,
         ',': 15,
         'tethers': 2,
         'anchors.': 2,
         'kites': 7,
         'often': 2,
         'have': 4,
         'bridle': 2,
         'guide': 1,
         'face': 1,
         'at': 3,
         'correct': 1,
         'angle': 1,
         'so': 3,
         'wind': 2,
         'can': 3,
         'it.': 1,
         "'s": 2,
         'also': 3,
         'may': 4,
         'be': 5,
         'designed': 2,
         'not': 1,
         'needed': 1,
         ';': 2,
         'when': 2,


In [59]:
import nltk
nltk.download('stopwords',quiet=True)
stopwords=nltk.corpus.stopwords.words('english')
tokens=[x for x in tokens if x not in stop_words]
kite_counts=Counter(tokens)
kite_counts

Counter({'kite': 16,
         'traditionally': 1,
         'tethered': 2,
         'heavier-than-air': 1,
         'craft': 2,
         'wing': 5,
         'surfaces': 1,
         'react': 1,
         'air': 2,
         'create': 1,
         'lift': 4,
         'drag.': 1,
         'consists': 2,
         'wings': 1,
         ',': 15,
         'tethers': 2,
         'anchors.': 2,
         'kites': 7,
         'often': 2,
         'bridle': 2,
         'guide': 1,
         'face': 1,
         'correct': 1,
         'angle': 1,
         'wind': 2,
         'it.': 1,
         "'s": 2,
         'also': 3,
         'may': 4,
         'designed': 2,
         'needed': 1,
         ';': 2,
         'kiting': 3,
         'sailplane': 1,
         'launch': 1,
         'tether': 1,
         'meets': 1,
         'single': 1,
         'point.': 1,
         'fixed': 1,
         'moving': 2,
         'untraditionally': 1,
         'technical': 2,
         'tether-set-coupled': 1,
         'sets': 1,

In [61]:
document_vector=[]
doc_length=len(tokens)
for key,values in kite_counts.most_common():
    document_vector.append(values/doc_length)
document_vector

[0.07239819004524888,
 0.06787330316742081,
 0.03167420814479638,
 0.02262443438914027,
 0.01809954751131222,
 0.01809954751131222,
 0.013574660633484163,
 0.013574660633484163,
 0.013574660633484163,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.00904977375565611,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.004524886877828055,
 0.0045248868778