## Python Coding Style

In [2]:
# Style reminders, kept as a bare string so the cell echoes them as output.
'''
use four spaces per indentation level.
avoid tabs for indentation
Lines should be less than 80 characters long
'''

'\nuse four spaces per indentation level.\navoid tabs for indentation\nLines should be less than 80 characters long\n'

## Procedural vs Declarative Style

In [None]:
import nltk
# Words of the Brown corpus limited to the 'news' category; later cells
# iterate over `tokens` and take len() of it.
# NOTE(review): presumably needs the corpus data installed locally
# (e.g. via nltk.download) — confirm before a fresh Run All.
tokens = nltk.corpus.brown.words(categories='news')

In [17]:
# Procedural style: spell out every step the way the machine would,
# keeping running values in variables much like CPU registers.
import time

# perf_counter() is a monotonic, high-resolution clock intended for timing
# short stretches of code; time.time() can jump if the system clock changes.
start_time = time.perf_counter()
count = 0  # number of tokens seen so far
total = 0  # summed character length of those tokens
for token in tokens:
    count += 1
    total += len(token)
avg = total / count
print("%.2f --- %.4f seconds ---" % (avg, time.perf_counter() - start_time))

4.40 --- 0.5021 seconds ---


In [18]:
# Declarative style: state what is wanted and leave the implementation
# details to the Python interpreter.
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short stretches of code; time.time() can jump if the system clock changes.
start_time = time.perf_counter()
total = sum(len(t) for t in tokens)
avg = total / len(tokens)
print("%.2f --- %.4f seconds ---" % (avg, time.perf_counter() - start_time))

4.40 --- 0.4704 seconds ---


In [None]:
# Procedural construction of a sorted list of distinct tokens, with timing.
# Every token is placed by a linear scan of word_list from the front.
start_time = time.time()
word_list = []
i = 0
while i < len(tokens):
    j = 0
    # Advance j past every stored word that sorts at or before tokens[i].
    while j < len(word_list) and word_list[j] <= tokens[i]:
        j += 1
    # Insert only when the word just before slot j is not this same token,
    # which keeps word_list free of duplicates.
    if j == 0 or tokens[i] != word_list[j-1]:
        word_list.insert(j, tokens[i])
    i += 1
print("--- %.4f seconds ---" % (time.time() - start_time))

In [20]:
# Declarative version of the sorted distinct-word list: set() drops duplicate
# tokens and sorted() returns the remainder as an ordered list.
start_time = time.time()
word_list = sorted(set(tokens))
print("--- %.4f seconds ---" % (time.time() - start_time))

--- 0.4685 seconds ---


In [25]:
# Rank the words of the whole Brown corpus by frequency and print the
# cumulative relative frequency, stopping once it exceeds 25%.
fd = nltk.FreqDist(nltk.corpus.brown.words())
cumulative = 0.0
# most_common() yields (word, count) pairs; only the words are kept.
most_common_words = [word for (word, _count) in fd.most_common()]
# enumerate(..., start=1) yields (rank, word) pairs with ranks counted from 1,
# so no manual "+ 1" is needed when printing.
for rank, word in enumerate(most_common_words, start=1):
    cumulative += fd.freq(word)
    print("%3d %6.2f%% %s" % (rank, cumulative * 100, word))
    if cumulative > 0.25:
        break

  1   5.40% the
  2  10.42% ,
  3  14.67% .
  4  17.78% of
  5  20.19% and
  6  22.40% to
  7  24.29% a
  8  25.97% in


In [None]:
# Words of the Gutenberg corpus file 'milton-paradise.txt'.
text = nltk.corpus.gutenberg.words('milton-paradise.txt')

In [31]:
# It is tempting to keep the running maximum in a variable and update it by
# hand while scanning. The strict ">" keeps the first word of maximal length.
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short stretches of code; time.time() can jump if the system clock changes.
start_time = time.perf_counter()
longest = ''
for word in text:
    if len(word) > len(longest):
        longest = word
print("%s --- %.4f seconds ---" % (longest, time.perf_counter() - start_time))

unextinguishable --- 0.2505 seconds ---


In [30]:
# Two passes: find the maximum word length, then collect every word that
# reaches it.
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short stretches of code; time.time() can jump if the system clock changes.
start_time = time.perf_counter()
maxlen = max(len(word) for word in text)
# Bind the result: a bare expression in the middle of a cell is evaluated
# and then discarded, so the words were never shown.
longest_words = [word for word in text if len(word) == maxlen]
print("--- %.4f seconds ---" % (time.perf_counter() - start_time))
# Last expression of the cell, so the notebook displays it.
longest_words

--- 0.4259 seconds ---


## Some Legitimate Uses for Counters

In [34]:
# Sample sentence as a list of word tokens, used by the slicing cells.
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']

In [33]:
# Number of tokens in the sample sentence.
len(sent)

6

In [35]:
n = 3  # window width
# One slice per valid start position; the last window begins at len(sent) - n.
[sent[start:start + n] for start in range(len(sent) - n + 1)]

[['The', 'dog', 'gave'],
 ['dog', 'gave', 'John'],
 ['gave', 'John', 'the'],
 ['John', 'the', 'newspaper']]

In [36]:
# 6 is len(sent) written out; this is the range of window start indices.
range(6-n+1)

range(0, 4)

In [38]:
# Materialise the window start indices so they print as a list.
print(list(range(6 - n + 1)))

[0, 1, 2, 3]


In [39]:
# Window of three tokens starting at index 0.
sent[0:3]

['The', 'dog', 'gave']

In [41]:
# Window of three tokens starting at index 1.
sent[1:4]

['dog', 'gave', 'John']

In [42]:
# Window of three tokens starting at index 2.
sent[2:5]

['gave', 'John', 'the']

In [43]:
# Window of three tokens starting at index 3; it ends at the last token.
sent[3:6]

['John', 'the', 'newspaper']

In [51]:
from nltk.util import trigrams
# Wrap the sample sentence in an nltk.Text. This rebinds `text`, which an
# earlier cell bound to the Gutenberg word list.
text = nltk.Text(sent)
# NOTE(review): trigrams() presumably returns a lazy iterator here — wrap it
# in list() to inspect the result; confirm for the installed NLTK version.
tg = trigrams(text)

In [50]:
from nltk.util import ngrams
n = 3  # n-gram size passed to ngrams()
# `text` is not defined in this cell; it must come from a previously run cell.
ng = ngrams(text, n)

In [56]:
import pprint

m, n = 3, 7  # grid dimensions: m rows, each holding n cells
# set() is called once per cell inside the comprehension, so every cell is
# its own object and adding to one leaves the others empty.
array = [[set() for _col in range(n)] for _row in range(m)]
array[2][5].add('Alice')
pprint.pprint(array)

[[set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), {'Alice'}, set()]]


In [58]:
# ERROR (intentional counter-example): list multiplication repeats references.
# [set()] * n holds the same set n times, and * m repeats that one row list.
array = [[set()] * n] * m
# No objects are copied, so adding through one position shows up in every
# cell of the printed grid.
array[2][5].add(7)
pprint.pprint(array)

[[{7}, {7}, {7}, {7}, {7}, {7}, {7}],
 [{7}, {7}, {7}, {7}, {7}, {7}, {7}],
 [{7}, {7}, {7}, {7}, {7}, {7}, {7}]]
