In [20]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def getNgrams(content, n):
    content = content.split(' ')
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
        return output

html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams[:10])
print('2-grams count is: '+str(len(ngrams)))

[['General-purpose', 'programming']]
2-grams count is: 1


In [21]:
import re

def getNgrams(content, n):
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    content = content.split(' ')
    content = [word for word in content if word != '']
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

In [22]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams[:10])
print('2-grams count is: '+str(len(ngrams)))

[['General-purpose', 'programming'], ['programming', 'language'], ['language', 'PythonParadigmMulti-paradigm:'], ['PythonParadigmMulti-paradigm:', 'object-oriented,'], ['object-oriented,', 'procedural'], ['procedural', '(imperative),'], ['(imperative),', 'functional,'], ['functional,', 'structured,'], ['structured,', 'reflectiveDesignedbyGuido'], ['reflectiveDesignedbyGuido', 'van']]
2-grams count is: 12215


In [23]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

def cleanSentence(sentence):
    sentence = sentence.split(' ')
    sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
    sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
    return sentence

def cleanInput(content):
    content = content.upper()
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, "UTF-8")
    content = content.decode("ascii", "ignore")
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = []
    for sentence in content:
        ngrams.extend(getNgramsFromSentence(sentence, n))
    return(ngrams)
        


In [24]:
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
print(len(getNgrams(content, 2)))

9216


In [25]:
from collections import Counter

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)

In [26]:
#print(getNgrams(content, 2))

In [27]:
min_threshold = 10
newdict = {x: count for x, count in sorted(getNgrams(content, 2).items(), key=lambda item: -item[1]) if count >= min_threshold}

In [28]:
print(newdict)

{'FROM THE': 218, 'THE ORIGINAL': 209, 'ORIGINAL ON': 207, 'ARCHIVED FROM': 204, 'ON JUNE': 60, 'SOFTWARE FOUNDATION': 38, 'PYTHON SOFTWARE': 38, 'OF THE': 36, 'OF PYTHON': 33, 'IN PYTHON': 33, 'RETRIEVED FEBRUARY': 30, 'RETRIEVED MARCH': 28, 'IN THE': 25, 'RETRIEVED JANUARY': 23, 'SUCH AS': 22, 'AS A': 21, 'THE PYTHON': 21, 'ON MAY': 21, 'VAN ROSSUM': 20, 'IS A': 19, 'PROGRAMMING LANGUAGE': 18, 'ON OCTOBER': 18, 'RETRIEVED NOVEMBER': 18, 'ON DECEMBER': 17, 'RETRIEVED JUNE': 17, 'ON APRIL': 17, 'RETRIEVED JULY': 16, 'RETRIEVED APRIL': 16, 'TO THE': 15, 'ON MARCH': 15, 'CAN BE': 14, 'BE USED': 14, 'PYTHON ENHANCEMENT': 14, 'RETRIEVED MAY': 14, 'RETRIEVED DECEMBER': 14, 'ON FEBRUARY': 14, 'PYTHON IS': 13, 'TO BE': 13, 'FOR PYTHON': 13, 'ENHANCEMENT PROPOSALS': 13, 'RETRIEVED SEPTEMBER': 13, 'IT IS': 12, 'STANDARD LIBRARY': 12, 'ON AUGUST': 12, 'ROSSUM GUIDO': 12, 'ON JANUARY': 12, 'TO PYTHON': 11, 'OF A': 11, 'STATEMENT WHICH': 11, 'USED TO': 11, 'WITH THE': 10, 'OTHER LANGUAGES': 10, 'R