In [1]:
import re #regular expression library
import math

In [2]:
def getwords(doc):
  """Return the unique words of a document as a ``{word: 1}`` dict.

  The text is split on runs of non-word characters (regex ``\\W+``) and
  each token is lower-cased. Only tokens whose length is strictly
  between 2 and 20 characters (i.e. 3 to 19) are kept: very short words
  carry little signal, and very long ones are likely to be splitting
  errors or too rare to be useful for classifying.

  Args:
    doc: The text to extract words from.

  Returns:
    A dict mapping every distinct kept word to 1, so iterating over the
    result yields each word once.
  """
  # Raw string: '\W' in a plain literal is an invalid escape sequence and
  # triggers a warning on recent Python versions.
  tokens = re.split(r'\W+', doc)

  # Length is checked on the raw token; lower-casing does not affect which
  # tokens are selected, only the key that is stored.
  return {tok.lower(): 1 for tok in tokens if 2 < len(tok) < 20}

In [3]:
# Quick check of getwords: 'is' and 'a' are dropped because they are
# 2 characters or shorter.
getwords('this is a test sentence')

{'this': 1, 'test': 1, 'sentence': 1}

In [4]:

class classifier:
  """Keeps per-category feature counts and derives feature probabilities.

  An item (a document in this example) is broken into features by the
  ``getfeatures`` callable supplied at construction time; ``train``
  records those features under a given classification, and ``fprob``
  reports how often a feature has appeared in a category relative to
  the number of items trained in that category.
  """

  def __init__(self, getfeatures, filename=None):
    """Create an empty classifier.

    Args:
      getfeatures: Function used to extract the features from the items
        being classified. Its result is iterated over in ``train``; in
        this example it is the getwords function.
      filename: Accepted but not used by any method of this class.
    """
    # Counts of feature/category combinations, stored as
    # {feature: {category: count}}. For example:
    #   {'python': {'bad': 0, 'good': 6}, 'the': {'bad': 3, 'good': 3}}
    # indicates that the word "the" has appeared in documents classified
    # as bad three times and in documents classified as good three
    # times, while the word "python" has only appeared in good documents.
    self.fc = {}
    # Counts of documents in each category, i.e. how many times every
    # classification has been used. Needed for the probability
    # calculations.
    self.cc = {}
    self.getfeatures = getfeatures

  def incf(self, f, cat):
    """Increase the count of a feature/category pair by one."""
    per_category = self.fc.setdefault(f, {})
    per_category[cat] = per_category.get(cat, 0) + 1

  def incc(self, cat):
    """Increase the count of a category by one."""
    self.cc[cat] = self.cc.get(cat, 0) + 1

  def fcount(self, f, cat):
    """Return how many times feature ``f`` has appeared in ``cat``.

    The result is always a float; unseen features or categories give 0.0.
    """
    return float(self.fc.get(f, {}).get(cat, 0))

  def catcount(self, cat):
    """Return the number of items in category ``cat``.

    The result is always a float (0.0 for an unknown category), matching
    ``fcount``.
    """
    return float(self.cc.get(cat, 0))

  def totalcount(self):
    """Return the total number of items across all categories."""
    return sum(self.cc.values())

  def categories(self):
    """Return all categories seen so far (the keys of ``cc``)."""
    return self.cc.keys()

  def train(self, item, cat):
    """Record ``item`` as an example of classification ``cat``.

    The item is broken into its separate features with ``getfeatures``;
    the count for this classification is increased for every feature,
    and finally the total count for the classification is increased.
    """
    for feature in self.getfeatures(item):
      self.incf(feature, cat)
    self.incc(cat)

  def fprob(self, f, cat):
    """Return how often ``f`` appears in ``cat`` per item in ``cat``.

    This is the total number of times the feature appeared in the
    category divided by the total number of items in the category.
    Returns 0.0 when the category has no items, which avoids a division
    by zero.
    """
    items_in_cat = self.catcount(cat)
    if items_in_cat == 0:
      return 0.0
    return self.fcount(f, cat) / items_in_cat


In [5]:
# Build a classifier that uses getwords as its feature extractor, then
# train it on a first document labelled 'good'.
cl = classifier(getwords)
cl.train('the quick brown fox jumped over the lazy dog', 'good')

In [6]:
 cl.train('make quick money in the online casino', 'bad')

In [7]:
# How many times 'quick' has been counted under the 'good' category.
cl.fcount('quick','good')

1.0

In [8]:
# How many times 'quick' has been counted under the 'bad' category.
cl.fcount('quick','bad')

1.0

In [9]:
# 'jumped' has not been counted under 'bad', so fcount falls back to 0.0.
cl.fcount('jumped','bad')

0.0

In [10]:
# Train on a longer document, labelled 'good'; getwords lower-cases the
# text and keeps each distinct word once.
cl.train('Nobody owns the water. The quick rabbit jumps fences buy pharmaceuticals now make quick money at the online casino the quick brown fox jumps','good')

In [48]:
# fprob is fcount('quick','good') divided by catcount('good').
cl.fprob('quick','good')

1.0