# Documents and Words 
## docclass.py, getwords to extract the features from the text
* Source : https://gist.githubusercontent.com/unlimitedfocus/7200425/raw/c34f7ed8722e40a30caf6efe270504c29ecf7c20/gistfile1.txt

In [1]:
import re
import math
def getwords(doc):
    splitter=re.compile('\\W*')
    # Split the words by non-alpha characters
    words=[s.lower( ) for s in splitter.split(doc)
        if len(s)>2 and len(s)<20]
    
    # Return the unique set of words only
    return dict([(w,1) for w in words])
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
!pip install --upgrade absl-py

Looking in indexes: http://aicentro-nexus-svc.aicentro-system:8081/hub-nexus/repository/pypi-group/simple
Collecting absl-py
  Downloading http://aicentro-nexus-svc.aicentro-system:8081/hub-nexus/repository/pypi-group/packages/absl-py/0.10.0/absl_py-0.10.0-py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 90.4 MB/s eta 0:00:01
Installing collected packages: absl-py
  Attempting uninstall: absl-py
    Found existing installation: absl-py 0.9.0
    Uninstalling absl-py-0.9.0:
      Successfully uninstalled absl-py-0.9.0
Successfully installed absl-py-0.10.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


# Training the Classifier 
## Create a class called classifier

In [2]:
class classifier:
    def __init__(self,getfeatures,filename=None):
        # Counts of feature/category combinations
        self.fc={}
        # Counts of documents in each category
        self.cc={}
        self.getfeatures=getfeatures
        
    def train(self,item,cat):
        features=self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)
            
        # Increment the count for this category
        self.incc(cat)
        
    # Increase the count of a feature/category pair
    def incf(self,f,cat):
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1
        
    # Increase the count of a category
    def incc(self,cat):
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1
        
    # The number of times a feature has appeared in a category
    def fcount(self,f,cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0
    
    # The number of items in a category
    def catcount(self,cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0
    
    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())
    
    # The list of all categories
    def categories(self):
        return self.cc.keys()

In [3]:
cl = classifier(getwords)

In [4]:
cl.train('the quick brown fox jumps over the lazy dog','good')

In [5]:
cl.train('make quick money in the online casino','bad')

In [6]:
cl.fcount('quick','good')

1.0

In [7]:
cl.fcount('quick','bad')

1.0

In [8]:
cl.fcount('casino', 'good')

0.0

In [9]:
cl.fcount('casino', 'bad')

1.0

In [10]:
def sampletrain(cl):
    cl.train('Nobody owns the water.','good')
    cl.train('the quick rabbit jumps fences','good')
    cl.train('buy pharmaceuticals now','bad')
    cl.train('make quick money at the online casino','bad')
    cl.train('the quick brown fox jumps','good')

## Calculating Probabilities

In [11]:
class classifier2:
    def __init__(self,getfeatures,filename=None):
        # Counts of feature/category combinations
        self.fc={}
        # Counts of documents in each category
        self.cc={}
        self.getfeatures=getfeatures
        
    def train(self,item,cat):
        features=self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)
            
        # Increment the count for this category
        self.incc(cat)
    # Increase the count of a feature/category pair
    def incf(self,f,cat):
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1
        
    # Increase the count of a category
    def incc(self,cat):
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1
        
    # The number of times a feature has appeared in a category
    def fcount(self,f,cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0
    
    # The number of items in a category
    def catcount(self,cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0
    
    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())
    
    # The list of all categories
    def categories(self):
        return self.cc.keys()
    
    def fprob(self,f,cat):
        if self.catcount(cat)==0: return 0
            # The total number of times this feature appeared in this
            # category divided by the total number of items in this category
        return self.fcount(f,cat)/self.catcount(cat)
    

In [12]:
cl2 = classifier2(getwords)

In [13]:
sampletrain(cl2)

In [14]:
cl2.fprob('quick','good')

0.6666666666666666

### Starting with a Reasonable Guess

In [15]:
class classifier3:
    def __init__(self,getfeatures,filename=None):
        # Counts of feature/category combinations
        self.fc={}
        # Counts of documents in each category
        self.cc={}
        self.getfeatures=getfeatures
        
    def train(self,item,cat):
        features=self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)
            
        # Increment the count for this category
        self.incc(cat)
        
    # Increase the count of a feature/category pair
    def incf(self,f,cat):
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1
        
    # Increase the count of a category
    def incc(self,cat):
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1
        
    # The number of times a feature has appeared in a category
    def fcount(self,f,cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0
    
    # The number of items in a category
    def catcount(self,cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0
    
    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())
    
    # The list of all categories
    def categories(self):
        return self.cc.keys()
    
    def fprob(self,f,cat):
        if self.catcount(cat)==0: return 0
            # The total number of times this feature appeared in this
            # category divided by the total number of items in this category
        return self.fcount(f,cat)/self.catcount(cat)
    
    def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
        # Calculate current probability
        basicprob=prf(f,cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals=sum([self.fcount(f,c) for c in self.categories( )])
        # Calculate the weighted average
        bp=((weight*ap)+(totals*basicprob))/(weight+totals)
        return bp
    

In [16]:
cl3 = classifier3(getwords)

In [17]:
sampletrain(cl3)

In [18]:
cl3.weightedprob('money','good',cl3.fprob)

0.25

In [19]:
sampletrain(cl3)

In [20]:
cl3.weightedprob('money','good',cl3.fprob)

0.16666666666666666

## A Naïve Classifier you need a way to combine the individual word probabilities to get the probability that an entire document belongs in a given category.

Naïve Bayesian classifier: http://en.wikipedia.org/wiki/Naive_Bayes_classifier, Reference: http://blog.sragent.pe.kr/33

### Probability of a Whole Document

In [21]:
class naivebayes(classifier):
    def __init__(self,getfeatures,filename=None):
        # Counts of feature/category combinations
        self.fc={}
        # Counts of documents in each category
        self.cc={}
        self.getfeatures=getfeatures
        
    def train(self,item,cat):
        features=self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)
            
        # Increment the count for this category
        self.incc(cat)
        
    # Increase the count of a feature/category pair
    def incf(self,f,cat):
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1
        
    # Increase the count of a category
    def incc(self,cat):
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1
        
    # The number of times a feature has appeared in a category
    def fcount(self,f,cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0
    
    # The number of items in a category
    def catcount(self,cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0
    
    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())
    
    # The list of all categories
    def categories(self):
        return self.cc.keys()
    
    def fprob(self,f,cat):
        if self.catcount(cat)==0: return 0
            # The total number of times this feature appeared in this
            # category divided by the total number of items in this category
        return self.fcount(f,cat)/self.catcount(cat)
    
    def docprob(self,item,cat):
        features=self.getfeatures(item)
        
        # Multiply the probabilities of all the features together
        p=1
        for f in features: p*=self.weightedprob(f,cat,self.fprob)
        return p
    
    def prob(self,item,cat):
        catprob=self.catcount(cat)/self.totalcount( )
        docprob=self.docprob(item,cat)
        return docprob*catprob
    
    
    def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
        # Calculate current probability
        basicprob=prf(f,cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals=sum([self.fcount(f,c) for c in self.categories( )])
        # Calculate the weighted average
        bp=((weight*ap)+(totals*basicprob))/(weight+totals)
        return bp
    

### A Quick Introduction to Bayes’ Theorem

In [22]:
bl = naivebayes(getwords)

In [23]:
sampletrain(bl)

In [24]:
bl.prob('quick rabbit','good')

0.15624999999999997

In [25]:
bl.prob('quick rabbit','bad')

0.05

In [26]:
bl.prob('money casino','good')

0.0375

In [27]:
bl.prob('money casino','bad')

0.1

### Choosing a Category

In [28]:
class naivebayes2(classifier):
    def __init__(self,getfeatures):
        classifier.__init__(self,getfeatures)
        self.thresholds={}
        
    def setthreshold(self,cat,t):
        self.thresholds[cat]=t
        
    def getthreshold(self,cat):
        if cat not in self.thresholds: return 1.0
        return self.thresholds[cat]
        
    def train(self,item,cat):
        features=self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)
            
        # Increment the count for this category
        self.incc(cat)
        
    # Increase the count of a feature/category pair
    def incf(self,f,cat):
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1
        
    # Increase the count of a category
    def incc(self,cat):
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1
        
    # The number of times a feature has appeared in a category
    def fcount(self,f,cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0
    
    # The number of items in a category
    def catcount(self,cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0
    
    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())
    
    # The list of all categories
    def categories(self):
        return self.cc.keys()
    
    def fprob(self,f,cat):
        if self.catcount(cat)==0: return 0
            # The total number of times this feature appeared in this
            # category divided by the total number of items in this category
        return self.fcount(f,cat)/self.catcount(cat)
    
    def docprob(self,item,cat):
        features=self.getfeatures(item)
        
        # Multiply the probabilities of all the features together
        p=1
        for f in features: p*=self.weightedprob(f,cat,self.fprob)
        return p
    
    def prob(self,item,cat):
        catprob=self.catcount(cat)/self.totalcount( )
        docprob=self.docprob(item,cat)
        return docprob*catprob
    
    
    def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
        # Calculate current probability
        basicprob=prf(f,cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals=sum([self.fcount(f,c) for c in self.categories( )])
        # Calculate the weighted average
        bp=((weight*ap)+(totals*basicprob))/(weight+totals)
        return bp
    
    def classify(self,item,default=None):
        probs={}
        # Find the category with the highest probability
        
        max=0.0
        for cat in self.categories( ):
            probs[cat]=self.prob(item,cat)            
            if probs[cat]>max:
                max=probs[cat]
            best=cat
        
        # Make sure the probability exceeds threshold*next best
        for cat in probs:
            if cat==best: continue
            if probs[cat]*self.getthreshold(best)>probs[best]: return default
        return best    

In [29]:
bl2 = naivebayes2(getwords)

In [30]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
!pip install --upgrade absl-py

Looking in indexes: http://aicentro-nexus-svc.aicentro-system:8081/hub-nexus/repository/pypi-group/simple
Requirement already up-to-date: absl-py in /usr/local/lib/python3.6/dist-packages (0.10.0)
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [31]:
sampletrain(bl2)

In [32]:
bl2.classify('quick rabbit',default='unknown')

'unknown'

In [33]:
bl2.classify('quick money',default='unknown')

'bad'

In [34]:
bl2.setthreshold('bad',3.0)

In [35]:
bl2.classify('quick money',default='unknown')

'unknown'

In [36]:
bl2.classify('quick money',default='unknown')

'unknown'

In [37]:
for i in range(10):
    sampletrain(bl2)

In [38]:
bl2.classify('quick money',default='unknown')

'bad'

### The Fisher Method 

#### alternative method that’s been shown to give very accurate results, particularly for spam filtering.

### Category Probabilities for Features

In [39]:
class fisherclassifier(classifier):
    def __init__(self,getfeatures):
        classifier.__init__(self,getfeatures)
        self.thresholds={}
        
    def setthreshold(self,cat,t):
        self.thresholds[cat]=t
        
    def getthreshold(self,cat):
        if cat not in self.thresholds: return 1.0
        return self.thresholds[cat]
        
    def train(self,item,cat):
        features=self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)
            
        # Increment the count for this category
        self.incc(cat)
        
    # Increase the count of a feature/category pair
    def incf(self,f,cat):
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1
        
    # Increase the count of a category
    def incc(self,cat):
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1
        
    # The number of times a feature has appeared in a category
    def fcount(self,f,cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0
    
    # The number of items in a category
    def catcount(self,cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0
    
    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())
    
    # The list of all categories
    def categories(self):
        return self.cc.keys()
    
    def fprob(self,f,cat):
        if self.catcount(cat)==0: return 0
            # The total number of times this feature appeared in this
            # category divided by the total number of items in this category
        return self.fcount(f,cat)/self.catcount(cat)
    
    def cprob(self,f,cat):
        # The frequency of this feature in this category
        clf=self.fprob(f,cat)
        if clf==0: return 0
        
        # The frequency of this feature in all the categories
        freqsum=sum([self.fprob(f,c) for c in self.categories( )])
        
        # The probability is the frequency in this category divided by
        # the overall frequency
        p=clf/(freqsum)
        return p
    
    
    def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
        # Calculate current probability
        basicprob=prf(f,cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals=sum([self.fcount(f,c) for c in self.categories( )])
        # Calculate the weighted average
        bp=((weight*ap)+(totals*basicprob))/(weight+totals)
        return bp

In [40]:
fl = fisherclassifier(getwords)

In [41]:
sampletrain(fl)

In [42]:
fl.cprob('quick','good')

0.5714285714285715

In [43]:
fl.cprob('money','bad')

1.0

In [44]:
fl.weightedprob('money','bad',fl.cprob)

0.75

### Combining the Probabilities

In [45]:
def fisherprob(self,item,cat):
    # Multiply all the probabilities together
    p=1
    features=self.getfeatures(item)
    for f in features:
        p*=(self.weightedprob(f,cat,self.cprob))
        
    # Take the natural log and multiply by -2
    fscore=-2*math.log(p)
    
    # Use the inverse chi2 function to get a probability
    return self.invchi2(fscore,len(features)*2)

In [46]:
def invchi2(self,chi,df):
    m = chi / 2.0
    sum = term = math.exp(-m)
    for i in range(1, df//2):
        term *= m / i
        sum += term
    return min(sum, 1.0)

In [47]:
class fisherclassifier2(classifier):
    def __init__(self,getfeatures):
        classifier.__init__(self,getfeatures)
        self.thresholds={}
        
    def setthreshold(self,cat,t):
        self.thresholds[cat]=t
        
    def getthreshold(self,cat):
        if cat not in self.thresholds: return 1.0
        return self.thresholds[cat]
        
    def train(self,item,cat):
        features=self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)
            
        # Increment the count for this category
        self.incc(cat)
        
    # Increase the count of a feature/category pair
    def incf(self,f,cat):
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1
        
    # Increase the count of a category
    def incc(self,cat):
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1
        
    # The number of times a feature has appeared in a category
    def fcount(self,f,cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0
    
    # The number of items in a category
    def catcount(self,cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0
    
    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())
    
    # The list of all categories
    def categories(self):
        return self.cc.keys()
    
    def fprob(self,f,cat):
        if self.catcount(cat)==0: return 0
            # The total number of times this feature appeared in this
            # category divided by the total number of items in this category
        return self.fcount(f,cat)/self.catcount(cat)
    
    def cprob(self,f,cat):
        # The frequency of this feature in this category
        clf=self.fprob(f,cat)
        if clf==0: return 0
        
        # The frequency of this feature in all the categories
        freqsum=sum([self.fprob(f,c) for c in self.categories( )])
        
        # The probability is the frequency in this category divided by
        # the overall frequency
        p=clf/(freqsum)
        return p
    
    
    def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
        # Calculate current probability
        basicprob=prf(f,cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals=sum([self.fcount(f,c) for c in self.categories( )])
        # Calculate the weighted average
        bp=((weight*ap)+(totals*basicprob))/(weight+totals)
        return bp
    
    def fisherprob(self,item,cat):
        # Multiply all the probabilities together
        p=1
        features=self.getfeatures(item)
        for f in features:
            p*=(self.weightedprob(f,cat,self.cprob))
            
        # Take the natural log and multiply by -2
        fscore=-2*math.log(p)
        
        # Use the inverse chi2 function to get a probability
        return self.invchi2(fscore,len(features)*2)

    def invchi2(self,chi,df):
        m = chi / 2.0
        sum = term = math.exp(-m)
        for i in range(1, df//2):
            term *= m / i
            sum += term
        return min(sum, 1.0)

In [48]:
fl2 = fisherclassifier2(getwords)

In [49]:
sampletrain(fl2)

In [50]:
fl.cprob('quick','good')

0.5714285714285715

In [51]:
fl2.fisherprob('quick rabbit','good')

0.78013986588958

In [52]:
fl2.fisherprob('quick rabbit','bad')

0.35633596283335256

### Classifying Items

In [53]:
class fisherclassifier3(classifier):
    def __init__(self,getfeatures):
        classifier.__init__(self,getfeatures)
        self.minimums={}
    
    def setminimum(self,cat,min):
        self.minimums[cat]=min
    
    def getminimum(self,cat):
        if cat not in self.minimums: return 0
        return self.minimums[cat]

    def setthreshold(self,cat,t):
        self.thresholds[cat]=t
        
    def getthreshold(self,cat):
        if cat not in self.thresholds: return 1.0
        return self.thresholds[cat]
        
    def train(self,item,cat):
        features=self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)
            
        # Increment the count for this category
        self.incc(cat)
        
    # Increase the count of a feature/category pair
    def incf(self,f,cat):
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1
        
    # Increase the count of a category
    def incc(self,cat):
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1
        
    # The number of times a feature has appeared in a category
    def fcount(self,f,cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0
    
    # The number of items in a category
    def catcount(self,cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0
    
    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())
    
    # The list of all categories
    def categories(self):
        return self.cc.keys()
    
    def fprob(self,f,cat):
        if self.catcount(cat)==0: return 0
            # The total number of times this feature appeared in this
            # category divided by the total number of items in this category
        return self.fcount(f,cat)/self.catcount(cat)
    
    def cprob(self,f,cat):
        # The frequency of this feature in this category
        clf=self.fprob(f,cat)
        if clf==0: return 0
        
        # The frequency of this feature in all the categories
        freqsum=sum([self.fprob(f,c) for c in self.categories( )])
        
        # The probability is the frequency in this category divided by
        # the overall frequency
        p=clf/(freqsum)
        return p
        
    def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
        # Calculate current probability
        basicprob=prf(f,cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals=sum([self.fcount(f,c) for c in self.categories( )])
        # Calculate the weighted average
        bp=((weight*ap)+(totals*basicprob))/(weight+totals)
        return bp
    
    def fisherprob(self,item,cat):
        # Multiply all the probabilities together
        p=1
        features=self.getfeatures(item)
        for f in features:
            p*=(self.weightedprob(f,cat,self.cprob))
            
        # Take the natural log and multiply by -2
        fscore=-2*math.log(p)
        
        # Use the inverse chi2 function to get a probability
        return self.invchi2(fscore,len(features)*2)

    def invchi2(self,chi,df):
        m = chi / 2.0
        sum = term = math.exp(-m)
        for i in range(1, df//2):
            term *= m / i
            sum += term
        return min(sum, 1.0)

    def classify(self,item,default=None):
        # Loop through looking for the best result
        best=default
        max=0.0
        for c in self.categories( ):
            p=self.fisherprob(item,c)
            
            # Make sure it exceeds its minimum
            if p>self.getminimum(c) and p>max:
                best=c
                max=p
        return best

In [54]:
fl3 = fisherclassifier3(getwords)

In [55]:
sampletrain(fl3)

In [56]:
fl3.classify('quick rabbit')

'good'

In [57]:
fl3.classify('quick money')

'bad'

In [58]:
fl3.setminimum('bad',0.8)

In [59]:
fl3.classify('quick money')

'good'

In [60]:
fl3.setminimum('good',0.4)

In [61]:
fl3.classify('quick money')

'good'

### Persisting the Trained Classifiers, Using SQLite

In [62]:
from sqlite3 import dbapi2 as sqlite

In [63]:
def setdb(self,dbfile):
    self.con=sqlite.connect(dbfile)
    self.con.execute('create table if not exists fc(feature,category,count)')
    self.con.execute('create table if not exists cc(category,count)')

In [64]:
def incf(self,f,cat):
    count=self.fcount(f,cat)
    if count==0:
        self.con.execute("insert into fc values ('%s','%s',1)"% (f,cat))
    else:
        self.con.execute("update fc set count=%d where feature='%s' and category='%s'"% (count+1,f,cat))

def fcount(self,f,cat):
    res=self.con.execute('select count from fc where feature="%s" and category="%s"'%(f,cat)).fetchone( )
    if res==None: return 0
    else: return float(res[0])
    
def incc(self,cat):
    count=self.catcount(cat)
    if count==0:
        self.con.execute("insert into cc values ('%s',1)" % (cat))
    else:
        self.con.execute("update cc set count=%d where category='%s'" % (count+1,cat))
    
def catcount(self,cat):
    res=self.con.execute('select count from cc where category="%s"' %(cat)).fetchone( )
    if res==None: return 0
    else: return float(res[0])
    
def categories(self):
    cur=self.con.execute('select category from cc');
    return [d[0] for d in cur]

def totalcount(self):
    res=self.con.execute('select sum(count) from cc').fetchone( );
    if res==None: return 0
    return res[0]