In [1]:
import urllib
import cv2
import numpy as np
import hashlib
import io
import math
import os
import re 
import random as rand
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import cPickle

def get_files(path):
    """List the .jpg entries found directly inside a directory.

    Params:
        path....directory to scan; sub-directories are not searched.
    Returns:
        a list of strings, one per entry whose name ends in ".jpg".  Each
        string is `path`, os.sep and the entry name concatenated, and the
        list is ordered alphabetically by entry name.
    """
    jpg_names = []
    for entry in os.listdir(path):
        if entry.endswith(".jpg"):
            jpg_names.append(entry)
    jpg_names.sort()
    prefix = path + os.sep
    return [prefix + entry for entry in jpg_names]


In [2]:
#Find all URLs listed in a text file.
def findurl(filename):
    """Return the URL column of a whitespace-separated listing file.

    The second whitespace-separated field of each line is taken as the URL.
    Lines with fewer than two fields (for example blank lines) are skipped
    instead of raising IndexError.

    Params:
        filename....path of a UTF-8 encoded text file.
    Returns:
        a list holding the second field of every qualifying line, in file
        order.
    """
    urls = []
    # The context manager closes the handle even if decoding fails part-way.
    with io.open(filename, encoding='utf8') as handle:
        for line in handle:
            fields = line.split()
            if len(fields) > 1:
                urls.append(fields[1])
    return urls

#Download a random subset of the photos listed for one set.
#The dataset is large, so at most maxNum photos are fetched per set.
def retrieveImg(path, urls, maxNum=200):
    """Download a random sample of `urls` into `path` as 0.jpg, 1.jpg, ...

    Params:
        path......directory that receives the downloaded files.
        urls......sequence of image URLs to draw from.
        maxNum....upper bound on the number of images fetched; when fewer
                  URLs are available all of them are used.
    """
    chosen = rand.sample(urls, min(maxNum, len(urls)))
    for position, url in enumerate(chosen):
        target = path + os.sep + "%s.jpg" % str(position)
        urllib.urlretrieve(url, target)

#Derive the label (ANP name) of each listing file from its file name.
def findLabel(filenames):
    """Return, for every path, its last os.sep-separated component with the
    file extension removed."""
    labels = []
    for name in filenames:
        stem, _extension = os.path.splitext(name)
        labels.append(stem.split(os.sep)[-1])
    return labels

#Download the images of every set whose index lies in [start, end).
def download(filenames, ANPs, start, end):
    """Fetch the images for the sets numbered start .. end-1.

    For each index the directory "data"<os.sep><set name> is created when it
    does not exist, the URLs are read from the matching listing file with
    findurl() and a sample of them is downloaded there with retrieveImg().
    A "<set name> finished." line is printed after each set.

    Params:
        filenames....listing files, one per set.
        ANPs.........set names, parallel to `filenames`.
        start........first index to process.
        end..........index at which to stop (exclusive).
    """
    for index in range(start, end):
        label = ANPs[index]
        target_dir = "data" + os.sep + label
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        retrieveImg(target_dir, findurl(filenames[index]))
        print(label + " finished.")

#path = "URL\URL1553"
#all_files = get_files(path)
#ANPs = findLabel(all_files) 
#download(all_files, ANPs, 190, 200)

In [3]:
#Extract SURF descriptors for a given set of image files.
def featureExtract(filenames, hessianT=500):
    """Compute SURF descriptors for every usable image in `filenames`.

    Files of 3000 bytes or less are skipped without being reported.  A file
    that cv2.imread cannot decode, or for which SURF returns no descriptors,
    is recorded in the third return value and DELETED from disk.

    Params:
        filenames....paths of the image files to process.
        hessianT.....threshold handed to the cv2.SURF constructor.
    Returns:
        a 3-tuple (stacked, per_image, rejected):
        stacked......all retained descriptor arrays stacked with np.vstack.
        per_image....list with one descriptor array per retained image, in
                     input order.
        rejected.....paths that produced no descriptors and were removed.
    Raises:
        ValueError (from np.vstack) when no image yields descriptors.
    """
    features = []
    noneFeatures = []
    # The detector does not depend on the image, so build it once.
    surf = cv2.SURF(hessianT)
    for name in filenames:
        if os.path.getsize(name) <= 3000:
            continue
        img = cv2.imread(name)
        des = None
        # imread reports an undecodable file by returning None; such a file
        # is handled exactly like an image without descriptors.
        if img is not None:
            if len(img.shape) > 2:
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            else:
                # Already single-channel: use it directly rather than
                # reusing (or missing) `gray` from an earlier iteration.
                gray = img
            _keypoints, des = surf.detectAndCompute(gray, None)
        if des is not None:
            features.append(des)
        else:
            noneFeatures.append(name)
            os.remove(name)
    return np.vstack(tuple(features)), features, noneFeatures

In [18]:
#Quantize the descriptors of each image against a visual vocabulary.
#The same code as in BagOfVisualWords.py
def quantization(features, codebook, t=0.7):
    '''
    Build a bag-of-visual-words histogram for every image.

    Input
    features --- list with one descriptor array per image
    codebook --- visual vocabulary, one row per visual word
    t --- ratio threshold: a descriptor is counted only when its nearest
          word is closer than t times the distance to the second nearest.
    Output
    an integer ndarray of shape (len(features), len(codebook)).
    Row r, column c holds how many descriptors of image r were assigned
    to visual word c.
    '''
    FLANN_INDEX_KDTREE = 0
    matcher = cv2.FlannBasedMatcher(
        dict(algorithm=FLANN_INDEX_KDTREE, trees=5), {})
    histogram = np.zeros((len(features), len(codebook)), dtype=int)

    for row, descriptors in enumerate(features):
        # k=2 returns the two nearest words so the ratio test can be applied.
        for nearest, runner_up in matcher.knnMatch(descriptors, codebook, k=2):
            if nearest.distance < t * runner_up.distance:
                histogram[row, nearest.trainIdx] += 1

    return histogram

In [4]:
#Run k-means clustering on SURF descriptors.
def kmeans(data, k, maxIteration=10, accuracy=1.0, attempts=10):
    '''
    Cluster `data` into `k` groups with cv2.kmeans (used to learn the
    visual vocabulary).

    The termination criteria combine an iteration cap (maxIteration) with
    an epsilon (accuracy); `attempts` is forwarded unchanged and the
    KMEANS_RANDOM_CENTERS flag is always used.
    Returns the (compactness, labels, centers) triple from cv2.kmeans.
    '''
    stop_mode = cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER
    termination = (stop_mode, maxIteration, accuracy)
    outcome = cv2.kmeans(data, k, termination, attempts,
                         cv2.KMEANS_RANDOM_CENTERS)
    compactness, labels, centers = outcome
    return compactness, labels, centers

def findK(data):
    '''
    Rule-of-thumb initial guess for the number of clusters in `data`,
    which is also used as the codebook size:
    k = int(sqrt(n / 2) + 1), where n is the number of rows of `data`.
    https://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set
    '''
    half_rows = data.shape[0] / 2.0
    return int(half_rows ** 0.5 + 1)

In [5]:
def repeatable_random(seed):
    """Yield an endless, reproducible stream of integers in 0..255.

    The stream is the byte sequence of md5(str(seed)), followed by the md5
    of that digest, and so on, so the same seed always gives the same
    numbers.

    Params:
        seed....any value; its str() form seeds the hash chain.
    """
    state = str(seed)
    if not isinstance(state, bytes):
        # On Python 3 str is text and hashlib only accepts bytes; on
        # Python 2 str already is bytes and is used unchanged.
        state = state.encode("utf8")
    while True:
        state = hashlib.md5(state).digest()
        # Iterating a bytearray yields ints on Python 2 and 3 alike.
        for value in bytearray(state):
            yield value

def repeatable_shuffle(X, y):
    """Reorder X and y with one fixed pseudo-random permutation.

    Each row index receives a sort key drawn, in index order, from
    repeatable_random(42); the indices are then sorted by those keys (ties
    keep their original order) and both inputs are indexed with the result.

    Params:
        X....array whose first dimension is indexed (X.shape[0] rows).
        y....array indexed with the same permutation.
    Returns:
        the tuple (X[order], y[order]).
    """
    stream = repeatable_random(42)
    n_rows = X.shape[0]
    sort_keys = [next(stream) for _ in range(n_rows)]
    order = sorted(range(n_rows), key=sort_keys.__getitem__)
    return X[order], y[order]

In [6]:
def select_training_image(path,name):
    """Assemble a balanced list of image paths for training class `name`.

    Every image of the class directory comes first (positives), followed
    by the same number of images drawn at random, with replacement, from
    the other class directories (negatives).  The draw uses the module-wide
    `rand` generator, so it is only reproducible if the caller seeds it.

    Params:
        path....directory holding one sub-directory per class.
        name....sub-directory whose images are the positives.
    Returns:
        a list of image paths, positives first.
    Raises:
        ValueError when negatives are needed but no other class directory
        contains an image.
    """
    positives = get_files(path + os.sep + name)
    result = list(positives)
    # List every other class once instead of once per draw, and leave out
    # empty directories, which previously made the draw fail.
    candidates = []
    for d in os.listdir(path):
        if d != name and os.path.isdir(os.path.join(path, d)):
            images = get_files(path + os.sep + d)
            if images:
                candidates.append(images)
    if positives and not candidates:
        raise ValueError("no negative images available under %r" % (path,))
    for _ in range(len(positives)):
        # Uniform over directories first, then uniform inside the directory.
        result.append(rand.choice(rand.choice(candidates)))
    return result

In [7]:
def get_true_labels(path, name):
    """Build the 0/1 target vector for the training set of class `name`.

    The vector holds one 1 for every .jpg currently in the class directory
    followed by the same number of 0s.
    Params:
        path....directory holding one sub-directory per class.
        name....sub-directory of the class being trained.
    Returns:
        a numpy array of length 2*n (n ones, then n zeros), where n is the
        number of .jpg files found in path/name.
    """
    class_dir = path + os.sep + name
    n_images = len(get_files(class_dir))
    ones = [1] * n_images
    zeros = [0] * n_images
    return np.array(ones + zeros)

In [8]:
def do_cross_validation(X, y, n_folds=5, verbose=False):
    """
    Estimate accuracy with n-fold cross validation.

    A fresh LogisticRegression is fitted on the training part of each fold
    produced by KFold(len(y), n_folds) and scored on the held-out part.
    Params:
        X.........feature matrix, indexable by an array of row indices
        y.........true labels, parallel to the rows of X
        n_folds...number of folds
        verbose...if true, print the testing accuracy of each fold
    Return:
        the mean testing accuracy over all folds.
    """
    folds = KFold(len(y), n_folds)
    accuracies = []
    for fold_no, (train_idx, test_idx) in enumerate(folds):
        clf = LogisticRegression()
        clf.fit(X[train_idx], y[train_idx])
        acc = accuracy_score(y[test_idx], clf.predict(X[test_idx]))
        if verbose:
            print("fold %d accuracy = %.4f" % (fold_no, acc))
        accuracies.append(acc)
    return np.mean(accuracies)

In [9]:
def build_classifiers(path,filenames,codebook):
    """Train one binary logistic-regression classifier per class.

    For every class a balanced image list is drawn with
    select_training_image(), SURF descriptors are extracted, each image is
    turned into a visual-word histogram and a LogisticRegression is fitted
    on the shuffled histograms.

    Params:
        path........directory holding one sub-directory per class.
        filenames...class (sub-directory) names to train.
        codebook....visual vocabulary handed to quantization().
    Returns:
        a list of fitted classifiers, parallel to `filenames`.
    Raises:
        RuntimeError when the retained images cannot be matched one-to-one
        with the rows of the feature matrix.
    """
    result = []
    for name in filenames:
        print(name)
        training_sets = select_training_image(path,name)
        features = featureExtract(training_sets)
        X = quantization(features[1],codebook,t=1)
        # featureExtract skips files of 3000 bytes or less and drops (and
        # deletes) the files it returns in features[2], so X can have fewer
        # rows than training_sets.  Labels are therefore derived from the
        # images that really produced a row, not from a directory count.
        rejected = set(features[2])
        kept = [f for f in training_sets
                if f not in rejected and os.path.getsize(f) > 3000]
        if len(kept) != X.shape[0]:
            raise RuntimeError("%d retained images but %d feature rows for %s"
                               % (len(kept), X.shape[0], name))
        positive_dir = path + os.sep + name
        y = np.array([1 if os.path.dirname(f) == positive_dir else 0
                      for f in kept])
        X, y = repeatable_shuffle(X,y)
        clf = LogisticRegression()
        clf.fit(X,y)
        result.append(clf)
    return result

In [10]:
def build_codebook(size,filenames,data_dir=None):
    """Learn a visual vocabulary from randomly sampled images.

    `size` images are drawn with replacement (a random class, then a random
    image inside it), their SURF descriptors are extracted and clustered
    with kmeans() using the cluster count suggested by findK().  Note that
    featureExtract() deletes sampled images that yield no descriptors.

    Params:
        size........number of images to sample.
        filenames...class (sub-directory) names to sample from.
        data_dir....directory holding the class sub-directories; when
                    omitted the notebook-level `path` is used, as before.
    Returns:
        the cluster centres returned by kmeans() (the codebook).
    Raises:
        ValueError when none of the class directories contains an image.
    """
    if data_dir is None:
        # Backward compatible default: the notebook-level variable.
        data_dir = path
    # List each class once and ignore empty ones, which used to make the
    # random draw fail.
    listings = [get_files(data_dir + os.sep + name) for name in filenames]
    listings = [images for images in listings if images]
    if not listings:
        raise ValueError("no images found for the given classes")
    sampled = [rand.choice(rand.choice(listings)) for _ in range(size)]
    features = featureExtract(sampled)
    codebook = kmeans(features[0], findK(features[0]))
    return codebook[2]

In [11]:
def predict_class(classifers,path,filenames,codebook,n_eachfile=10):
    """Classify up to n_eachfile images of every class with all classifiers.

    Each image is assigned the class whose classifier gives the highest
    probability for label 1.  classifers[i] must be the classifier trained
    for filenames[i].

    Params:
        classifers...list of fitted classifiers (the historical spelling of
                     the parameter name is kept for keyword callers).
        path.........directory holding one sub-directory per class.
        filenames....class names, parallel to `classifers`.
        codebook.....visual vocabulary handed to quantization().
        n_eachfile...maximum number of images taken from each class.
    Returns:
        a list of dicts with the keys "files path", "true label" and
        "predict", one per image that produced features, in input order.
        Images that featureExtract() skips or rejects have no entry, so a
        class may contribute fewer than n_eachfile entries.
    Raises:
        ValueError when the number of classifiers and classes differ.
        RuntimeError when retained images and feature rows cannot be paired.
    """
    # The body used to read a notebook-level `classifiers` variable and so
    # ignored this argument; bind the intended name to the argument.
    classifiers = classifers
    if len(classifiers) != len(filenames):
        raise ValueError("got %d classifiers for %d classes"
                         % (len(classifiers), len(filenames)))
    images = []
    truth = []
    for label in filenames:
        chosen = get_files(path+os.sep+label)[:n_eachfile]
        images.extend(chosen)
        truth.extend([label] * len(chosen))
    features = featureExtract(images)
    X = quantization(features[1],codebook,t=1)
    # Pair every feature row with the image it came from: featureExtract
    # skips files of 3000 bytes or less and rejects those in features[2].
    rejected = set(features[2])
    kept = [(img, label) for img, label in zip(images, truth)
            if img not in rejected and os.path.getsize(img) > 3000]
    if len(kept) != X.shape[0]:
        raise RuntimeError("%d retained images but %d feature rows"
                           % (len(kept), X.shape[0]))
    # Column 1 of predict_proba is read as the probability of label 1.
    scores = np.array([clf.predict_proba(X)[:, 1] for clf in classifiers])
    winners = np.argmax(scores, axis=0)
    prediction = []
    for (img, label), winner in zip(kept, winners):
        prediction.append({"files path":img,
                           "true label":label,
                           "predict":filenames[winner]})
    return prediction

In [12]:
def compute_accuracy(prediction,n_eachfile):
    """Print per-group accuracy and return the overall accuracy.

    `prediction` is read in consecutive groups of n_eachfile records; for
    each complete group one line is printed, labelled with the true label
    of the group's first record.  A trailing incomplete group is not
    printed but still counts towards the returned value.

    Params:
        prediction...list of dicts with the keys 'true label' and 'predict'.
        n_eachfile...number of consecutive records per group.
    Returns:
        the fraction of records whose 'predict' equals their 'true label'.
    Raises:
        ZeroDivisionError when `prediction` is empty or n_eachfile is 0.
    """
    # Compare each record once and reuse the result for both statistics.
    hits = [item['true label'] == item['predict'] for item in prediction]
    # Floor division keeps range() working when "/" is true division.
    for group in range(len(prediction) // n_eachfile):
        first = group * n_eachfile
        counter = sum(hits[first:first + n_eachfile])
        print("The accuracy of %s is %f" % (prediction[first]['true label'], 1.*counter/n_eachfile))
    return 1.*sum(hits)/len(prediction)

In [13]:
def removeEmpty(filenames,data_dir=None,max_bytes=3000):
    """Delete undersized .jpg files from the given class directories.

    Params:
        filenames...class (sub-directory) names to clean.
        data_dir....directory holding the class sub-directories; when
                    omitted the notebook-level `path` is used, as before.
        max_bytes...files of this size or smaller are deleted.  The default
                    matches the size below which featureExtract() skips a
                    file.
    """
    if data_dir is None:
        # Backward compatible default: the notebook-level variable.
        data_dir = path
    for filename in filenames:
        for item in get_files(data_dir + os.sep + filename):
            if os.path.getsize(item) <= max_bytes:
                os.remove(item)

In [14]:
# Root directory holding one sub-directory of images per class.
path = "data"
# os.listdir returns entries in arbitrary order; sorting makes index slices
# such as filenames[180:200] and the saved label list reproducible.
filenames = sorted(d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d)))
# Persist the class order so stored classifiers can be mapped back to names.
with open('labels.pkl', 'wb') as fid:
    cPickle.dump(filenames, fid)

In [None]:
#codebook = build_codebook(1250,filenames)

In [21]:
# Load the previously learned visual vocabulary.
# NOTE(review): no cell visible in this notebook writes
# 'my_dumped_codebook.pkl' -- confirm where it is produced.  Unpickling can
# execute arbitrary code, so only load files from a trusted source.
with open('my_dumped_codebook.pkl', 'rb') as fid:
    codebook_1 = cPickle.load(fid)
# Train one classifier for each of the classes with index 180..199.
classifiers = build_classifiers(path,filenames[180:200],codebook_1)

candid_teen
charming_city
charming_house
charming_lady
charming_places
charming_smile
charming_street
cheerful_face
cheerful_flowers
cheerful_smile
christian_artist
christian_band
christian_bible
christian_book
christian_church
christian_concert
christian_cross
christian_faith
christian_festival
christian_heritage


In [23]:
# Save the classifiers trained for filenames[180:200]; the file name uses
# the corresponding 1-based class numbers (181..200).
with open('classifier181_200.pkl', 'wb') as fid:
    cPickle.dump(classifiers, fid)
# Disabled: would store a codebook built by the (also disabled) build_codebook cell.
#with open("codebook.pkl", 'wb') as fid1:
    #cPickle.dump(codebook,fid1)

In [35]:
# Load a stored batch of classifiers.  Unpickling can execute arbitrary
# code, so only load files from a trusted source.
with open('classifier101_120.pkl', 'rb') as fid:
    classifiers_1 = cPickle.load(fid)
# Show how many classifiers the file contained.
len(classifiers_1)

20

In [None]:
# classifiers_1 holds 20 classifiers loaded from 'classifier101_120.pkl',
# i.e. the classes with 1-based numbers 101..120.  predict_class pairs
# classifier i with class name i, so the matching slice is
# filenames[100:120] (the earlier filenames[70:80] supplied only 10 names).
prediction =  predict_class(classifiers_1,path,filenames[100:120],codebook_1, 10)

In [None]:
# Display the list of prediction records built by predict_class.
prediction

In [None]:
# The group size must equal the n_eachfile used when `prediction` was
# built (10); with 20, each printed line would mix two classes.
print("the average accuracy is %f" % (compute_accuracy(prediction, 10)))

In [None]:
# Single-class experiment: collect the images of one class plus an equal
# number of randomly drawn images from the other classes, then extract
# their SURF descriptors.  featureExtract deletes images that yield no
# descriptors.
filename = "adorable_puppy"
training_sets = select_training_image("data",filename)
features = featureExtract(training_sets)

In [None]:
# Histogram of visual words for every image that produced descriptors.
X = quantization(features[1], codebook_1,t=1)
# featureExtract skips files of 3000 bytes or less and rejects the files in
# features[2], so X may have fewer rows than training_sets.  Build y from
# the images that really produced a row instead of from a directory count.
rejected = set(features[2])
kept = [f for f in training_sets
        if f not in rejected and os.path.getsize(f) > 3000]
assert len(kept) == X.shape[0], "images and feature rows are out of step"
positive_dir = "data" + os.sep + filename
y = np.array([1 if os.path.dirname(f) == positive_dir else 0 for f in kept])

In [None]:
# Apply the fixed (seed 42) permutation so that positives and negatives are
# interleaved before the un-shuffled K-fold split.  X and y are rebound, so
# re-running this cell permutes the already permuted data again.
X, y = repeatable_shuffle(X, y)

In [None]:
# Report the mean accuracy over 5 cross-validation folds.
print('average cross validation accuracy=%.4f' %
      do_cross_validation(X, y,n_folds=5, verbose=False))