In [97]:
import numpy as np
import pandas as pd
from scipy import spatial
from sklearn import cluster, preprocessing as pre
import sys
import re

In [98]:
DATA_SET = {
    "amazon": "data/amazon_cells_labelled.txt",
    "imdb": "data/imdb_labelled.txt",
    "yelp": "data/yelp_labelled.txt"
}

In [99]:
# A. Parse data sets
amazon = pd.read_csv(DATA_SET['amazon'], sep="\t", header=None, names=['Sentence', 'Label']).dropna()
imdb = pd.read_csv(DATA_SET['imdb'], sep="\t(?=[01])", header=None, names=['Sentence', 'Label'], engine='python').dropna()
yelp = pd.read_csv(DATA_SET['yelp'], sep="\t", header=None, names=['Sentence', 'Label']).dropna()

parsed_data = pd.concat([amazon, imdb, yelp], ignore_index=True) # 3000 sentences and labels

In [100]:
def get_label_ratio (data_set):
    print "LABEL 0: ", sum(data_set['Label'] == 0)
    print "LABEL 1: ", sum(data_set['Label'] == 1)
    return

In [101]:
# Get counts of each label
get_label_ratio(parsed_data)

LABEL 0:  1500
LABEL 1:  1500


In [102]:
# B. Preprocessing
def preprocess (data):    
    stopwords = set(["the", "and", "or", "a"]) # add some more later!
    for i in range(len(data)):
        data[i] = re.sub("[^a-zA-Z]", " ", data[i])
        data[i] = data[i].lower().strip()
        temp = [word for word in data[i].split() if word not in stopwords]
        data[i] = " ".join(temp)
    return

In [103]:
# Preprocess the data
preprocess(parsed_data['Sentence'].values)

In [452]:
# C. Split training and testing data
def split_data (data):
    zeros = data.loc[data['Label'] == 0]
    ones = data.loc[data['Label'] == 1]
    train, test = pd.concat([zeros[:400], ones[:400]]), pd.concat([zeros[400:], ones[400:]])
    return (train, test)

In [453]:
train, test = {}, {}
for k, data in parsed_data.items():
    train[k], test[k] = split_data(data)

In [454]:
# D. Bag of words
def get_bag (data):
    bag = []
    for sentence in data:
        for word in sentence.split():
            bag.append(word)
    return np.unique(bag)

In [455]:
BAGS = dict(zip(parsed_data.keys(), [get_bag(data['Sentence']) for k, data in train.items()]))

In [456]:
def get_feature_vector (bag, data):
    features = []
    for sentence in data:
        f = []
        s = sentence.split()
        for word in bag:
            f.append(s.count(word))
        features.append(f)
    return features

In [457]:
features = {}
LABELS = {}
for k in BAGS:
    features[k] = get_feature_vector (BAGS[k], parsed_data[k]['Sentence'].values)
    LABELS[k] = parsed_data[k]['Label'].values

In [458]:
# E. Postprocessing
def post_process (data):
    norm = []
    for features in data:
        mapped = map(float, features)
        norm.append(pre.normalize(mapped, norm='l2').flatten())
    return norm

In [459]:
# e.g. access POST_PROCESSED['amazon'] or POST_PROCESSED['imdb']
POST_PROCESSED = dict(zip(parsed_data.keys(), [post_process(features[k]) for k in parsed_data.keys()]))

In [461]:
class KMeans:
#     def kmeans(self, X, k_clusters = 2, max_iterations=1000):
    def kmeans(self, X, k_clusters, max_iterations):
        centroids = self.get_initial_clusters(X, k_clusters)
        old_centroids = []
        
        iteration = 0
        while not np.array_equal(centroids, old_centroids) and iteration < max_iterations:
            old_centroids = centroids[:] # must copy list, not assign
            clusters = [[] for i in range(k_clusters)]
            # cluster points to nearest centroid
            for x in X:
                min_dist = sys.maxint
                kth_idx = -1
                for idx, centroid in enumerate(centroids):
                    dist = spatial.distance.euclidean(x, centroid)
                    if dist < min_dist:
                        min_dist = dist
                        kth_idx = idx
                clusters[kth_idx].append(x)
            # update centroid
#             for idx, centroid in enumerate(centroids):
#                 centroids[idx] = np.array(clusters[idx]).sum(axis=0) / len(clusters[idx])
            for idx, cluster in enumerate(clusters):
                if cluster == []:
                    raise Exception('Empty cluster, try different centroid initialization')
                centroids[idx] = np.array(cluster).sum(axis=0) / float(len(cluster))
            iteration += 1
            
        return centroids, clusters
    # method 1
    # sometimes bad initial clusters cause empty clusters when euclidean distance is taken
    def get_initial_clusters(self, X, k_clusters):
        centroids = []
        for i in range(0, k_clusters):
            centroids.append(X[np.random.randint(0, len(X), size=1)])
        return centroids
    # method 2
    def get_initial_clusters2(self, X, k_clusters):
        centroids = []
        start_idx, end_idx = 0, len(X) / k_clusters
        for i in range(0, k_clusters):
            centroids.append(np.array(X[start_idx : end_idx]).sum(axis=0) / float(len(X) / k_clusters))
            start_idx = end_idx
            end_idx += end_idx
        return centroids
    
    def get_neighbors(self, X):
        neighbors = []
        for idx1, point1 in enumerate(X): # for each matrix of pixels
            distances = []
            [distances.append((spatial.distance.euclidean(point1, point2), idx2)) for idx2, point2 in enumerate(X)]
            distances.sort(key=itemgetter(0)) # sort list of tuples based on key 0, or distance!
            nearest_neighbors = distances[1:self.k_neighbors+1] # remove 0 distance while comparing the same value
            neighbors.append(nearest_neighbors)
        return neighbors
    
    def classifier(self, nearest_neighbors, digit_labels):
        possible_classes = []
        for neighbor in nearest_neighbors:
            possible_classes.append(digit_labels[neighbor[1]])

        return max(set(possible_classes), key=possible_classes.count)

    def predict(self, test_data):
        predicted_digits = []
        for test_instance in test_data:
            neighbors = self.get_neighbors(self.k_neighbors, self.train_data, test_instance)
            predicted_digits.append(self.classifier(neighbors, self.labels))
        return predicted_digits

In [463]:
X = [[1,1,1,1,1], [2,2,2,2,2], [3,3,3,3,3], [4,4,4,4,4], [10,10,10,10,10], [1,534,3,64,1], [2343,2,245,2,5], [0,100,0,100,3], [4,4,4,4,4], [10,10,10,10,10]]
centroids, clusters = KMeans().kmeans(X, 2, 1000000)

k_means = cluster.KMeans(n_clusters=2)
k_means.fit(X) 
values = k_means.cluster_centers_
labels = k_means.labels_

In [465]:
print np.sort(np.array(centroids))
print np.sort(values)
# print clusters[0]

[[   4.66666667   15.11111111   15.11111111   31.          264.11111111]
 [   1.            1.            3.           64.          534.        ]]
[[  3.88888889e+00   4.11111111e+00   4.22222222e+00   2.20000000e+01
    7.42222222e+01]
 [  2.00000000e+00   2.00000000e+00   5.00000000e+00   2.45000000e+02
    2.34300000e+03]]


In [466]:
centroids2, clusters2 = KMeans().kmeans(POST_PROCESSED['amazon'][:10], 2, 50000)

k_means = cluster.KMeans(n_clusters=2)
k_means.fit(POST_PROCESSED['amazon'][:10]) 
values2 = k_means.cluster_centers_
labels2 = k_means.labels_

In [468]:
print np.sort(np.array(centroids[0]))
print np.sort(values2[0])

[   4.66666667   15.11111111   15.11111111   31.          264.11111111]
[ 0.          0.          0.         ...,  0.12993231  0.19468914
  0.40699761]
