In [20]:
from KaggleWord2VecUtility import KaggleWord2VecUtility
import numpy as np
from sklearn.metrics import confusion_matrix

In [2]:
from gensim.models import Word2Vec
model = Word2Vec.load("800features_50minwords_10context")

In [3]:
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.syn0
num_clusters = word_vectors.shape[0] / 5
# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print "Time taken for K Means clustering: ", elapsed, "seconds."

Time taken for K Means clustering:  130.872732878 seconds.


In [4]:
# Create a Word / Index dictionary, mapping each vocabulary word to
# a cluster number                                                                                            
word_centroid_map = dict(zip( model.index2word, idx ))

In [5]:
# For the first 10 clusters
for cluster in xrange(0,10):
    #
    # Print the cluster number  
    print "\nCluster %d" % cluster
    #
    # Find all of the words for that cluster number, and print them out
    words = []
    for i in xrange(0,len(word_centroid_map.values())):
        if( word_centroid_map.values()[i] == cluster ):
            words.append(word_centroid_map.keys()[i])
    print words


Cluster 0
[u'dave']

Cluster 1
[u'continuously', u'blank', u'shine', u'priority']

Cluster 2
[u'suit', u'perkins', u'employer', u'pao', u'kleiner', u'attend']

Cluster 3
[u'climb', u'considerable', u'upward', u'multiples', u'fundamentals', u'prospects']

Cluster 4
[u'therefore', u'fail', u'extent', u'argue', u'besides', u'ignore', u'crack', u'lie', u'lose', u'needle', u'otherwise', u'poorly', u'chances']

Cluster 5
[u'ambient', u'warm', u'intensity']

Cluster 6
[u'output', u'exports', u'imports']

Cluster 7
[u'nation', u'incentives', u'rules', u'residents', u'interests', u'entities', u'forcing', u'principles', u'tackle', u'privately', u'standards', u'groups', u'communities', u'workers', u'agreements', u'perks', u'branch']

Cluster 8
[u'axp', u'crm', u'sdrl', u'aig', u'mgm', u'gpro', u'wfm', u'gild']

Cluster 9
[u'concepts', u'logical', u'extensive', u'usability', u'simplicity']


In [6]:
def create_bag_of_centroids( wordlist, word_centroid_map ):
    #
    # The number of clusters is equal to the highest cluster index
    # in the word / centroid map
    num_centroids = max( word_centroid_map.values() ) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    #
    # Loop over the words in the review. If the word is in the vocabulary,
    # find which cluster it belongs to, and increment that cluster count 
    # by one
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

In [7]:
import pandas as pd

# Read data from files 
article = pd.read_csv( "train_trend_1.csv")
article_test = pd.read_csv( "test_trend_1.csv")

In [10]:
print "Parsing train reviews..."

opinions = []
for opinion in article['Articles']:
    opinions.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( opinion )))

Parsing train reviews...


In [11]:
print "Parsing test reviews..."

opinions_test = []
for opinion_test in article_test['Articles']:
    opinions_test.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( opinion_test )))

Parsing test reviews...


In [16]:
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros( (article['Articles'].size, num_clusters), \
    dtype="float32" )

# Transform the training set reviews into bags of centroids
counter = 0
for review in opinions:
    train_centroids[counter] = create_bag_of_centroids( review, \
        word_centroid_map )
    counter += 1

# Repeat for test reviews 
test_centroids = np.zeros(( article_test['Articles'].size, num_clusters), \
    dtype="float32" )

counter = 0
for review in opinions_test:
    test_centroids[counter] = create_bag_of_centroids( review, \
        word_centroid_map )
    counter += 1


In [17]:
# Fit a random forest and extract predictions
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)

# Fitting the forest may take a few minutes
print "Fitting a random forest to labeled testing data..."
forest = forest.fit(train_centroids,article["trend"])
result_test = forest.predict(test_centroids)

result_train = forest.predict(train_centroids)

# Write the test results 
#output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
#output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )

Fitting a random forest to labeled testing data...


In [18]:
testing_y = pd.read_csv("y_trend_1.csv")
training_y = pd.read_csv("train_trend_1.csv")

In [21]:
cm_testing = confusion_matrix(testing_y,result_test)
print(cm_testing)
accuracy_testing = (cm_testing[0,0]+cm_testing[1,1])/float(sum(sum(cm_testing)))
print accuracy_testing 

[[945 812]
 [907 803]]
0.504182290164


In [22]:
cm_training = confusion_matrix(training_y['trend'], result_train)
print(cm_training)
accuracy_training = (cm_training[0,0]+cm_training[1,1])/float(sum(sum(cm_training)))
print accuracy_training

[[4067    6]
 [   5 4012]]
0.998640296663
