In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import numpy as np
import tensorflow as tf

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from numpy import array
from numpy import argmax

In [5]:
categories = [
        'alt.atheism',
        'talk.religion.misc',
        'rec.autos',
        'talk.politics.guns',
        'rec.sport.baseball'
    ]

remove = ()

print("Loading 20 newsgroups dataset for categories:")
print("\t\t" + str(categories))


data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

Loading 20 newsgroups dataset for categories:
		['alt.atheism', 'talk.religion.misc', 'rec.autos', 'talk.politics.guns', 'rec.sport.baseball']


In [6]:
print("train and test labes are being encoded to onehot vector")

target_names = data_train.target_names
y_train, y_test = data_train.target, data_test.target

print("integer classes")

print("\ttrain")
print(y_train)

print("\ttest")
print(y_test)


label_encoder_train = LabelEncoder()
onehot_encoder_train = OneHotEncoder(sparse=False)

integer_encoded_train = label_encoder_train.fit_transform(y_train)
integer_encoded_train = integer_encoded_train.reshape(len(integer_encoded_train), 1)

y_train_onehot = onehot_encoder_train.fit_transform(integer_encoded_train)


label_encoder_test = LabelEncoder()
onehot_encoder_test = OneHotEncoder(sparse=False)


integer_encoded_test = label_encoder_test.fit_transform(y_test)
integer_encoded_test = integer_encoded_test.reshape(len(integer_encoded_test), 1)

y_test_onehot = onehot_encoder_test.fit_transform(integer_encoded_test)


print("encoded clasesses")
print("\ttrain")
print(y_train_onehot)
print("\ttest")
print(y_test_onehot)

train and test labes are being encoded to onehot vector
integer classes
	train
[1 1 2 ..., 1 2 3]
	test
[4 4 2 ..., 1 3 3]
encoded clasesses
	train
[[ 0.  1.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 ..., 
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  1.  0.]]
	test
[[ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.]
 ..., 
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  0.  1.  0.]]


In [7]:
print("Extracting features from the training data using a sparse vectorizer")
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)

print("n_samples: %d, n_features: %d" % X_train.shape)

Extracting features from the training data using a sparse vectorizer
n_samples: 2594, n_features: 37364


In [8]:
print("Extracting features from the test data using the same vectorizer")
X_test = vectorizer.transform(data_test.data)
print("n_samples: %d, n_features: %d" % X_test.shape)

Extracting features from the test data using the same vectorizer
n_samples: 1727, n_features: 37364


In [9]:
X_train = X_train.toarray()
X_test = X_test.toarray()

feature_flat_size = X_train[0].size
num_classes = len(target_names)

#print(str(X_train))
print("feature size train: " + str(X_train[0].size))
print("feature size test: " + str(X_test[0].size))

print("number of classes: " + str(num_classes))

feature size train: 37364
feature size test: 37364
number of classes: 5


In [10]:
x = tf.placeholder(tf.float32, [None, feature_flat_size])

y_true = tf.placeholder(tf.float32, [None, num_classes])

y_true_cls = tf.placeholder(tf.int64, [None])

weights = tf.Variable(tf.zeros([feature_flat_size, num_classes]))
biases = tf.Variable(tf.zeros([num_classes]))

In [11]:
#model
logits = tf.matmul(x, weights) + biases

In [12]:
y_pred = tf.nn.softmax(logits)
y_pred_cls = tf.argmax(y_pred, dimension=1)

Instructions for updating:
Use the `axis` argument instead


In [14]:
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_true)
cost = tf.reduce_mean(cross_entropy)

In [15]:

#Define Optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(cost)


In [16]:
correct_prediction = tf.equal(y_pred_cls, y_true_cls)

In [17]:
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [18]:
session = tf.Session()
session.run(tf.global_variables_initializer())


In [19]:
def optimize(num_iterations):
    feed_dict_train = {x: X_train, y_true: y_train_onehot}
    
    for i in range(num_iterations):
        # Put the batch into a dict with the proper names
        # for placeholder variables in the TensorFlow graph.
        # Note that the placeholder for y_true_cls is not set
        # because it is not used during training.
        
        # Run the optimizer using this batch of training data.
        # TensorFlow assigns the variables in feed_dict_train
        # to the placeholder variables and then runs the optimizer.
        session.run(optimizer, feed_dict=feed_dict_train)

In [20]:
feed_dict_test = {x: X_test,
                  y_true: y_test_onehot,
                  y_true_cls: y_test}

In [22]:
def print_accuracy():
    # Use TensorFlow to compute the accuracy.
    acc = session.run(accuracy, feed_dict=feed_dict_test)
    
    # Print the accuracy.
    print("Accuracy on test-set: {0:.1%}".format(acc))

In [23]:
optimize(num_iterations=1000)

In [24]:
print_accuracy()

Accuracy on test-set: 87.2%


In [25]:
cls_pred = session.run(y_pred_cls, feed_dict=feed_dict_test)
print("true test labels")
print(cls_pred)
print("predicted labels")
print(y_test)

true test labels
[0 2 2 ..., 1 3 3]
predicted labels
[4 4 2 ..., 1 3 3]


In [26]:
score = metrics.accuracy_score(y_test, cls_pred)
print("acc:" + str(score))

acc:0.872032426173
