In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
import numpy as np

In [16]:
categories = ['rec.motorcycles', 'rec.sport.baseball',
                'comp.graphics', 'sci.space',
                'talk.politics.mideast']
remove = ("headers", "footers", "quotes")
ng5_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
ng5_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

## Exploring Data

In [4]:
print("\n".join(ng5_train.data[0].split("\n")))


I'd be willing to make two wagers:
1) Snow doesn't win ROY.
2) Mattingly is out of baseball within five years.

I'm skeptical of the first, because I don't think Snow is that good a
player, and he is on a losing team.

I'm skeptical of the second because of his back.  Mattingly is 32 this
year, and how many players play until they are 40?  Not too many, and
most of them didn't have chronic back problems when they were 32.

Could be wrong on either or both, but I think that's the smart way to
bet...


In [5]:
print(ng5_train.target_names[ng5_train.target[0]])

rec.sport.baseball


## Third step : Select n samples from non labeled data U

In [6]:
## Create a pipeline to make it simpler
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
])

# Start the loop process

In [26]:
from sklearn.utils import Bunch

length = len(ng5_train.data)
seed = Bunch(data=ng5_train.data[0:1500], target=ng5_train.target[0:1500])
unlabeled = Bunch(data=ng5_train.data[1500:length-1], target=ng5_train.target[1500:length-1])

In [27]:
# number of samples
n = 100
accuracy = []

while unlabeled.data:
     text_clf.fit(seed.data, seed.target)

     # predicted classes and corresponding probabilities
     predicted = text_clf.predict(unlabeled.data)
     predicted_proba = text_clf.predict_proba(unlabeled.data)
     accuracy.append(np.mean(predicted == unlabeled.target))

     ## Least Confidence (aka. Uncertainty) Strategy
     uncertainty = 1 - predicted_proba.max(axis=1)
     #uncertainty.size

     if len(unlabeled.data) > n : 
          # index of top n uncertainty score
          ind = np.argpartition(uncertainty, -n)[-n:]
          #uncertainty[ind]

          seed.data = seed.data + [unlabeled.data[i] for i in ind]
          seed.target = np.append(seed.target,unlabeled.target[ind])
          unlabeled.data = [unlabeled.data[i] for i in range(len(unlabeled.data)) if i not in ind]
          #unlabeled.target = [unlabeled.target[i] for i in range(len(unlabeled.target)) if i not in ind]
          unlabeled.target = np.delete(unlabeled.target, ind)
     else :
          seed.data = seed.data + unlabeled.data
          seed.target = np.append(seed.target, unlabeled.target)
          text_clf.fit(seed.data, seed.target)
          break

In [28]:
accuracy

[0.8494773519163763,
 0.899625468164794,
 0.9392712550607287,
 0.9709251101321585,
 0.9835748792270531,
 0.9946524064171123,
 0.9976047904191617,
 0.9972789115646259,
 0.9984251968503937,
 0.9981308411214953,
 0.9977011494252873,
 1.0,
 1.0,
 1.0,
 1.0]