In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

In [None]:
# Pipeline
from sklearn.pipeline import Pipeline

# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Metrics
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# from sklearn.datasets import fetch_20newsgroups

# train_bunch = fetch_20newsgroups(subset='train')
# print('train_bunch size :', len(train_bunch))

# test_bunch = fetch_20newsgroups(subset='test')
# print('test_bunch :', len(test_bunch))

In [None]:
# Ref.: http://scikit-learn.org/stable/datasets/index.html#loading-from-external-datasets
# scikit-learn's datasets.load_files reads a directory of text files in which the name of
# each sub-directory is the name of a category and each file inside a sub-directory
# corresponds to one sample from that category.
from sklearn.datasets import load_files

# load_files shuffles the samples by default, so no extra shuffling is done here.
train_bunch = load_files('data/20news-bydate-train')
test_bunch = load_files('data/20news-bydate-test')

# len() of a bunch is small (5 in the recorded run) -- it is not the number of
# documents; the document counts come from the `.data` lists.
print(len(train_bunch), len(test_bunch), sep='\n') # 5, 5

In [None]:
# Show which fields the loaded training bunch exposes.
train_bunch.keys() # dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [None]:
# Container type of the raw documents, then the train and test document counts.
print(type(train_bunch.data)) # list
print(*(len(bunch.data) for bunch in (train_bunch, test_bunch)), sep='\n') # 11314, 7532

In [None]:
# Summarise the training bunch: its description, the number of documents and
# categories, and every category name together with its position in target_names.
print('Description :',train_bunch.DESCR)
print("%d documents" % len(train_bunch.filenames)) # 11314 documents
print("%d categories" % len(train_bunch.target_names)) # 20 categories
print("\nCategories :")
for i,val in enumerate(train_bunch.target_names): # List of possible categories in dataset
    print(i, ' : ',val)

In [None]:
# Peek at the first 20 entries of the training target array.
print(train_bunch.target[:20])

In [None]:
# Type of the container that holds the raw training documents.
type(train_bunch.data)

In [None]:
# Type of a single raw document: bytes, so it must be decoded before text processing.
type(train_bunch.data[0]) # bytes

In [None]:
# Bag of Words: learn the vocabulary and build the document-term count matrix.
cvec = CountVectorizer()
# The raw documents are bytes, so decode them to str first; the 'ignore' error
# handler silently drops any byte sequence that is not valid UTF-8.
# A list is used instead of a lazy map() iterator so that `data` is still usable
# after fit_transform() has read it (a map object can only be consumed once).
data = [d.decode('utf-8', 'ignore') for d in train_bunch.data]
xtrain_counts = cvec.fit_transform(data)
xtrain_counts.shape # (11314, 130093)

In [None]:
# Re-weight the raw occurrence counts into TF-IDF features.
tfidtrans = TfidfTransformer().fit(xtrain_counts)
xtrain_tfid = tfidtrans.transform(xtrain_counts)
xtrain_tfid.shape # (11314, 130093)

### Build Pipeline with Multinomial Naive Bayes Classifier

In [None]:
# Chain vectorisation, TF-IDF weighting and the Naive Bayes classifier so that the
# same preprocessing is applied when fitting and when predicting.
# Every step gets its own fresh estimator: Pipeline fits the objects it is given in
# place, so passing the already-fitted `cvec` / `tfidtrans` would refit those shared
# instances and couple this pipeline to any other pipeline that reuses them.
pclf = Pipeline([
    ('cvec', CountVectorizer()),
    ('tfidtrans', TfidfTransformer()),
    ('nbclf', MultinomialNB())
])

# Decode bytes -> str (undecodable bytes are dropped). A list stays reusable after
# fitting, unlike a one-shot map() iterator.
data = [d.decode('utf-8', 'ignore') for d in train_bunch.data]

pclf.fit(data, train_bunch.target)

In [None]:
# Evaluate the Naive Bayes pipeline on the test documents.
# Decode into a list (not a lazy map() iterator) so `testdata` is still usable
# after predict() has read it.
testdata = [d.decode('utf-8', 'ignore') for d in test_bunch.data]
preds = pclf.predict(testdata)

# Accuracy: fraction of test documents whose predicted label equals the true label.
acc = np.mean(preds == test_bunch.target)
print('NB Prediction Accuracy :', acc) # NB Prediction Accuracy : 0.7738980350504514

# Per-category report, labelled with the category names, then the confusion matrix.
creport = classification_report(test_bunch.target, preds, target_names=train_bunch.target_names)
print(creport)

print( confusion_matrix(test_bunch.target, preds) )
'''
                          precision    recall  f1-score   support

             alt.atheism       0.80      0.52      0.63       319
           comp.graphics       0.81      0.65      0.72       389
 comp.os.ms-windows.misc       0.82      0.65      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       392
   comp.sys.mac.hardware       0.86      0.77      0.81       385
          comp.windows.x       0.89      0.75      0.82       395
            misc.forsale       0.93      0.69      0.80       390
               rec.autos       0.85      0.92      0.88       396
         rec.motorcycles       0.94      0.93      0.93       398
      rec.sport.baseball       0.92      0.90      0.91       397
        rec.sport.hockey       0.89      0.97      0.93       399
               sci.crypt       0.59      0.97      0.74       396
         sci.electronics       0.84      0.60      0.70       393
                 sci.med       0.92      0.74      0.82       396
               sci.space       0.84      0.89      0.87       394
  soc.religion.christian       0.44      0.98      0.61       398
      talk.politics.guns       0.64      0.94      0.76       364
   talk.politics.mideast       0.93      0.91      0.92       376
      talk.politics.misc       0.96      0.42      0.58       310
      talk.religion.misc       0.97      0.14      0.24       251

             avg / total       0.82      0.77      0.77      7532
'''

### Build Pipeline with SVM Classifier (linear SVM trained with SGD via `SGDClassifier`)

In [None]:
# Same preprocessing chain as before, but with SGDClassifier as the final estimator.
# Every step gets its own fresh estimator: Pipeline fits the objects it is given in
# place, so reusing the shared `cvec` / `tfidtrans` instances would refit objects that
# other pipelines also hold.
svm_pclf = Pipeline([
    ('cvec', CountVectorizer()),
    ('tfidtrans', TfidfTransformer()),
    # NOTE(review): the step key 'nbclf' is kept unchanged so existing lookups by step
    # name keep working, even though the estimator here is not Naive Bayes.
    # random_state pins the classifier's randomness so repeated runs give the same model.
    ('nbclf', SGDClassifier(random_state=42))
])

# Decode bytes -> str (undecodable bytes are dropped). A list stays reusable after
# fitting, unlike a one-shot map() iterator.
data = [d.decode('utf-8', 'ignore') for d in train_bunch.data]

svm_pclf.fit(data, train_bunch.target)

In [None]:
# Evaluate the SGD-trained pipeline on the test documents.
# Decode into a list (not a lazy map() iterator) so `testdata` is still usable
# after predict() has read it.
testdata = [d.decode('utf-8', 'ignore') for d in test_bunch.data]
preds = svm_pclf.predict(testdata)

# Accuracy: fraction of test documents whose predicted label equals the true label.
acc = np.mean(preds == test_bunch.target)
# The label names this model (it previously said "NB", copied from the Naive Bayes cell).
print('SVM Prediction Accuracy :', acc) # recorded run: 0.851035581518853

# Per-category report, labelled with the category names, then the confusion matrix.
creport = classification_report(test_bunch.target, preds, target_names=train_bunch.target_names)
print(creport)

print( confusion_matrix(test_bunch.target, preds) )
'''
                          precision    recall  f1-score   support

             alt.atheism       0.82      0.77      0.79       319
           comp.graphics       0.76      0.79      0.77       389
 comp.os.ms-windows.misc       0.76      0.74      0.75       394
comp.sys.ibm.pc.hardware       0.73      0.74      0.74       392
   comp.sys.mac.hardware       0.83      0.86      0.84       385
          comp.windows.x       0.86      0.77      0.81       395
            misc.forsale       0.82      0.91      0.86       390
               rec.autos       0.92      0.89      0.90       396
         rec.motorcycles       0.94      0.95      0.95       398
      rec.sport.baseball       0.91      0.94      0.93       397
        rec.sport.hockey       0.96      0.97      0.97       399
               sci.crypt       0.93      0.95      0.94       396
         sci.electronics       0.85      0.76      0.80       393
                 sci.med       0.91      0.88      0.90       396
               sci.space       0.88      0.95      0.92       394
  soc.religion.christian       0.83      0.94      0.88       398
      talk.politics.guns       0.75      0.92      0.83       364
   talk.politics.mideast       0.97      0.91      0.94       376
      talk.politics.misc       0.87      0.61      0.72       310
      talk.religion.misc       0.70      0.61      0.65       251

             avg / total       0.85      0.85      0.85      7532
'''