# Text Classification

# Loading data

In [1]:
# The four newsgroups to load; every later fetch is restricted to these.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split, restricted to the selected categories.
# The documents are shuffled with a fixed seed (random_state=42).
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


# Statistics about the loaded data

In [4]:
# Summarise the training set: the class names, then the number of documents
# and the number of source files (the two counts are expected to match).
# print() is called as a function so the cell runs on Python 3 and matches
# the other cells in this notebook; single-argument print(x) behaves the
# same on Python 2.
print(twenty_train.target_names)
print(len(twenty_train.data))
print(len(twenty_train.filenames))

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
2257
2257


In [5]:
# Show the first three lines of the first loaded file

In [8]:
# Preview the first three lines of the first training document.
first_doc = twenty_train.data[0]
print("\n".join(first_doc.split("\n")[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [9]:
# Translate the first document's integer label into its newsgroup name.
first_label = twenty_train.target[0]
print(twenty_train.target_names[first_label])

comp.graphics


In [10]:
# Display the integer class labels of the first ten training documents;
# each value is an index into twenty_train.target_names.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [11]:
# Print the newsgroup name behind each of the first ten integer labels.
for label_idx in twenty_train.target[:10]:
    print(twenty_train.target_names[label_idx])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


# Representing text files as a matrix

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Build a vectorizer with default settings, then fit it on the training
# documents and encode those same documents in one call. The fitted
# count_vect is reused later to encode the test documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [14]:
# Inspect the dimensions of the count matrix (documents x vocabulary terms)

In [15]:
# Shape of the matrix returned by fit_transform on the training documents.
X_train_counts.shape

(2257, 35788)

In [17]:
# Look up the value the fitted vocabulary stores for the token 'hampton'
# (presumably its column index in the count matrix; see the CountVectorizer docs).
count_vect.vocabulary_.get(u'hampton')

16082

# Training a classifier

In [19]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the training count matrix
# and its labels. fit() returns the estimator itself, so constructing first
# and fitting afterwards leaves clf bound to the same trained model.
clf = MultinomialNB()
clf.fit(X_train_counts, twenty_train.target)

In [20]:
# Score the classifier on the data it was trained on. This is an optimistic
# figure, so compare it with the held-out test score further down.
# print() is called as a function so the cell runs on Python 3 and matches
# the other cells in this notebook.
print(clf.score(X_train_counts, twenty_train.target))

0.996455471865


# Testing the classifier

In [21]:
# Load the test split with the same category filter and shuffle seed
# that were used for the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [23]:
# Encode the test documents with the vectorizer already fitted on the
# training data (transform only, no re-fitting), then score the classifier
# on these held-out documents.
# print() is called as a function so the cell runs on Python 3 and matches
# the other cells in this notebook.
X_test_counts = count_vect.transform(twenty_test.data)
print(clf.score(X_test_counts, twenty_test.target))

0.934087882823


# Getting a detailed performance report

In [25]:
from sklearn import metrics
# Predict a label for every test document, then print a per-class report
# with each row labelled by its newsgroup name.
predicted = clf.predict(X_test_counts)
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.92      0.90      0.91       319
         comp.graphics       0.95      0.95      0.95       389
               sci.med       0.96      0.91      0.93       396
soc.religion.christian       0.91      0.97      0.94       398

           avg / total       0.93      0.93      0.93      1502

