In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_recall_fscore_support

# Load data

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [3]:
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [4]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

In [5]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)

In [6]:
vectorizer = TfidfVectorizer()

In [7]:
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [8]:
vectors.shape

(2034, 34118)

In [9]:
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

In [10]:
vectors_test = vectorizer.transform(newsgroups_test.data)

# Naive Bayes

In [11]:
clf = MultinomialNB(alpha=.01)

In [12]:
clf.fit(vectors, newsgroups_train.target)

MultinomialNB(alpha=0.01)

In [13]:
pred_nb = clf.predict(vectors_test)

In [14]:
precision_recall_fscore_support(pred_nb, newsgroups_test.target, average='weighted')

(0.8969152610000587, 0.893569844789357, 0.8946148597135886, None)

# SVM

In [15]:
clf_svm = svm.SVC(kernel='linear')

In [16]:
clf_svm.fit(vectors, newsgroups_train.target)

SVC(kernel='linear')

In [17]:
pred_svm = clf_svm.predict(vectors_test)

In [18]:
precision_recall_fscore_support(pred_svm, newsgroups_test.target, average='weighted')

(0.8923930338303089, 0.8891352549889135, 0.8903566026007271, None)

# Decision Tree

In [19]:
clf_dt = DecisionTreeClassifier(criterion="entropy", random_state=0)

In [20]:
clf_dt.fit(vectors, newsgroups_train.target)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [21]:
pred_dt = clf_dt.predict(vectors_test)

In [22]:
precision_recall_fscore_support(pred_dt, newsgroups_test.target, average='weighted')

(0.6663319483751166, 0.6681448632668144, 0.6671833415344087, None)

# Random Forest

In [27]:
clf_rf = RandomForestClassifier(random_state=0, criterion="entropy", bootstrap=False)

In [28]:
clf_rf.fit(vectors, newsgroups_train.target)

RandomForestClassifier(bootstrap=False, criterion='entropy', random_state=0)

In [29]:
pred_rf = clf_rf.predict(vectors_test)

In [30]:
precision_recall_fscore_support(pred_rf, newsgroups_test.target, average='weighted')

(0.8507348197321034, 0.835920177383592, 0.8397643848414655, None)