In [59]:
from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
import pandas as pd

# Two newsgroups are compared in this experiment.
categories = ['talk.religion.misc', 'sci.space']

# Passed to fetch_20newsgroups as ``remove=`` for both subsets.
remove = ('headers', 'footers', 'quotes')

# Load the train subset first, then the test subset, with identical settings
# (same categories, same shuffle seed, same ``remove`` tuple).
data_train, data_test = (
    fetch_20newsgroups(subset=subset, categories=categories,
                       shuffle=True, random_state=42, remove=remove)
    for subset in ('train', 'test')
)

# From here on, use the category names as reported by the loaded training set.
categories = data_train.target_names

y_train = data_train.target
y_test = data_test.target



#print(" with chi2  n_samples: %d, n_features: %d" % X_train.shape)

#feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]

#feature_names = np.asarray(feature_names)

#print(feature_names)

# Vocabulary sizes to sweep: 50, 100, ..., 950.
n_features = np.arange(50, 1000, 50)


def benchmark(clf):
    """Train and score ``clf`` once for every vocabulary size in ``n_features``.

    For each size a fresh TfidfVectorizer (capped with ``max_features``) and a
    chi2 SelectKBest step are fitted on ``data_train``; the same ``clf``
    instance is then refitted and evaluated on ``data_test``.

    Returns a list with one tuple per vocabulary size:
    ``(n, classifier_name, accuracy, train_time, test_time)``.
    ``train_time`` spans vectorizing the training text, the selection step and
    ``clf.fit``; ``test_time`` spans only ``clf.predict``.
    """
    rows = []
    for vocab_size in n_features:
        tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                stop_words='english',
                                max_features=vocab_size)

        # --- timed training phase -------------------------------------
        started = time()
        train_matrix = tfidf.fit_transform(data_train.data)
        # k='all' is passed through unchanged from the original setup.
        selector = SelectKBest(chi2, k='all')
        train_matrix = selector.fit_transform(train_matrix, y_train)
        clf.fit(train_matrix, y_train)
        train_time = time() - started

        # Test features are prepared outside of both timing windows.
        test_matrix = selector.transform(tfidf.transform(data_test.data))

        # --- timed prediction phase -----------------------------------
        started = time()
        predictions = clf.predict(test_matrix)
        test_time = time() - started

        accuracy = metrics.accuracy_score(y_test, predictions)
        # Text of str(clf) before the first '(' is used as the model label.
        model_name = str(clf).split('(')[0]
        rows.append((vocab_size, model_name, accuracy, train_time, test_time))
    return rows

# Each benchmark() call returns one list of per-vocabulary-size result tuples,
# so ``results`` ends up as a list of lists (one inner list per classifier).
results = []

results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
for penalty in ["l2", "l1"]:

    # Train Liblinear model.
    # The former ``loss='l2'`` argument was a deprecated alias that current
    # scikit-learn releases reject. The squared hinge loss it selected is
    # LinearSVC's default, so the argument is omitted to stay valid on both
    # old and new versions.
    results.append(benchmark(LinearSVC(penalty=penalty,
                                       dual=False, tol=1e-3)))

    
# Disabled bar-chart code. The triple-quoted string below is a bare expression
# statement, so none of the statements inside it are executed; it is kept only
# for reference.
"""   

#print((results))

indices = np.arange(len(results))
#print(indices)

results = [[x[i] for x in results] for i in range(5)]


number, clf_names, score, training_time, test_time = results
#training_time = np.array(training_time) / np.max(training_time)
#test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, number, 50, label="number", color='r')
plt.barh(indices, score, .2, label="score", color='r')
#plt.barh(indices + .3, training_time, .2, label="training time", color='g')
#plt.barh(indices + .6, test_time, .2, label="test time", color='b')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)

for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

plt.show()

"""



# Flatten the per-classifier result lists into one table, one row per
# (classifier, vocabulary size) run.
nfeatures_plot_tgt = pd.DataFrame(
    [row for per_model in results for row in per_model],
    columns=['nfeatures', 'model', 'accuracy', 'train_time', 'test_time'],
)
#print(nfeatures_plot_tgt.head(5))


# Disabled accuracy-vs-feature-count plot. The triple-quoted string below is a
# bare expression statement, so the plotting calls inside it never run.
# NOTE(review): in a notebook the value of a cell's final expression is
# echoed, which appears to be why this text shows up as cell output.
"""
MNB = nfeatures_plot_tgt.loc[nfeatures_plot_tgt['model'] == 'MultinomialNB']
NB = nfeatures_plot_tgt.loc[nfeatures_plot_tgt['model'] == 'BernoulliNB']
#SVC = nfeatures_plot_tgt.loc[nfeatures_plot_tgt['model'] == 'LinearSVC'] 




plt.figure(figsize=(8,6))
plt.plot(MNB.nfeatures, MNB.accuracy,label='MNB ',color='royalblue')
plt.plot(NB.nfeatures, NB.accuracy,label='NB ',color='red')
#plt.plot(SVC.nfeatures, SVC.accuracy,label='SVC ',color='green')

plt.title("Multinomial and BernoulliNB")
plt.xlabel("Number of features")
plt.ylabel("Test set accuracy")
plt.legend()



"""







   nfeatures          model  accuracy  train_time  test_time
0         50  MultinomialNB  0.800000    0.633036      0.001
1        100  MultinomialNB  0.792248    0.531031      0.000
2        150  MultinomialNB  0.821705    0.513029      0.001
3        200  MultinomialNB  0.838760    0.678039      0.001
4        250  MultinomialNB  0.844961    0.587034      0.000


'\nMNB = nfeatures_plot_tgt.loc[nfeatures_plot_tgt[\'model\'] == \'MultinomialNB\']\nNB = nfeatures_plot_tgt.loc[nfeatures_plot_tgt[\'model\'] == \'BernoulliNB\']\n#SVC = nfeatures_plot_tgt.loc[nfeatures_plot_tgt[\'model\'] == \'LinearSVC\'] \n\n\n\n\nplt.figure(figsize=(8,6))\nplt.plot(MNB.nfeatures, MNB.accuracy,label=\'MNB \',color=\'royalblue\')\nplt.plot(NB.nfeatures, NB.accuracy,label=\'NB \',color=\'red\')\n#plt.plot(SVC.nfeatures, SVC.accuracy,label=\'SVC \',color=\'green\')\n\nplt.title("Multinomial and BernoulliNB")\nplt.xlabel("Number of features")\nplt.ylabel("Test set accuracy")\nplt.legend()\n\n\n\n'