# EX. 15

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
train = fetch_20newsgroups(subset='train', shuffle=True)
print(train.target_names)
print('Train set size: %s ' % train.target.size)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Train set size: 11314 


In [3]:
test = fetch_20newsgroups(subset='test', shuffle=True)
print(test.target_names)
print('Test set size %s ' % test.target.size)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Test set size 7532 


In [4]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

In [6]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
pipe_clf = Pipeline([
    # ('vect')
    ('vect', stemmed_count_vect), 
    ('tfidf', TfidfTransformer()), 
    ('dtc', DecisionTreeClassifier())
])

In [8]:
parameters = {
    'vect__ngram_range': [(1, 1),(1, 2)],  
    'tfidf__use_idf': (True, False)
#     'dtc__max_depth': (20,40)
}

In [16]:
gs_clf = GridSearchCV(pipe_clf, parameters, n_jobs=-1)

In [17]:
gs_clf = gs_clf.fit(train.data, train.target)
print("Training Accuracy: %s" % gs_clf.best_score_) 
print("Best param: %s" % gs_clf.best_params_) 

Best score: 0.6499911613929644
Best param: {'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [20]:
predicted_gs =  gs_clf.predict(test.data)
print("Test Accuracy: %s" % np.mean(predicted_gs == test.target))

Test Accuracy: 0.573287307488051


In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
rand_clf = Pipeline([
    # ('vect')
    ('vect', stemmed_count_vect), 
    ('tfidf', TfidfTransformer()), 
    ('rfc', RandomForestClassifier())
])

In [17]:
rand_parameters = {
    'vect__ngram_range': [(1, 1),(1, 2)],  
    'tfidf__use_idf': (True, False)
#     'dtc__max_depth': (20,40)
}

In [18]:
rand_gs_clf = GridSearchCV(rand_clf, rand_parameters, n_jobs=-1)

In [19]:
rand_gs_clf = rand_gs_clf.fit(train.data, train.target)
print("Training Accuracy: %s" % rand_gs_clf.best_score_) 
print("Best param: %s" % rand_gs_clf.best_params_) 

Training Accuracy: 0.7014318543397561
Best param: {'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}


In [20]:
predicted_rand_gs =  rand_gs_clf.predict(test.data)
print("Test Accuracy: %s" % np.mean(predicted_rand_gs == test.target))

Test Accuracy: 0.6321030270844398


In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
NB_clf = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [27]:
NB_parameters = {
    'vect__ngram_range': [(1, 1),(1, 2)],  
    'tfidf__use_idf': (True, False), 
  'clf__alpha': np.linspace(0.5, 1.5, 6),
  'clf__fit_prior': [True, False],  
}

In [28]:
nb_gs_clf = GridSearchCV(NB_clf, NB_parameters, n_jobs=-1)

In [29]:
nb_gs_clf = nb_gs_clf.fit(train.data, train.target)
print("Training Accuracy: %s" % nb_gs_clf.best_score_) 
print("Best param: %s" % nb_gs_clf.best_params_) 

Training Accuracy: 0.8875729185080431
Best param: {'clf__alpha': 0.5, 'clf__fit_prior': False, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [30]:
predicted_nb_gs =  nb_gs_clf.predict(test.data)
print("Test Accuracy: %s" % np.mean(predicted_nb_gs == test.target))

Test Accuracy: 0.8183749336165693


In [5]:
from sklearn.linear_model import SGDClassifier

In [6]:
SGD_clf = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42,
                     max_iter=5, tol=None)),
])

In [7]:
SGD_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [8]:
SGD_gs_clf = GridSearchCV(SGD_clf, SGD_parameters, n_jobs=-1)

In [9]:
SGD_gs_clf = SGD_gs_clf.fit(train.data, train.target)
print("Training Accuracy: %s" % SGD_gs_clf.best_score_) 
print("Best param: %s" % SGD_gs_clf.best_params_) 

Training Accuracy: 0.8932296270107831
Best param: {'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [10]:
predicted_SGD_gs =  SGD_gs_clf.predict(test.data)
print("Test Accuracy: %s" % np.mean(predicted_SGD_gs == test.target))

Test Accuracy: 0.8299256505576208
