# briefly explore the dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load the 20 newsgroups corpus through scikit-learn, shuffled with a fixed
# seed so the document order is the same on every run.
# The commented-out list below is a four-category subset that is not used here.
# categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_data = fetch_20newsgroups(shuffle=True, random_state=42)

In [3]:
# Inspect the container type returned by fetch_20newsgroups.
type(twenty_data)

sklearn.datasets.base.Bunch

In [5]:
# Category names of the corpus; the integer targets are used as indices into this list.
target_names = twenty_data.target_names

In [24]:
# Show the name of the first category (index 0).
# Written as a function call so the cell runs on both Python 2 and Python 3.
print(target_names[0])

alt.atheism


In [6]:
# Translate every document's integer target into its category name,
# then preview the labels of the first ten documents.
all_categories = [twenty_data.target_names[t] for t in twenty_data.target]
for s in all_categories[:10]:
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(s)

rec.autos
comp.sys.mac.hardware
comp.sys.mac.hardware
comp.graphics
sci.space
talk.politics.guns
sci.med
comp.sys.ibm.pc.hardware
comp.os.ms-windows.misc
comp.sys.mac.hardware


# train and export classifiers

In [7]:
from sklearn.cross_validation import train_test_split

# Hold out 30% of the documents for evaluation.
# The split is seeded (same seed as the dataset shuffle and the SGD classifier)
# so the exported models and their reported metrics can be reproduced.
twenty_train, twenty_test, y_train, y_test = train_test_split(
    twenty_data.data, twenty_data.target, test_size=0.3, random_state=42)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Token counts: the vectorizer is fitted on the training documents only.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(twenty_train)
# Turn the raw counts into TF-IDF features; the transformer is likewise
# fitted on the training counts only.
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

In [16]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default settings, fitted on the training TF-IDF features.
export_clf_nb = MultinomialNB().fit(train_tfidf, y_train)

In [19]:
from sklearn.linear_model import SGDClassifier
# SGD-trained linear classifier with hinge loss and L2 penalty, fitted on the
# same training TF-IDF features; random_state is fixed for repeatable training.
export_clf_svm = (SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, n_iter=5, random_state=42)
                            .fit(train_tfidf, y_train))

In [22]:
import pickle

# Persist what is needed to classify new text later, in this order:
# fitted vectorizer, fitted TF-IDF transformer, category names,
# and the list [naive Bayes classifier, SGD classifier].
with open('export.pickle', 'wb') as f:
    pickle.dump([count_vect, tfidf_transformer, target_names, [export_clf_nb, export_clf_svm]], f)

In [25]:
# Sanity-check an exported classifier on two hand-written documents.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
# Name the classifier explicitly instead of relying on a `clf` variable left
# over from a loop in another cell, which is undefined on a fresh kernel.
# NOTE(review): export_clf_svm is assumed to be the intended model (it is the
# last value the evaluation loop assigns to `clf`); use export_clf_nb to
# check the naive Bayes model instead.
predicted_new = export_clf_svm.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted_new):
    print('%r => %s' % (doc, target_names[category]))
    
##TypeError: only integer arrays with one element can be converted to an index

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos


In [21]:
from sklearn import metrics

# Evaluate both exported classifiers on the held-out documents.
# The report is produced directly with sklearn.metrics so this cell does not
# depend on a helper that is only defined further down the notebook.
x_test = tfidf_transformer.transform(count_vect.transform(twenty_test))
for clf in [export_clf_nb, export_clf_svm]:
    print(metrics.classification_report(y_test, clf.predict(x_test), target_names=target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.72      0.77       133
           comp.graphics       0.90      0.71      0.79       183
 comp.os.ms-windows.misc       0.84      0.89      0.86       172
comp.sys.ibm.pc.hardware       0.76      0.80      0.78       184
   comp.sys.mac.hardware       0.92      0.80      0.86       186
          comp.windows.x       0.94      0.81      0.87       197
            misc.forsale       0.93      0.62      0.74       194
               rec.autos       0.87      0.87      0.87       184
         rec.motorcycles       0.93      0.97      0.95       179
      rec.sport.baseball       0.96      0.93      0.94       185
        rec.sport.hockey       0.90      0.98      0.94       184
               sci.crypt       0.58      0.98      0.73       165
         sci.electronics       0.83      0.76      0.80       165
                 sci.med       0.97      0.89      0.93       166
         

# use pipeline

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Same three stages as the manual version above (counts -> TF-IDF -> naive
# Bayes), chained in a Pipeline so raw text can be passed straight to fit/predict.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])
# Fit the whole pipeline on the raw training documents.
text_clf_test4 = text_clf.fit(twenty_train, y_train)

In [8]:
from sklearn import metrics

# Predict the held-out documents with the naive Bayes pipeline and print the
# per-category precision / recall / F1 report.
predicted_test4 = text_clf_test4.predict(twenty_test)
print(metrics.classification_report(y_true=y_test, y_pred=predicted_test4, target_names=twenty_data.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.99      0.71      0.83       200
         comp.graphics       0.99      0.87      0.93       249
               sci.med       0.92      0.93      0.93       215
soc.religion.christian       0.72      0.97      0.83       239

           avg / total       0.90      0.88      0.88       903



In [9]:
from sklearn.linear_model import SGDClassifier
# Pipeline variant whose final step is an SGD-trained linear classifier
# (hinge loss, L2 penalty) instead of naive Bayes.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf_svm', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-4, n_iter=5, random_state=42))
                        ])

# Fit on the raw training documents, then predict the held-out documents.
text_clf_svm_test4 = text_clf_svm.fit(twenty_train, y_train)
predicted_svm_test4 = text_clf_svm_test4.predict(twenty_test)

# Per-category report for the SGD pipeline, for comparison with the naive Bayes one.
print(metrics.classification_report(y_true=y_test, y_pred=predicted_svm_test4, target_names=twenty_data.target_names))


                        precision    recall  f1-score   support

           alt.atheism       0.99      0.95      0.97       200
         comp.graphics       0.93      0.97      0.95       249
               sci.med       0.96      0.93      0.94       215
soc.religion.christian       0.95      0.96      0.95       239

           avg / total       0.96      0.95      0.95       903



In [10]:
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from os import listdir
from os.path import isfile, join
import chardet

def load_split_data(datadir, test_size=0.4, random_state=None):
    """Load a directory of text documents and split it into train and test sets.

    The encoding handed to ``load_files`` is guessed by running ``chardet`` on
    the raw bytes of the first regular file found under ``datadir``.
    Sub-directories are searched too, so a layout with one folder per category
    still yields a sample file.

    Args:
        datadir: Root directory passed to ``load_files``.
        test_size: Forwarded to ``train_test_split`` (default 0.4).
        random_state: Forwarded to ``train_test_split``. The default ``None``
            leaves the split unseeded; pass an int for a repeatable split.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test, target_names)`` where
        ``target_names`` comes from the loaded dataset.
    """
    import os  # local import: the surrounding cell only imports listdir/isfile/join

    # Stays None when no file is found or chardet cannot decide; the value is
    # passed to load_files unchanged in that case.
    char_encoding = None
    for root, dirnames, filenames in os.walk(datadir):
        dirnames.sort()  # visit sub-directories in a stable order
        if filenames:
            sample_path = os.path.join(root, sorted(filenames)[0])
            # Detect from the file's bytes: running the detector on the file
            # *name* says nothing about the encoding of its content.
            with open(sample_path, 'rb') as sample:
                char_encoding = chardet.detect(sample.read())["encoding"]
            break

    print("============================")
    print("read files from " + datadir)
    print("file encoding is " + str(char_encoding))

    dataset = load_files(datadir, encoding=char_encoding, decode_error="ignore")

    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size, random_state=random_state)

    return (x_train, x_test, y_train, y_test, dataset.target_names)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

def pipeline_train_classifier(classifier, x_train, y_train):
    """Build and fit a counts -> TF-IDF -> classifier text pipeline.

    Args:
        classifier: ``"nb"`` for ``MultinomialNB`` (pipeline step ``'clf'``) or
            ``"svm"`` for a hinge-loss ``SGDClassifier`` (pipeline step ``'clf_svm'``).
        x_train: Raw training documents passed to ``Pipeline.fit``.
        y_train: Training labels passed to ``Pipeline.fit``.

    Returns:
        Whatever ``Pipeline.fit`` returns for the assembled pipeline.

    Raises:
        ValueError: If ``classifier`` is neither ``"nb"`` nor ``"svm"``.
            Previously an unknown name fell through and returned ``None``.
    """
    # Pick only the final step here; the two feature-extraction steps are shared.
    if classifier == "nb":
        final_step = ('clf', MultinomialNB())
    elif classifier == "svm":
        final_step = ('clf_svm', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-4, n_iter=5, random_state=42))
    else:
        raise ValueError("unknown classifier %r; expected 'nb' or 'svm'" % (classifier,))

    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         final_step,
                        ])
    return text_clf.fit(x_train, y_train)

In [12]:
from sklearn import metrics

def classifier_metrics(clf, x_test, y_test, target_names=None):
    """Return the classification report of ``clf`` on a test set.

    Args:
        clf: Object with a ``predict`` method; it is called on ``x_test``.
        x_test: Test inputs handed to ``clf.predict``.
        y_test: True labels for ``x_test``.
        target_names: Optional label names forwarded to the report.

    Returns:
        The value returned by ``metrics.classification_report``.
    """
    y_predicted = clf.predict(x_test)
    report = metrics.classification_report(y_test, y_predicted, target_names=target_names)
    return report

In [9]:
# Load the corpus from the local ./twentydata/ directory and split it with the
# helper's default test_size. This rebinds y_train / y_test and target_names.
x_train, x_test, y_train, y_test, target_names = load_split_data(datadir="./twentydata/")

In [10]:
# Train one pipeline per classifier type on the documents loaded from disk.
clf_nb = pipeline_train_classifier("nb", x_train, y_train)
clf_svm = pipeline_train_classifier("svm", x_train, y_train)

In [11]:
# Print the per-category report of each pipeline trained on the on-disk data.
# print() with a single argument behaves the same on Python 2 and Python 3.
for clf in [clf_nb, clf_svm]:
    print(classifier_metrics(clf, x_test, y_test, target_names=target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.93      0.76      0.84       197
           comp.graphics       0.89      0.77      0.83       222
 comp.os.ms-windows.misc       0.88      0.75      0.81       237
comp.sys.ibm.pc.hardware       0.62      0.86      0.72       219
   comp.sys.mac.hardware       0.83      0.84      0.83       213
          comp.windows.x       0.90      0.88      0.89       228
            misc.forsale       0.95      0.69      0.80       226
               rec.autos       0.82      0.91      0.86       233
         rec.motorcycles       0.96      0.92      0.94       249
      rec.sport.baseball       0.96      0.94      0.95       250
        rec.sport.hockey       0.95      0.96      0.95       255
               sci.crypt       0.64      0.97      0.77       235
         sci.electronics       0.96      0.61      0.75       250
                 sci.med       0.96      0.94      0.95       230
         

# train on data without metadata

In [12]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Reload the corpus with headers, footers and quoted replies removed, so the
# classifiers can only learn from the message bodies.
twenty_data = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)

# Seeded 60/40 split so the comparison below can be reproduced.
twenty_train, twenty_test, twentyY_train, twentyY_test = train_test_split(
    twenty_data.data, twenty_data.target, test_size=0.4, random_state=42)

# Reuse the shared helper: it builds exactly the vect -> tfidf -> classifier
# pipelines that were previously spelled out inline here.
text_clf_nb = pipeline_train_classifier("nb", twenty_train, twentyY_train)
text_clf_svm = pipeline_train_classifier("svm", twenty_train, twentyY_train)

for clf in [text_clf_nb, text_clf_svm]:
    # Label the report with this dataset's own category names rather than a
    # `target_names` variable left over from a different dataset.
    print(classifier_metrics(clf, twenty_test, twentyY_test, target_names=twenty_data.target_names))

  'precision', 'predicted', average, warn_for)


                          precision    recall  f1-score   support

             alt.atheism       0.90      0.10      0.18       192
           comp.graphics       0.67      0.50      0.57       222
 comp.os.ms-windows.misc       0.71      0.58      0.63       226
comp.sys.ibm.pc.hardware       0.33      0.84      0.47       219
   comp.sys.mac.hardware       0.98      0.36      0.53       264
          comp.windows.x       0.85      0.75      0.79       248
            misc.forsale       0.85      0.56      0.68       238
               rec.autos       0.81      0.68      0.74       229
         rec.motorcycles       0.93      0.61      0.74       235
      rec.sport.baseball       0.94      0.76      0.84       235
        rec.sport.hockey       0.93      0.83      0.88       243
               sci.crypt       0.38      0.88      0.53       224
         sci.electronics       0.88      0.53      0.66       234
                 sci.med       0.88      0.65      0.75       238
         

In [13]:
import pickle

# Save the two pipelines as a list [naive Bayes, SGD] in clf.pickle.
# NOTE(review): clf_nb / clf_svm are the pipelines trained on ./twentydata/,
# not the text_clf_nb / text_clf_svm trained without metadata just above —
# confirm these are the models meant to be saved.
with open('clf.pickle', 'wb') as f:
    pickle.dump([clf_nb, clf_svm], f)

# grid search for svm

In [21]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.grid_search import GridSearchCV

# Search space for the grid search. Keys follow the `<step>__<param>` naming of
# the pipeline defined below; commented-out entries are options that are
# currently excluded from the search.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.001, 0.0001, 0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

# Pipeline to tune: counts -> TF-IDF -> SGDClassifier with default settings.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Training subset of the corpus, once as-is and once with headers, footers
# and quotes removed.
data = fetch_20newsgroups(subset='train')
data_no_meta = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# A single search object shared by both runs below; n_jobs=-1 and verbose=1
# are passed through to GridSearchCV.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

def gridSearch(data, name):
    """Fit the module-level ``grid_search`` on ``data`` and print a summary.

    Args:
        data: Object exposing ``.data`` (documents) and ``.target`` (labels).
        name: Label used in the first progress line.

    Prints the pipeline step names, the parameter grid, the elapsed fitting
    time, the best score and the best value found for every searched parameter.
    Returns nothing.
    """
    print("Performing grid search for " + name)
    step_names = [step_name for step_name, _ in pipeline.steps]
    print("pipeline:", step_names)
    print("parameters:")
    pprint(parameters)

    started = time()
    grid_search.fit(data.data, data.target)
    elapsed = time() - started
    print("done in %0.3fs" % elapsed)
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the parameters that were searched, in alphabetical order.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Run the same search twice: on the raw posts, then on the posts with
# headers / footers / quotes removed, separated by a divider line.
gridSearch(data, "data with meta")
print("=============")
gridSearch(data_no_meta, "data without meta")

Performing grid search for data with meta
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.001, 0.0001, 1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 18.6min finished


done in 1142.334s

Best score: 0.922
Best parameters set:
	clf__alpha: 0.0001
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)
Performing grid search for data without meta
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.001, 0.0001, 1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 13.0min finished


done in 802.951s

Best score: 0.752
Best parameters set:
	clf__alpha: 0.0001
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


Best score: 0.918
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'elasticnet'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)
        
Best score: 0.734
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'elasticnet'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)

# try xgboost

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Reload both variants of the corpus for the xgboost experiment.
# The commented-out list below is a four-category subset that is not used here.
# categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_data = fetch_20newsgroups(shuffle=True, random_state=42)
# Training subset with headers, footers and quoted replies removed.
data_no_meta = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [None]:
from sklearn.cross_validation import train_test_split

# 70/30 split of the corpus that still contains metadata.
# NOTE(review): the split is unseeded, and the next cell's data_process()
# repeats these same steps on its own input — confirm this cell is still needed.
twenty_train, twenty_test, y_train, y_test = train_test_split(
    twenty_data.data, twenty_data.target, test_size=0.3)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the vectorizer and the TF-IDF transformer on the training documents only.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(twenty_train)
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

# Apply the fitted transforms to the held-out documents.
x_test = tfidf_transformer.transform(count_vect.transform(twenty_test))

In [None]:
import xgboost as xgb

def data_process(data):
    """Split ``data`` 70/30, build TF-IDF features and wrap them for xgboost.

    Args:
        data: Object exposing ``.data`` (documents) and ``.target`` (labels).

    Returns:
        Tuple ``(dtrain, dtest, count_vect, train_tfidf)``: the training and
        test ``xgb.DMatrix`` objects, the fitted ``CountVectorizer`` and the
        TF-IDF matrix of the training documents. The fitted
        ``TfidfTransformer`` itself is not returned.
    """
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        data.data, data.target, test_size=0.3)

    # Both transforms are fitted on the training documents only.
    vectorizer = CountVectorizer()
    weighting = TfidfTransformer()
    features_train = weighting.fit_transform(vectorizer.fit_transform(docs_train))
    features_test = weighting.transform(vectorizer.transform(docs_test))

    matrix_train = xgb.DMatrix(features_train, label=labels_train)
    matrix_test = xgb.DMatrix(features_test, label=labels_test)
    return matrix_train, matrix_test, vectorizer, features_train

# Build the xgboost train/test matrices from the metadata-free corpus.
# The fourth returned value (bound to `tfidf`) is the training TF-IDF matrix.
nometa_train, nometa_test, count_v, tfidf = data_process(data_no_meta)

# Base booster settings: 20 classes with the 'multi:softprob' objective and a fixed seed.
param = {
    'num_class':20,
    'silent':0, 
    'objective':'multi:softprob',
    'seed':42
    }

# Additional booster settings.
param['max_depth'] = 5
param['nthread'] = 4
# NOTE(review): XGBoost documents `learning_rate` as an alias of `eta`; both
# are set in this block with different values (0.3 here, 1 below) — confirm
# which one is intended and drop the other.
param['learning_rate'] = 0.3
# NOTE(review): `n_estimators` looks like the scikit-learn wrapper argument;
# the number of rounds passed to xgb.train below is num_round (50) — confirm
# this entry has any effect.
param['n_estimators'] = 1000
param['min_child_weight'] = 2
param['subsample'] = 0.8
param['eta'] = 1
param['gamma'] = 0.5
param['colsample_bytree'] = 0.8

# Datasets to evaluate during training, each paired with a display name.
evallist  = [(nometa_test,'eval'), (nometa_train,'train')]
num_round = 50
plst = param.items()

# Train for num_round boosting rounds on the metadata-free training matrix.
bst_nometa = xgb.train( plst, nometa_train, num_round, evallist )

In [None]:
import pickle

# Persist the booster trained on the metadata-free corpus.
# pickle.dump needs the object to serialise and the open file; calling it with
# no arguments raises TypeError and leaves an empty 'xgb_nometa' file behind.
with open('xgb_nometa', 'wb') as f:
    pickle.dump(bst_nometa, f)