In [1]:
# SAMPLE CODE FROM TOWARDS DATA SCIENCE
# USING SVM FOR CLASSIFYING DOCUMENTS
# DATA FROM http://qwone.com/~jason/20Newsgroups/

import numpy as np

from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV


In [2]:
# OBTAIN
# Load the shuffled 'train' and 'test' subsets of the 20 Newsgroups corpus.
# The generator is consumed left to right, so the train split is fetched
# first and the test split second.

twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

In [3]:
# EXPLORE
# CHECK TO SEE WHAT AN ENTRY LOOKS LIKE
# Print only the first 20 lines of training document 5, one per output line.

print(*twenty_train.data[5].split("\n")[:20], sep="\n")

From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)
Subject: Re: Rewording the Second Amendment (ideas)
Organization: VTT
Lines: 58

In article <1r1eu1$4t@transfer.stratus.com> cdt@sw.stratus.com (C. D. Tavares) writes:
>In article <1993Apr20.083057.16899@ousrvr.oulu.fi>, dfo@vttoulu.tko.vtt.fi (Foxvog Douglas) writes:
>> In article <1qv87v$4j3@transfer.stratus.com> cdt@sw.stratus.com (C. D. Tavares) writes:
>> >In article <C5n3GI.F8F@ulowell.ulowell.edu>, jrutledg@cs.ulowell.edu (John Lawrence Rutledge) writes:
>
>> >> The massive destructive power of many modern weapons, makes the
>> >> cost of an accidental or crimial usage of these weapons to great.
>> >> The weapons of mass destruction need to be in the control of
>> >> the government only.  Individual access would result in the
>> >> needless deaths of millions.  This makes the right of the people
>> >> to keep and bear many modern weapons non-existant.

>> >Thanks for stating where you're coming from.  Needless to say, I
>> >disagree 

In [4]:
# EXPLORE
# SHOW THE CURRENT TARGET NAMES
# These are the newsgroup label strings carried by the loaded training set;
# as the cell's last expression the list is echoed as the cell output.

twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# EXPLORE
# EXTRACT FEATURES - DOCUMENT TERM MATRIX
# Fit a CountVectorizer on the raw training documents and transform them
# into a document-term count matrix in one step. The trailing `.shape`
# is echoed so the matrix dimensions can be inspected.

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(11314, 130107)

In [6]:
# EXPLORE
# EXTRACT FEATURES - INVERSE DOCUMENT FREQUENCY
# Fit a TfidfTransformer on the count matrix from the previous cell and
# re-weight it. The trailing `.shape` is echoed so it can be compared with
# the shape of the count matrix.

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [7]:
# MODEL
# BUILD A PIPELINE
# Alternative to running the vectorizer and TF-IDF steps by hand: chain
# token counting, TF-IDF weighting and the classifier into one estimator so
# raw text can be passed straight to fit() / predict().
# The step names ('vect', 'tfidf', 'clf-svm') are the prefixes used when
# addressing step parameters, e.g. in a grid search.

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Hinge loss + L2 penalty trains a linear SVM with SGD.
    # `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21;
    # `max_iter=5` with `tol=None` (no early stopping) keeps the same
    # fixed 5 passes over the training data.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

# fit() returns the pipeline itself, so the name keeps pointing at the
# (now fitted) estimator.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)




In [8]:
# MODEL
# COMPARE SVM CLASSIFIER AGAINST TEST SET
# Accuracy = fraction of test documents whose predicted label equals the
# true label (mean of the element-wise boolean comparison).

predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.82381837493361654

In [9]:
# MODEL
# USE GRID SEARCH TO FIND BEST PARAMETERS FOR SVM CLASSIFIER
# Grid keys follow the '<pipeline step>__<parameter>' convention, so each
# entry tunes one parameter of one step of `text_clf`.

parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

# n_jobs=-1 runs the search on all available CPU cores.
gs_clf_svm = GridSearchCV(text_clf, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)

# Only the last expression of a cell is echoed, so the score has to be
# printed explicitly; previously it was evaluated and silently discarded.
print("Best cross-validation score:", gs_clf_svm.best_score_)
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [10]:
# MODEL
# USE BEST PARAMETERS FOR COMPARING ON TEST SET
# Predict through the fitted grid-search object and report the fraction of
# test documents labelled correctly.

predicted = gs_clf_svm.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.83311205523101439