# Text Classification Using SVM

## Import Libraries

In [22]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## Import Dataset

In [23]:
from sklearn.datasets import fetch_20newsgroups

## Extract Training Data

In [24]:
# Load the training split of the 20 Newsgroups corpus. Headers, footers and
# quoted replies are removed from every post before it is returned.
trainData = fetch_20newsgroups(
    subset="train",
    shuffle=True,
    remove=("headers", "footers", "quotes"),
)

In [25]:
# List the newsgroup (class) names; the integer labels in trainData.target
# index into this list.
trainData.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [26]:
# Show the first training document. Splitting on "\n" and re-joining with
# "\n" reproduces the same string, so the text is printed directly.
print(trainData.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


## Extract Features

In [27]:
# Stand-alone bag-of-words step, run here only to inspect the feature matrix.
# NOTE: this vectorizer is built with default arguments (no stop_words), unlike
# the CountVectorizer used inside the pipeline further down.
countVector = CountVectorizer()
# Learn the vocabulary from the training documents and count term occurrences.
xTrainCounts = countVector.fit_transform(trainData.data)

In [28]:
# Report the document-term matrix dimensions: (documents, vocabulary size).
print("Shape: ", xTrainCounts.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases so the
# cell runs on either. The length printed below is the same in both cases.
if hasattr(countVector, "get_feature_names_out"):
    featureNames = countVector.get_feature_names_out()
else:
    featureNames = countVector.get_feature_names()
print(len(featureNames))

Shape:  (11314, 101631)
101631


In [29]:
# Re-weight the raw term counts with TF-IDF; the transformer is fitted on the
# training counts produced by countVector above.
TF_IDF = TfidfTransformer()
xTrainTransform = TF_IDF.fit_transform(xTrainCounts)

In [30]:
# The TF-IDF matrix is expected to have the same shape as xTrainCounts
# (one row per document, one column per vocabulary term).
print("Shape: ", xTrainTransform.shape)
# Debug only, left disabled: print(xTrainTransform.toarray())
# .toarray() would build a dense 11314 x 101631 array; avoid on the full data.

Shape:  (11314, 101631)


## Create Pipeline

In [31]:
# Pipeline stages: token counts -> TF-IDF weighting -> linear classifier.
# The step names ("CV", "TFID", "Class") are the prefixes used by the
# grid-search parameter keys later in the notebook, so they must not change.
steps = [
    ("CV", CountVectorizer(stop_words="english")),
    ("TFID", TfidfTransformer()),
    # SGDClassifier's default hinge loss gives a linear SVM trained with SGD.
    # SGD shuffles the training data each epoch; fixing random_state makes the
    # reported accuracy and the grid-search results reproducible across runs.
    ("Class", SGDClassifier(random_state=42)),
]

In [32]:
# Chain the stages into one estimator that accepts raw text documents.
textClassifier = Pipeline(steps)

In [33]:
# Fit every stage on the raw training text and its labels. fit() returns the
# pipeline itself, so the reassignment keeps the same object (and stops the
# notebook from echoing the estimator repr as cell output).
textClassifier = textClassifier.fit(trainData.data, trainData.target)

## Test Model

In [34]:
# Load the held-out test split with the same stripping options as the
# training split, so both sets are preprocessed identically.
testData = fetch_20newsgroups(
    subset="test",
    shuffle=True,
    remove=("headers", "footers", "quotes"),
)

In [35]:
# Predict a class label for every test document with the fitted pipeline.
predict = textClassifier.predict(testData.data)

In [36]:
# Test accuracy: fraction of test documents whose predicted label equals the
# true label (mean of the element-wise boolean comparison).
(predict == testData.target).mean()

0.695432819968136

## Tune Model

In [37]:
# Hyper-parameter grid for the search. Each key is "<pipeline step>__<param>",
# where the step prefix is one of the names given in the pipeline's step list.
parameters = {
    # unigrams only vs. unigrams plus bigrams
    "CV__ngram_range": [(1, 1), (1, 2)],
    # with and without inverse-document-frequency weighting
    "TFID__use_idf": (True, False),
    # regularisation strength of the SGD classifier
    "Class__alpha": (1e-3, 1e-2),
}

In [38]:
# Exhaustive cross-validated search over the grid in `parameters`.
# n_jobs=-1 runs the candidate fits in parallel on all available cores; the
# number of folds is left at the library default.
gridSearch = GridSearchCV(
    estimator=textClassifier,
    param_grid=parameters,
    n_jobs=-1,
)

In [39]:
# Run the search on the training data only; the test split is not used here.
# fit() returns the search object itself, so the reassignment is a no-op
# that also suppresses the repr output of the cell.
gridSearch = gridSearch.fit(trainData.data, trainData.target)

In [40]:
# Mean cross-validated score of the best parameter combination. It is computed
# on folds of the training data, so it is not directly comparable with the
# test-set accuracy reported in the "Test Model" section.
gridSearch.best_score_

0.753315298229739

In [41]:
# Parameter combination that achieved best_score_ during the search.
gridSearch.best_params_

{'CV__ngram_range': (1, 2), 'Class__alpha': 0.001, 'TFID__use_idf': True}