# Text Classification Using LibSVM

## Import Libraries

In [1]:
import gc

from libsvm.svmutil import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## Import Dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups

## Extract Training Data

In [3]:
# Load the training split with headers, footers and quoted replies removed,
# leaving only the body text of each post.
trainData = fetch_20newsgroups(
    subset="train",
    shuffle=True,
    remove=("headers", "footers", "quotes"),
)

### Y Values

In [4]:
# Display the newsgroup category names, i.e. the classes to be predicted.
trainData.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Number of training labels.
len(trainData.target)

11314

### An X Value Example

In [6]:
"".join(trainData.data[0].split("\n"))

'I was wondering if anyone out there could enlighten me on this car I sawthe other day. It was a 2-door sports car, looked to be from the late 60s/early 70s. It was called a Bricklin. The doors were really small. In addition,the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, yearsof production, where this car is made, history, or whatever info youhave on this funky looking car, please e-mail.'

## Extract Features

In [7]:
# Learn the vocabulary from the training documents (English stop words
# excluded) and turn each document into a row of term counts.
countVector = CountVectorizer(stop_words="english")
xTrainCounts = countVector.fit_transform(trainData.data)

In [8]:
print("Shape: ", xTrainCounts.shape)

Shape:  (11314, 101322)


In [9]:
# Fit the TF-IDF weighting on the training counts and apply it to them.
# The fitted TF_IDF object is kept so it can be reused on other data later.
TF_IDF = TfidfTransformer()
xTrainTransform = TF_IDF.fit_transform(xTrainCounts)

In [10]:
# Convert the TF-IDF matrix to a dense array for libsvm.
# NOTE(review): at 11314 x 101322 this is over a billion entries; the libsvm
# Python interface may accept the sparse matrix directly - confirm against its
# documentation before relying on the dense copy.
transformArray = xTrainTransform.toarray()

In [11]:
# Sanity check: the dense array keeps the shape of the count matrix.
transformArray.shape

(11314, 101322)

## Train Model

In [12]:
# Bundle the training labels and the dense feature matrix into a libsvm problem.
problem = svm_problem(trainData.target, transformArray)

In [13]:
# Create a parameter object with no options given; it is customised below.
param = svm_parameter()

In [14]:
# Enable 10-fold cross-validation for the first training run ...
param.nr_fold = 10
param.cross_validation = 1
# ... and use a linear kernel.
param.kernel_type = LINEAR

In [15]:
# With cross-validation enabled on `param`, this call reports and returns the
# cross-validation accuracy (a percentage) rather than a trained model.
accuracy = svm_train(problem, param)

Cross Validation Accuracy = 74.8807%


In [16]:
# Show the cross-validation accuracy at full precision.
print(accuracy)

74.88067880502032


In [17]:
# Switch cross-validation off so the next svm_train call returns a model.
param.cross_validation = 0

In [18]:
# Train the final model on the training problem.
model = svm_train(problem, param)

In [19]:
# Drop the reference to the dense training matrix; the name is reused later
# for the test matrix.
del transformArray

In [20]:
# Run a garbage-collection pass after deleting the training matrix.
gc.collect()

116

## Test Model

In [21]:
# Load the test split, stripped of headers, footers and quotes in the same way
# as the training split.
testData = fetch_20newsgroups(
    subset="test",
    shuffle=True,
    remove=("headers", "footers", "quotes"),
)

In [22]:
xTestCounts = countVector.fit_transform(testData.data)
print("Shape: ", xTestCounts.shape)

Shape:  (7532, 70709)


In [23]:
xTestTransform = TF_IDF.fit_transform(xTestCounts)
transformArray = xTestTransform.toarray()

In [24]:
transformArray.shape

(7532, 70709)

In [25]:
# Score the trained model against the test labels.
# NOTE(review): the libsvm Python docs describe svm_predict as returning a
# (labels, accuracy metrics, decision values) tuple, so `accuracy` presumably
# holds the whole tuple rather than a single number - confirm before using it.
accuracy = svm_predict(
    y=testData.target,
    x=transformArray,
    m=model,
)

Accuracy = 5.97451% (450/7532) (classification)
