# CLASSIFYING NEWSGROUP TOPICS WITH SUPPORT VECTOR MACHINES

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

In [2]:
# NLTK- natural language toolkit

all_names=set(names.words())
#all_names

lemmatizer= WordNetLemmatizer()

# preprocessing
#- lower case
#- root form run, running- run (lemmatization)
# stopwords- a, the, an for



In [10]:
def is_letter_only(word):
    return word.isalpha()

from nltk.corpus import stopwords
stop_words= stopwords.words('english')

In [3]:
#we are defining a function for cleaning data
def clean_text(docs):
    docs_cleaned=[]
    for doc in docs:
        doc=doc.lower()
        doc_cleaned=' '.join(lemmatizer.lemmatize(word) for word in doc.split()
                            if is_letter_only(word) and word not in all_names and word not in stop_words)
        docs_cleaned.append(doc_cleaned)
    return docs_cleaned

In [4]:
fetch_20newsgroups

<function sklearn.datasets._twenty_newsgroups.fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True, return_X_y=False)>

In [5]:
#EXPERIMENT-1:Binary Classifier
#The model is able to classify between the below mentioned categories
# Binary classification


categories=['comp.graphics','sci.space']

#classifying into train and test data
data_train= fetch_20newsgroups(subset='train', categories= categories, random_state=42)
data_test= fetch_20newsgroups(subset='test', categories=categories, random_state=42)




array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [12]:
#Now we cleaning the train and test data by calling the clean_text function
cleaned_train= clean_text(data_train.data)
label_train= data_train.target
label_train

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [6]:
#cleaning the test data
cleaned_test= clean_text(data_test.data)
label_test= data_test.target
label_test

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,

In [None]:
#Once the data is converted into root form(i.e data cleaned) it should be converted into numeric form 
#one of the technique used here is:
#TfidVectorizer:Term Frequency Inverse Document Frequency
#it'll try to identify what is the frequency of a given word in a given document and across the number of documents and it'll
#create a matrix which will show the frequency of particular word or term

In [7]:
# TFidfvectorizer- term frequency inverse document frequency

from collections import Counter
Counter(label_train)

Counter({0: 584, 1: 593})

In [8]:
tfidf_vectorizer= TfidfVectorizer(stop_words='english', max_features=None)
term_docs_train= tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test= tfidf_vectorizer.transform(cleaned_test)

In [9]:
from sklearn.svm import SVC

svm= SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(term_docs_train, label_train)
accuracy= svm.score(term_docs_test, label_test)
print("The accuracy of binary classification is : {0:.1f}%".format(accuracy*100))

The accuracy of binary classification is : 96.4%


In [None]:
#From the above answer we get to know that the model is 96.4% accurate


In [None]:
#EXPERIMENT-2:Multi-class Classifier
#The model is able to classify between the below mentioned categories


In [15]:
#The same process is repeated as we did for binary-class classifier but the change is in the number of categories as mentioned
categories=[
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
    'rec.sport.hockey'
]

In [16]:
#seperating train and test datasets
data_train= fetch_20newsgroups(subset='train', categories= categories, random_state=42)
data_test= fetch_20newsgroups(subset='test', categories=categories, random_state=42)


In [17]:
#cleaning the data
cleaned_train= clean_text(data_train.data)
label_train= data_train.target

cleaned_test= clean_text(data_test.data)
label_test= data_test.target

In [18]:
tfidf_vectorizer= TfidfVectorizer(stop_words='english', max_features=None)
term_docs_train= tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test= tfidf_vectorizer.transform(cleaned_test)


In [19]:
svm= SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(term_docs_train, label_train)
accuracy= svm.score(term_docs_test, label_test)
print("The accuracy of 5-class classification is : {0:.1f}%".format(accuracy*100))

from sklearn.metrics import classification_report
prediction= svm.predict(term_docs_test)
report=classification_report(label_test, prediction)
print(report)

The accuracy of 5-class classification is : 88.6%
              precision    recall  f1-score   support

           0       0.79      0.77      0.78       319
           1       0.92      0.96      0.94       389
           2       0.98      0.96      0.97       399
           3       0.93      0.94      0.93       394
           4       0.74      0.73      0.73       251

    accuracy                           0.89      1752
   macro avg       0.87      0.87      0.87      1752
weighted avg       0.89      0.89      0.89      1752



In [None]:
#we come to know the accuracy of multi-class classifier is 88.6%

In [None]:
#If we want the model to classify for all the 20 categories:

In [20]:
data_train= fetch_20newsgroups(subset='train', categories= None, random_state=42)
data_test= fetch_20newsgroups(subset='test', categories=None, random_state=42)

In [21]:
cleaned_train= clean_text(data_train.data)
label_train= data_train.target

cleaned_test= clean_text(data_test.data)
label_test= data_test.target

tfidf_vectorizer= TfidfVectorizer(stop_words='english', max_features=None)
term_docs_train= tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test= tfidf_vectorizer.transform(cleaned_test)

svm= SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(term_docs_train, label_train)
accuracy= svm.score(term_docs_test, label_test)
print("The accuracy of 20-class classification is : {0:.1f}%".format(accuracy*100))

from sklearn.metrics import classification_report
prediction= svm.predict(term_docs_test)
report=classification_report(label_test, prediction)
print(report)

The accuracy of 20-class classification is : 78.9%
              precision    recall  f1-score   support

           0       0.70      0.66      0.68       319
           1       0.65      0.78      0.71       389
           2       0.72      0.66      0.69       394
           3       0.65      0.74      0.69       392
           4       0.81      0.77      0.79       385
           5       0.77      0.68      0.72       395
           6       0.70      0.83      0.76       390
           7       0.86      0.85      0.86       396
           8       0.92      0.90      0.91       398
           9       0.92      0.91      0.92       397
          10       0.96      0.93      0.94       399
          11       0.95      0.86      0.90       396
          12       0.69      0.73      0.71       393
          13       0.81      0.88      0.84       396
          14       0.92      0.86      0.89       394
          15       0.73      0.85      0.79       398
          16       0.69      0