# 20 news group classification

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all', random_state=2021)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


## Data Search

In [3]:
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
from sklearn.datasets import load_iris
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [7]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [10]:
pd.Series(news.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [11]:
len(news.data)

18846

In [12]:
print(news.data[0])

From: dagibbs@quantum.qnx.com (David Gibbs)
Subject: Re: Countersteering sans Hands
Organization: QNX Software Systems, Ltd.
Lines: 22

In article <1993Apr20.203344.8417@cs.cornell.edu> karr@cs.cornell.edu (David Karr) writes:
>In article <Clarke.6.735328328@bdrc.bd.com> Clarke@bdrc.bd.com (Richard Clarke) writes:
>>So how do I steer when my hands aren't on the bars? (Open Budweiser in left 
>>hand, Camel cigarette in the right, no feet allowed.) 
>
>>If I lean, and the 
>>bike turns, am I countersteering?
>
>No, the bars would turn only *toward* the direction of turn in
>no-hands steering.

Just in case the original poster was looking for a serious answer,
I'll supply one.

Yes, even when steering no hands you do something quite similar
to countersteering.  Basically to turn left, you to a quick wiggle
of the bike to the right first, causing a counteracting lean to
occur to the left.  It is a lot more difficult to do on a motorcycle
than a bicycle though, because of the extra weight. 

# train/test data extraction

In [14]:
train_news = fetch_20newsgroups(
    subset='train', random_state=2021,
    remove=('headers', 'footers', 'quotes')
)
len(train_news.data)

11314

In [15]:
test_news = fetch_20newsgroups(
    subset='test', random_state=2021,
    remove=('headers', 'footers', 'quotes')
)
len(test_news.data)

7532

### text data preprocessing

In [23]:
train_df = pd.DataFrame({'article': train_news.data})
test_df = pd.DataFrame({'article' : test_news.data})
train_df, test_df

(                                                 article
 0      \nStop! Hold it! You have a few problems here....
 1      ]Is it possible to do a "wheelie" on a motorcy...
 2                           \n\nBBS number\n510-226-2365
 3      : [first post I've seen from the ol' Bug-Zoo (...
 4      Archive-name: rec-autos/part5\n\n[this article...
 ...                                                  ...
 11309  While I enjoy the trend towards the more class...
 11310  \nyou can say that again.\nhow does $23 for a ...
 11311  If you can get it, you might want to try a Can...
 11312  \n\nWhy would you say "especially Christianity...
 11313  \n\tOn a completely different tack, what was t...
 
 [11314 rows x 1 columns],
                                                 article
 0     Need Diet for Diverticular Disease\nand ideas ...
 1     There are chips which perform the voice compre...
 2     Total Baseball, which also tries to evaluate a...
 3     If anyone would like to get rid of their

- Train dataset

In [26]:
# remove Special Characters
import re 
train_df['article'] = train_df.article.str.replace('[^a-zA-Z]', ' ')
train_df.article[1]

' Is it possible to do a  wheelie  on a motorcycle with shaft drive   yes  '

In [None]:
def ch(li):
  x = []
  for i in li.split():
    lf len(i) > 3:
    x.append(i)
  s = ' '.join([])

In [35]:
train_df['article'] = train_df['article'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
train_df['article'][1]

'possible wheelie motorcycle with shaft drive'

In [37]:
# convert small letter
train_df['article'] = train_df['article'].apply(lambda x: x.lower())

In [38]:
# remove Special Characters and convert small letter
train_df['article'] = train_df['article'].apply(lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 3]))

- test dataset

In [39]:
test_df['article'] = test_df['article'].apply(lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 3]))

## text convert

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(train_df['article'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [42]:
X_train = tvect.transform(train_df.article)
X_test = tvect.transform(test_df.article)
X_train.shape, X_test.shape

((11314, 64133), (7532, 64133))

In [43]:
y_train = train_news.target
y_test = test_news.target

In [44]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [46]:
pred = svc.predict(X_test)

In [47]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.6488316516197558