## Classification of Articles
The data consists of a collection of articles on topics, including baseball, cryptography, electronics, hardware, medicine, mideast, motorcycles, politics, religion, and space. The posts are extracted from the 20 Newsgroups dataset.

This notebook do the preprocessing, and predicts the topics from this collection of texts using a supervised machine learning algorithm.

In [42]:
import spacy
import en_core_web_sm
import en_core_web_md
import spacy
import pandas as pd
import string
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [24]:
## Read the data
X = pd.read_csv("data/X_train.csv")
y = pd.read_csv("data/y_train.csv")

In [48]:
## Look at the data

print('X_shape: {}'.format(X.shape))
print('y_shape: {}'.format(y.shape))
print('n_categories: {}'.format(y['label'].nunique()))
print('categories: {}'.format(y['label'].unique()))

X_shape: (6384, 1)
y_shape: (6384, 1)
n_categories: 10
categories: ['baseball' 'cryptography' 'electronics' 'hardware' 'medicine' 'mideast'
 'motorcycles' 'politics' 'religion' 'space']


In [25]:
# Split data into random train and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
## Spacy tokenizer

nlp = spacy.load('en_core_web_sm')
punctuations = string.punctuation

def spacy_tokenizer(text):
    tokens = nlp(text)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]
    return tokens


In [27]:
## Vectorizer with above spacy and Scikit learn

preprocessor = ColumnTransformer([("processing", TfidfVectorizer(tokenizer=spacy_tokenizer), "text")])

In [29]:
%%time
parameters = {'alpha': (0.05, 0.07, 0.1), 'fit_prior': [True, False]}
clf = GridSearchCV(MultinomialNB(), parameters, cv=3)

model = Pipeline([("preprocessor", preprocessor), ("model", clf)]).fit(X_train,y_train['label'])

y_pred = model.predict(X_test)
score_grid_MNB = accuracy_score(y_test['label'], y_pred)

print(score_grid_MNB)

0.9226388229710489
Wall time: 7min 1s


In [41]:
## Vectorizer without spacy and pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75,1),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__alpha': (0.07, 0.08, 0.09, 0.1),
    'clf__fit_prior': [True, False]}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3)
grid_search_tune.fit(X_train['text'], y_train['label'])


y_pred_pipe = grid_search_tune.predict(X_test['text'])
score_pip = accuracy_score(y_test['label'], y_pred_pipe)

print(score_pip)

0.9245372567631703
