# Statistical NLP

## News Categorization
Let's write an algorithm that automatically categorizes news types.

In [1]:
# The four newsgroups (topics) we keep from the 20 Newsgroups corpus
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the training split, restricted to the selected categories.
# shuffle=True together with a fixed random_state gives a reproducible random order.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
# checking that the dataset's targets are the categories we selected
# (note: the names come back sorted alphabetically, not in the order of `categories`)
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [4]:
# size of the training data
# Only the last expression of a cell is displayed, so a bare `len(...)` on an
# earlier line is silently discarded. Make the intended check explicit instead:
# every document must have exactly one source filename.
n_train_docs = len(twenty_train.data)
assert n_train_docs == len(twenty_train.filenames), "data and filenames are out of sync"
n_train_docs

2257

In [5]:
# peek at the first three lines (the message headers) of the first document
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [6]:
# fetching the category of the first datapoint in the training data:
# `target` holds integer labels, `target_names` maps each label to its name
first_label = twenty_train.target[0]
twenty_train.target_names[first_label]

'comp.graphics'

In [7]:
# let's see the targets: these are the numerical representations of the news
# categories; each integer is an index into `twenty_train.target_names`
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [8]:
# translate the first ten numeric labels back into their category names
for label_idx in twenty_train.target[:10]:
    print(twenty_train.target_names[label_idx])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


## Bag of Words
The bag-of-words representation is a powerful feature representation for text classification, especially when the sequence is short.

Watch out: the vectors can get very big, since they can correspond to the entire vocabulary. 

In [9]:
# tokenizing with scikit-learn: fit the vectorizer on the training documents
# and turn each document into a vector of token counts
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# shape is (number of documents, number of vocabulary entries)
X_train_counts.shape

(2257, 35788)

In [10]:
# `vocabulary_` maps each token to its column index in the count matrix;
# .get() returns None for a token that is not in the vocabulary
count_vect.vocabulary_.get('algorithm')

4690

## TF-IDF: Avoiding the frequency effect 

In [11]:
# term frequency matrix: with use_idf=False the counts are only normalised,
# no inverse-document-frequency weighting is applied
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [12]:
# TF-IDF matrix: same shape as the count matrix, but each count is re-weighted
# so that terms occurring in many documents contribute less
tfidf_transformer = TfidfTransformer()
# fit_transform learns the IDF weights from the training counts and applies them in one step
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

## Classification

### Training

In [13]:
# picking a classifier: multinomial Naive Bayes trained on the TF-IDF features
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, twenty_train.target)

In [14]:
# Inference using the trained classifier
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# New documents must go through the *already fitted* vectorizer and TF-IDF
# transformer (transform, not fit_transform) so the features line up with training.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# show each document next to the name of its predicted category
for doc, label_idx in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[label_idx]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


## Building an ML Pipeline

In [15]:
# building the pipeline object: vectorizer -> TF-IDF -> classifier.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used to address
# each step's parameters during tuning (e.g. 'clf__alpha').
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [16]:
# training using the pipeline object: the raw documents go in directly,
# the pipeline runs the vectorizer, TF-IDF and classifier steps itself
text_clf.fit(twenty_train.data, twenty_train.target)


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

## Evaluating a Model

In [17]:
# testing the model performance on the held-out test split
import numpy as np

twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# note: this rebinds `predicted`, previously used for the two toy documents
predicted = text_clf.predict(docs_test)

# accuracy = fraction of test documents whose predicted label equals the true label
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.8348868175765646

In [18]:
from sklearn import metrics

# per-category precision / recall / F1 on the test split
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

# confusion matrix: rows are true categories, columns are predicted categories
metrics.confusion_matrix(twenty_test.target, predicted)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

              accuracy                           0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502



array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]])

## Parameter Tuning: Grid Search

In [22]:
# defining the grid
from sklearn.model_selection import GridSearchCV
# Search space: keys are '<pipeline step>__<parameter>', values are the candidates to try
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with / without IDF re-weighting
    'clf__alpha': (1e-2, 1e-3),             # Naive Bayes smoothing strength
}

In [23]:
# 5-fold cross-validated grid search over `parameters`;
# n_jobs=-1 runs the candidate fits in parallel using all the CPUs
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [24]:
# Picking a smaller subset of the data just for illustration purposes. In real-world
# scenarios it's essential to tune the parameters on the entire training set.
n_tune = 400

# run the grid search on the subset and keep the fitted search object in memory
gs_clf = gs_clf.fit(twenty_train.data[:n_tune], twenty_train.target[:n_tune])

In [25]:

# sanity check: the fitted grid-search object can be used directly for prediction;
# predict() returns an array of labels, so take element 0 and map it to its category name
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [26]:
# The best score and the winning parameter combination are stored on the fitted search object.
# Note: best_score_ is a cross-validated score on the 400-document tuning subset,
# so it is not directly comparable to the test-set accuracy computed earlier.
print(gs_clf.best_score_)
for name in sorted(parameters):
    print("%s: %r" % (name, gs_clf.best_params_[name]))

0.9349999999999999
clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)


# Exercise 4
## 1. Try the experiment using a different classifier. Did the results improve? Can you reason why?
## 2. Use word embeddings instead of tf-idf vectors and retrain your classifier. Do the results improve?
## --Hint: tf-idf vectors represent documents. Vanilla word embeddings represent words. A simple way to create an embedding of an entire document is to average the embeddings of all its words. It's good practice to filter out function words (to, at, is, with, etc.), since they do not carry substantial semantics.

