## Model selection
> Selecting a model's hyperparameters

## Text Classification Problem

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import datasets

In [3]:
# Fetch the 20 newsgroups corpus; subset='all' requests the train and test
# portions together, since the train/test split is made manually below.
news = datasets.fetch_20newsgroups(subset='all')

In [8]:
# Raw documents (X) and their newsgroup labels (y).
X, y = news.data, news.target

### Split the data

In [9]:
# Hold out 25% of the documents for testing; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.25,
    random_state=33,
)

### Bag of words

In [43]:
# Binary bag-of-words: binary=True records only whether a term is present,
# and English stop words are dropped.
vectorizer = CountVectorizer(stop_words='english', binary=True)
# Learn the vocabulary from the training documents only...
X_train_vectors = vectorizer.fit_transform(X_train)
# ...and reuse that same vocabulary, unchanged, for the test documents.
X_test_vectors = vectorizer.transform(X_test)

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### Creating a Classifier with a Pipeline

In [44]:
# Chain vectorization and classification so raw text can be handed
# straight to fit/score. The vectorizer settings mirror the standalone
# `vectorizer` built earlier.
clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(binary=True, stop_words='english')),
        ('clf', MultinomialNB()),
    ]
)

In [45]:
# Fit the whole pipeline (vectorizer, then classifier) on the raw training text.
clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(binary=True, stop_words='english')),
                ('clf', MultinomialNB())])

In [46]:
# Score the fitted pipeline on the held-out test documents.
clf.score(X_test, y_test)

0.8758488964346349

In [47]:
# Score on the training documents, to compare against the test score.
clf.score(X_train, y_train)

0.9607329842931938

## Creating a Model Without a Pipeline

In [48]:
# Same classifier type as in the pipeline, but used on its own here.
model = MultinomialNB()

In [49]:
# Train on the bag-of-words matrix produced by the standalone `vectorizer`.
model.fit(X_train_vectors, y_train)

MultinomialNB()

In [50]:
# Training score; the input must be the vectorized matrix, not raw text.
model.score(X_train_vectors, y_train)

0.9607329842931938

In [51]:
# Test score on the vectors transformed with the training vocabulary.
model.score(X_test_vectors, y_test)

0.8758488964346349

In [52]:
# Vectorize the first five test documents with the already-fitted vocabulary
# (transform only -- no refitting).
x_pred_vectors = vectorizer.transform(X_test[:5])

In [53]:
# Show the predicted labels next to the true labels for those five documents.
model.predict(x_pred_vectors), y_test[:5]

(array([ 8, 15, 13,  7, 18]), array([ 8, 15, 13, 12, 19]))

In [None]:
## The GridSearchCV
> This helps us to find the best parameters for our classifiers

In [55]:
from sklearn.model_selection import GridSearchCV

In [70]:
# Pipeline to be searched: a default CountVectorizer feeding MultinomialNB.
clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('clf', MultinomialNB()),
    ]
)

# Grid keys use the `<step name>__<parameter>` form so each value is routed
# to the 'clf' step of the pipeline.
# NOTE(review): alpha=0 means no smoothing; scikit-learn may warn about or
# adjust such a small value -- confirm that it belongs in the grid.
parameters = {
    'clf__fit_prior': np.array([True, False]),
    'clf__alpha': np.array([0,  .5, 1]),
}

# refit=False: only cross-validation results are kept; no final estimator
# is retrained on the full training set afterwards.
gs = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    cv=3,
    refit=False,
    verbose=2,
)

In [68]:
# List every parameter of the pipeline; the `step__param` names shown here
# are the keys a GridSearchCV parameter grid must use.
clf.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()), ('clf', MultinomialNB())],
 'verbose': False,
 'vect': CountVectorizer(),
 'clf': MultinomialNB(),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'clf__alpha': 1.0,
 'clf__class_prior': None,
 'clf__fit_prior': True}

In [71]:
# Run the search: every parameter combination is cross-validated on the
# training split only, so the test split stays untouched.
gs.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits




[CV] END ................clf__alpha=0.0, clf__fit_prior=True; total time=   7.0s




[CV] END ................clf__alpha=0.0, clf__fit_prior=True; total time=   6.2s




[CV] END ................clf__alpha=0.0, clf__fit_prior=True; total time=   7.1s




[CV] END ...............clf__alpha=0.0, clf__fit_prior=False; total time=   6.8s




[CV] END ...............clf__alpha=0.0, clf__fit_prior=False; total time=   6.8s




[CV] END ...............clf__alpha=0.0, clf__fit_prior=False; total time=   6.5s
[CV] END ................clf__alpha=0.5, clf__fit_prior=True; total time=   6.7s
[CV] END ................clf__alpha=0.5, clf__fit_prior=True; total time=   6.7s
[CV] END ................clf__alpha=0.5, clf__fit_prior=True; total time=   6.3s
[CV] END ...............clf__alpha=0.5, clf__fit_prior=False; total time=   6.7s
[CV] END ...............clf__alpha=0.5, clf__fit_prior=False; total time=   6.7s
[CV] END ...............clf__alpha=0.5, clf__fit_prior=False; total time=   6.4s
[CV] END ................clf__alpha=1.0, clf__fit_prior=True; total time=   6.3s
[CV] END ................clf__alpha=1.0, clf__fit_prior=True; total time=   6.3s
[CV] END ................clf__alpha=1.0, clf__fit_prior=True; total time=   6.7s
[CV] END ...............clf__alpha=1.0, clf__fit_prior=False; total time=   7.0s
[CV] END ...............clf__alpha=1.0, clf__fit_prior=False; total time=   6.4s
[CV] END ...............clf_

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('clf', MultinomialNB())]),
             param_grid={'clf__alpha': array([0. , 0.5, 1. ]),
                         'clf__fit_prior': array([ True, False])},
             refit=False, verbose=2)

In [72]:
# Mean cross-validated score of the best parameter combination.
gs.best_score_

0.861680680995976

In [73]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

{'clf__alpha': 0.0, 'clf__fit_prior': False}

### Summary

* **Semi-supervised learning** methods are the middle ground between
supervised and unsupervised learning. They combine small amounts of
annotated data with huge amounts of unlabeled data. Usually, the unlabeled
data reveals the underlying distribution of the elements and, in combination
with a small labeled dataset, yields better results.
* **Active learning** is a particular case within semi-supervised methods. Again,
it is useful when labeled data is scarce or hard to obtain. In active learning,
the algorithm actively queries a human expert for the labels of certain
unlabeled instances, and thus learns the concept from a reduced set of labeled
instances.
* **Reinforcement learning** proposes methods where an agent learns from
feedback (rewards or reinforcements) after performing actions within an
environment. The agent learns to perform a task by trying to maximize the
cumulative reward. These methods have been very successful in robotics and
video games.
* **Sequential classification** (very commonly used in **Natural Language
Processing (NLP)**) assigns a sequence of labels to a sequence of items; for
example, the parts of speech of the words in a sentence