# Working with Text data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

#### Applying bag-of-words to a toy dataset

In [None]:
# Toy corpus: two short lines of text, each treated as one document.
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# fit() hands back the vectorizer itself, so construction and fitting are chained here.
vect = CountVectorizer().fit(bards_words)
# Trailing expression so the cell still displays the fitted vectorizer.
vect

In [None]:
# Report how many distinct entries the vocabulary holds, then the vocabulary itself.
print(
    "Vocabulary size: {}".format(len(vect.vocabulary_)),
    "Vocabulary content:\n {}".format(vect.vocabulary_),
    sep="\n",
)

In [None]:
# Encode the two toy documents against the fitted vocabulary.
bag_of_words = vect.transform(bards_words)
# Print the repr() of the result rather than its str() form.
print(
    "bag_of_words: {}".format(
        repr(bag_of_words)
    )
)

In [None]:
# Convert to a regular array so every individual count is printed.
print(
    "Dense representation of bag_of_words:\n{}".format(bag_of_words.toarray())
)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when it exists and fall back on older releases.
(vect.get_feature_names_out() if hasattr(vect, "get_feature_names_out")
 else vect.get_feature_names())

In [None]:
# Map the count matrix back to vocabulary terms, one result per document
vect.inverse_transform(bag_of_words)

### Task 1
Compute bigrams and trigrams of words as well. How does that change the vocabulary size? How would you imagine this changes the vocabulary size in a real application?

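**Setup:** Download the data from http://ai.stanford.edu/~amaas/data/sentiment/ and unpack it next to this notebook.
Then delete the ``train/unsup`` folder: ``load_files`` treats every subfolder as a class, so the unlabeled reviews would otherwise be loaded as a third class.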

# Sentiment analysis of movie reviews

In [None]:
from sklearn.datasets import load_files

# load_files returns a bunch, containing training texts and training labels
reviews_train = load_files("aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target

# Quick look at what was loaded: container type, number of documents, one sample review.
print(
    "type of text_train: {}".format(type(text_train)),
    "length of text_train: {}".format(len(text_train)),
    "text_train[1]:\n{}".format(text_train[1]),
    sep="\n",
)

In [None]:
# Swap every HTML line-break tag in the reviews for a single space.
text_train = [review.replace(b"<br />", b" ") for review in text_train]

In [None]:
# Count how many training reviews carry each label.
print(
    "Samples per class (training): {}".format(
        np.bincount(y_train)
    )
)

In [None]:
# Load the held-out reviews the same way as the training set.
reviews_test = load_files("aclImdb/test/")
text_test = reviews_test.data
y_test = reviews_test.target

# Summarise the test split: number of documents and label distribution.
print(
    "Number of documents in test data: {}".format(len(text_test)),
    "Samples per class (test): {}".format(np.bincount(y_test)),
    sep="\n",
)

# Same cleanup as for the training texts: HTML line breaks become spaces.
text_test = [review.replace(b"<br />", b" ") for review in text_test]

### Representing text data as Bag of Words

![bag_of_words](bag_of_words.png)

### Task 2

Use the ``CountVectorizer`` to build a vocabulary and create a bag-of-words representation of the training data.

How big is the vocabulary?

Display some of the words in the vocabulary using ``get_feature_names()`` (``get_feature_names_out()`` in scikit-learn 1.0 and later).
What are the 5 most common words?
What are the 5 most common words with ``stop_words='english'``?

In [None]:
# Fresh vectorizer with default settings; rebinds the name `vect` used for the toy example earlier
vect = CountVectorizer()
# ... solution here ...

### Task 3
Build a ``LogisticRegression`` model on the dataset.
Extract the features with the largest coefficients (10 most positive and 10 most negative) and visualize them in a bar plot.
Do these make sense?

Then evaluate the model on the test set.

In [None]:
from sklearn.linear_model import LogisticRegression
# ...

### Task 4
Use stop words and a minimum document frequency to limit the number of features. How does that impact the result?
Then add bigrams.