# Chapter 6. NLP
# Part 1. Basis

## - Data preparation (very slow)

In [26]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split

# Load the labelled IMDb reviews from disk. Train and test splits live in
# separate directories; the test split must NOT be read from "train",
# otherwise every later "test" score is really measured on training data.
reviews_train = load_files("imdb/aclimdb/train")
reviews_test = load_files("imdb/aclimdb/test")

# The documents are byte strings; replace the HTML line breaks left in the
# raw reviews with a single space so they do not end up glued to words.
text_train, y_train = reviews_train.data, reviews_train.target
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

text_test, y_test = reviews_test.data, reviews_test.target
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]

## - Bag of words

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Two-sentence toy corpus used to illustrate the bag-of-words representation.
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

# Learn the vocabulary from the toy corpus, then encode that same corpus.
vect = CountVectorizer()
vect.fit(bards_words)
bag_of_words = vect.transform(bards_words)

# Report the learned token -> column mapping and the dense count matrix.
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content: \n{}".format(vect.vocabulary_))
print("Sparse matrix content: \n{}".format(bag_of_words.toarray()))

Vocabulary size: 13
Vocabulary content: 
{'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}
Sparse matrix content: 
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


^ Each row of the sparse matrix is one document; each entry counts how many times the corresponding vocabulary token occurs in that document.

## - Movie review tonality

#### BoW model init and build

In [28]:
# Build the vocabulary from the training reviews, then encode those reviews
# as a sparse document-term count matrix.
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)

# repr() shows the matrix shape and storage format instead of its contents.
print("X_train:\n{}".format(repr(X_train)))

X_train:
<75000x124255 sparse matrix of type '<class 'numpy.int64'>'
	with 10315542 stored elements in Compressed Sparse Row format>


#### Check vocabulary (detailed)

In [29]:
# Vocabulary of the fitted vectorizer as a plain Python list of tokens.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when the installed version provides it and
# fall back to the old method otherwise. .tolist() keeps the printed output
# in list form, matching what the old method produced.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

# Sample the vocabulary at the start, in the middle, and at a coarse stride.
print("Number of features: {}\n".format( len( feature_names)))
print("First 5 features: {}\n".format( feature_names[:5]))
print("Middle 5 features: {}\n".format(feature_names[20015:20020]))
print("Every 10000th feature: {}\n".format(feature_names[::10000]))

Number of features: 124255

First 5 features: ['00', '000', '0000', '0000000000000000000000000000000001', '0000000000001']

Middle 5 features: ['cheapest', 'cheapie', 'cheapies', 'cheapjack', 'cheaply']

Every 10000th feature: ['00', 'banquière', 'chcialbym', 'devagan', 'fetiches', 'heathen', 'kerchner', 'meistersinger', 'overwhelmingly', 'recreating', 'silveira', 'themself', 'weidler']



#### Logreg model init and build

1) Training the model with cross-validation. EXTREMELY SLOW

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validation of a logistic regression on the bag-of-words matrix.
# max_iter is set very high so the solver is not stopped by the iteration cap.
scores = cross_val_score(LogisticRegression(max_iter=99999999), X_train, y_train, cv=5)

# cross_val_score returns a NumPy array, so use its own .mean() instead of
# np.mean: no numpy import is visible in this notebook, and np.mean would
# raise NameError on a fresh kernel.
print("Mean cv quality: {}".format(scores.mean()))

^ Accuracy=0.88. Probably could be better.

2) GridSearching best 'C' param. EXTREMELY LONG

In [40]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the regularisation parameter C, one per decade.
param_grid = {
    "C": [0.001, 0.01, 0.1, 1, 10],
}

In [31]:
# 5-fold grid search over the C values in param_grid. fit() returns the
# fitted search object, so the call can be chained onto the constructor.
# (The original called the non-existent method `.fir`.)
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
print('Best cv value: {:.2f}'.format(grid.best_score_))
# Format the parameters into the message; passing them as a second print()
# argument would leave the literal "{}" placeholder in the output.
print('best params: {}'.format(grid.best_params_))

^ Thus best 'C'=0.1

In [None]:
# Encode the test reviews with the vocabulary learned on the training set,
# then score the fitted grid search on them.
X_test = vect.transform(text_test)
# "{:.2f}" formats to two decimal places; the original "{:2.f}" is an
# invalid format spec and raises ValueError.
print('Quality: {:.2f}'.format(grid.score(X_test, y_test)))

^ Accuracy=0.88. Same.

3) Reducing number of worthless tokens and applying to model

In [32]:
# Rebuild the vocabulary with min_df=5 to drop rarely occurring tokens, then
# re-encode the training reviews with the smaller vocabulary.
vect = CountVectorizer(min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)

# repr() shows the matrix shape and storage format instead of its contents.
print('X_train c min_df: {}'.format(repr(X_train)))

X_train c min_df: <75000x44532 sparse matrix of type '<class 'numpy.int64'>'
	with 10191240 stored elements in Compressed Sparse Row format>


^ The number of features decreased almost threefold (from 124255 to 44532).

Check vocabulary (detailed)

In [33]:
# Vocabulary of the re-fitted vectorizer as a plain Python list of tokens.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when the installed version provides it and
# fall back to the old method otherwise. .tolist() keeps the printed output
# in list form, matching what the old method produced.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

# Sample the vocabulary at the start, in the middle, and at a coarse stride.
print("Number of features: {}\n".format( len( feature_names)))
print("First 5 features: {}\n".format( feature_names[:5]))
print("Middle 5 features: {}\n".format(feature_names[20015:20020]))
print("Every 10000th feature: {}\n".format(feature_names[::10000]))

Number of features: 44532

First 5 features: ['00', '000', '001', '007', '00am']

Middle 5 features: ['inevitable', 'inevitably', 'inexcusable', 'inexcusably', 'inexhaustible']

Every 10000th feature: ['00', 'deck', 'ineffectually', 'policies', 'tinkles']



Applying to model

In [None]:
# Repeat the 5-fold grid search over param_grid on the current X_train.
# fit() returns the fitted search object, so it can be chained onto the
# constructor.
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
print('Best CV accuracy: {}'.format(grid.best_score_))

^ Accuracy has not improved. Pruning rare tokens may still speed up training and make the results easier to interpret, though.