In [None]:
import pandas as pd
import numpy as np

In [None]:
import pyprind
import pandas as pd
import os
# change the 'basepath' to the directory of the
# unzipped movie dataset
basepath = 'aclImdb'
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
# Collect the rows in a plain list and build the frame once at the end:
# growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        # sorted() keeps the row order independent of the file system
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
            pbar.update()  # one tick per review file
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

In [None]:
# Shuffle the rows with a fixed seed so the permutation is repeatable, then
# write the shuffled frame to disk; index=False leaves the row labels out
# of the CSV file.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [None]:
# Reload the reviews from the CSV file written by the previous cell.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
# NOTE(review): a notebook cell only echoes its last expression, so the
# head(3) preview below is discarded and only the shape is displayed.
df.head(3)
df.shape

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words demo: fit a vocabulary on three toy documents and turn them
# into a matrix of term counts.
count = CountVectorizer()
# The last two string literals are adjacent, so Python joins them into a
# single document ('...is sweet,and one and one is two'); docs therefore
# holds three entries, not four.
docs = np.array(['The sun is shining',
'The weather is sweet',
'The sun is shining, the weather is sweet,'
'and one and one is two'])
bag = count.fit_transform(docs)

In [None]:
# Show the term -> column-index mapping learned by the vectorizer.
print(count.vocabulary_)
# Output recorded from an earlier run (kept as a comment so it is not
# evaluated as code):
# {'and': 0,
#  'two': 7,
#  'shining': 3,
#  'one': 2,
#  'sun': 4,
#  'weather': 8,
#  'the': 6,
#  'sweet': 5,
#  'is': 1}

In [None]:
# Densify the count matrix: one row per document, one column per term.
print(bag.toarray())
# Output recorded from an earlier run (kept as a comment; the raw text is
# not valid Python):
# [[0 1 0 1 1 0 1 0 0]
#  [0 1 0 0 0 1 1 0 1]
#  [2 3 2 1 1 1 2 1 1]]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts with tf-idf and L2-normalise each row.
tfidf = TfidfTransformer(use_idf=True,
                         norm='l2',
                         smooth_idf=True)
np.set_printoptions(precision=2)
# The attribute access needs its leading dot; without it the call is a
# syntax error.
print(tfidf.fit_transform(count.fit_transform(docs))
      .toarray())
# Output recorded from an earlier run (kept as a comment; the raw text is
# not valid Python):
# [[ 0. 0.43 0. 0.56 0.56 0. 0.43 0. 0. ]
#  [ 0. 0.43 0. 0. 0. 0.56 0.43 0. 0.56]
#  [ 0.5 0.45 0.5 0.19 0.19 0.19 0.3 0.25 0.19]]

In [None]:
import re
def preprocessor(text):
    """Clean one raw review string.

    HTML tags are removed, emoticons such as ``:)`` or ``;-(`` are collected
    from the tag-free text, every run of non-word characters is replaced by
    a single space and the text is lowercased. The emoticons (with their
    ``-`` nose removed) are appended to the end of the cleaned text.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned, lowercased text followed by the emoticons found.
    """
    # Raw string literals keep the backslashes in the patterns from being
    # read as (invalid) string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [None]:
# Spot-check the cleaner on the tail of the first review and on a small
# hand-written example. The results recorded from an earlier run are kept
# as comments so they are not evaluated as stray string expressions.
preprocessor(df.loc[0, 'review'][-50:])
# -> 'is seven title brazil not available'
preprocessor("</a>This :) is :( a test :-)!")
# -> 'this is a test :) :( :)'

# Clean every review; this overwrites the 'review' column.
df['review'] = df['review'].apply(preprocessor)

In [None]:
def tokenizer(text):
    """Split ``text`` on whitespace and return the list of tokens."""
    return text.split()


tokenizer('runners like running and thus they run')
# Recorded result (kept as a comment so the cell shows the computed value
# rather than a pasted literal):
# ['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [None]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and Porter-stem every token."""
    return [porter.stem(word) for word in text.split()]


tokenizer_porter('runners like running and thus they run')
# Recorded result (kept as a comment so the cell shows the computed value
# rather than a pasted literal):
# ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Stem a sample sentence and drop every stem that is an English stop word.
[w for w in tokenizer_porter('a runner likes'
                             ' running and runs a lot')[-10:]
 if w not in stop]
# Recorded result (kept as a comment so the cell shows the computed value
# rather than a pasted literal):
# ['runner', 'like', 'run', 'run', 'lot']

In [None]:
# First 25,000 rows for training, the remaining rows for testing.
# Positional slicing (.iloc) is end-exclusive, so the two halves are
# disjoint; label slicing with .loc[:25000] includes row 25000 and would
# put that review in both the training and the test set.
X_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values
X_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Grid search over a tf-idf + logistic-regression pipeline.
# Lowercasing and preprocessing are switched off in the vectorizer; the
# 'review' column was already run through preprocessor() in an earlier cell.
tfidf = TfidfVectorizer(strip_accents=None,
lowercase=False,
preprocessor=None)
# Two sub-grids: the first keeps the vectorizer's idf/norm settings as
# constructed above, the second turns idf weighting and normalisation off.
# Both vary stop-word removal, the tokenizer, the penalty type and C.
param_grid = [{'vect__ngram_range': [(1,1)],
'vect__stop_words': [stop, None],
'vect__tokenizer': [tokenizer,
tokenizer_porter],
'clf__penalty': ['l1', 'l2'],
'clf__C': [1.0, 10.0, 100.0]},
{'vect__ngram_range': [(1,1)],
'vect__stop_words': [stop, None],
'vect__tokenizer': [tokenizer,
tokenizer_porter],
'vect__use_idf':[False],
'vect__norm':[None],
'clf__penalty': ['l1', 'l2'],
'clf__C': [1.0, 10.0, 100.0]}
]
# The step names 'vect' and 'clf' are the prefixes used in param_grid.
lr_tfidf = Pipeline([('vect', tfidf),
('clf',
LogisticRegression(random_state=0,
solver='liblinear'))])
# 5-fold cross-validation scored by accuracy, run in a single process.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
scoring='accuracy',
cv=5, verbose=2,
n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
# Report the parameter combination with the best cross-validation score.
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
# Output recorded from an earlier run (kept as a comment; the raw text is
# not valid Python):
# Best parameter set: {'clf__C': 10.0, 'vect__stop_words': None,
# 'clf__penalty': 'l2', 'vect__tokenizer': <function tokenizer at
# 0x7f6c704948c8>, 'vect__ngram_range': (1, 1)}

In [None]:
# Cross-validation accuracy of the best parameter combination.
print('CV Accuracy: %.3f'
      % gs_lr_tfidf.best_score_)
# Output recorded from an earlier run: CV Accuracy: 0.897

# Score the best pipeline found by the grid search on the held-out test set.
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f'
      % clf.score(X_test, y_test))

In [None]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')


def tokenizer(text):
    """Clean a raw review and return its tokens without stop words.

    This redefines the whitespace-only ``tokenizer`` from the earlier cell:
    HTML tags are stripped, the text is lowercased, runs of non-word
    characters collapse to one space, emoticons are appended, and every
    token found in the module-level ``stop`` list is dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned tokens with stop words removed.
    """
    # Raw string literals keep the backslashes in the patterns from being
    # read as (invalid) string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # NOTE(review): the emoticon pattern is applied to the lowercased text,
    # so its upper-case 'D' and 'P' alternatives can never match (':D'
    # becomes ':d'). Left unchanged here so the features stay identical to
    # the copy of this function in vectorizer.py; confirm whether that is
    # intended.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) \
        + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [None]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The file is expected to hold one review per line in the form
    ``<text>,<label>`` after a single header line. The text is returned
    exactly as it appears in the file (surrounding quotes included).

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Yields
    ------
    tuple of (str, int)
        The text before the last comma and the integer label after it.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        next(csv_file, None)  # skip header (tolerates an empty file)
        for line in csv_file:
            record = line.rstrip('\r\n')
            if not record:
                continue  # ignore blank lines
            # Split on the last comma instead of slicing at fixed offsets,
            # so a final line without a trailing newline still parses.
            text, _, label = record.rpartition(',')
            yield text, int(label)
# Peek at the first (text, label) pair of the stream.
next(stream_docs(path='movie_data.csv'))
# Recorded result (kept as a comment so the cell shows the computed value
# rather than a pasted literal):
# ('"In 1974, the teenager Martha Moxley ... ',1)

In [None]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs, e.g. ``stream_docs(...)``.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` as two lists of equal length. If the stream runs dry
        part-way through, the shorter batch collected so far is returned
        instead of being thrown away. ``(None, None)`` is returned only when
        the stream was already exhausted and no document could be read.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        if not docs:
            return None, None
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
# Out-of-core setup: a hashing vectorizer with 2**21 feature columns that
# uses the custom tokenizer, plus an SGD classifier that is trained
# incrementally in the following cells.
vect = HashingVectorizer(decode_error='ignore',
n_features=2**21,
preprocessor=None,
tokenizer=tokenizer)
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
# confirm against the installed version.
clf = SGDClassifier(loss='log', random_state=1)
# Generator over the shuffled CSV; it is consumed by the cells below.
doc_stream = stream_docs(path='movie_data.csv')

In [None]:
import pyprind
# Train incrementally on 45 mini-batches of 1,000 reviews each.
pbar = pyprind.ProgBar(45)
# partial_fit is given the full label set via classes=
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break  # stream exhausted early
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
# Use the next 5,000 documents of the stream as the test set.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))
# After scoring, fold the test documents into the model as well.
clf = clf.partial_fit(X_test, y_test)

In [None]:
import pandas as pd
# Topic-modelling input: reload the reviews and build a bag-of-words matrix.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
from sklearn.feature_extraction.text import CountVectorizer
# stop_words, max_df and max_features restrict the vocabulary; see the
# CountVectorizer documentation for the exact meaning of each option.
count = CountVectorizer(stop_words='english',
max_df=.1,
max_features=5000)
X = count.fit_transform(df['review'].values)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 10-component LDA model on the count matrix with a fixed seed.
lda = LatentDirichletAllocation(n_components=10,
random_state=123,
learning_method='batch')
# X_topics is the transformed count matrix returned by the fitted model.
X_topics = lda.fit_transform(X)
lda.components_.shape

In [None]:
# Print the n_top_words highest-weighted vocabulary terms of every topic.
n_top_words = 5
# get_feature_names() is absent from newer scikit-learn releases; use its
# replacement when the vectorizer provides it and fall back otherwise.
feature_names = (count.get_feature_names_out()
                 if hasattr(count, 'get_feature_names_out')
                 else count.get_feature_names())
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    # argsort is ascending, so the reversed tail slice walks the largest
    # weights first.
    print(" ".join([feature_names[i]
                    for i in topic.argsort()
                    [:-n_top_words - 1:-1]]))

In [None]:
# Rank the reviews by their weight in topic column 5 (the topic this cell
# labels "horror"), strongest first, and print the start of the top three.
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')

In [None]:
import pickle
import os
# Serialise the stop-word list and the trained classifier for the web app.
dest = os.path.join('movieclassifier', 'pkl_objects')
os.makedirs(dest, exist_ok=True)
# The with-blocks close (and thereby flush) each file as soon as the dump
# is finished instead of leaving the handles open.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as pkl_file:
    pickle.dump(stop, pkl_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as pkl_file:
    pickle.dump(clf, pkl_file, protocol=4)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle
# Load the pickled stop-word list that sits next to this module.
cur_dir = os.path.dirname(__file__)
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'),
          'rb') as stop_file:
    stop = pickle.load(stop_file)
def tokenizer(text):
    """Clean a raw review and return its tokens without stop words.

    HTML tags are stripped, the text is lowercased, runs of non-word
    characters collapse to one space, emoticons are appended, and every
    token found in the module-level ``stop`` list is dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned tokens with stop words removed.
    """
    # Raw string literals keep the backslashes in the patterns from being
    # read as (invalid) string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # NOTE(review): the emoticon pattern is applied to the lowercased text,
    # so its upper-case 'D' and 'P' alternatives can never match (':D'
    # becomes ':d'). Left unchanged so the features stay identical to the
    # tokenizer the classifier was trained with; confirm whether intended.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) \
        + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

# Module-level vectorizer that other scripts import with
# `from vectorizer import vect`; it hashes tokens produced by tokenizer()
# into 2**21 feature columns.
vect = HashingVectorizer(decode_error='ignore',
 n_features=2**21,
 preprocessor=None,
 tokenizer=tokenizer)

In [None]:
import pickle
import re
import os
from vectorizer import vect
# Load the pickled classifier and classify one example sentence.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open(os.path.join('pkl_objects', 'classifier.pkl'),
          'rb') as clf_file:
    clf = pickle.load(clf_file)
import numpy as np
label = {0:'negative', 1:'positive'}
example = ["I love this movie. It's amazing."]
X = vect.transform(example)
# Report the predicted label and the largest predicted probability.
print('Prediction: %s\nProbability: %.2f%%' %
      (label[clf.predict(X)[0]],
       np.max(clf.predict_proba(X))*100))

In [None]:
import sqlite3
import os
# (Re)create the review_db table in reviews.sqlite and insert two sample
# rows. try/finally closes the connection even if a statement fails.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()
    c.execute('DROP TABLE IF EXISTS review_db')
    c.execute('CREATE TABLE review_db'
              ' (review TEXT, sentiment INTEGER, date TEXT)')
    example1 = 'I love this movie'
    c.execute("INSERT INTO review_db"
              " (review, sentiment, date) VALUES"
              " (?, ?, DATETIME('now'))", (example1, 1))
    example2 = 'I disliked this movie'
    c.execute("INSERT INTO review_db"
              " (review, sentiment, date) VALUES"
              " (?, ?, DATETIME('now'))", (example2, 0))
    conn.commit()
finally:
    conn.close()

In [None]:
# Fetch every review stored between the start of 2017 and now.
# (The interactive '>>>' / '...' prompts were removed so the cell runs.)
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()
    c.execute("SELECT * FROM review_db WHERE date"
              " BETWEEN '2017-01-01 00:00:00' AND DATETIME('now')")
    results = c.fetchall()
finally:
    conn.close()
print(results)

In [None]:
# Install Flask:
#   conda install flask
#   # or: pip install flask
#
# Directory layout of the first app:
#   1st_flask_app_1/
#       app.py
#       templates/
#           first_app.html

from flask import Flask, render_template

app = Flask(__name__)


@app.route('/')
def index():
    """Render the first_app.html template for the root URL."""
    return render_template('first_app.html')


if __name__ == '__main__':
    # Start the server only when the file is executed as a script.
    app.run()

In [None]:
<!doctype html>
{# Static page returned by the index view of the first Flask app. #}
<html>
 <head>
 <title>First app</title>
 </head>
 <body>
 <div>Hi, this is my first Flask web app!</div>
 </body>
</html>

In [None]:
# Install WTForms:
#   conda install wtforms
#   # or pip install wtforms
#
# Directory layout of the second app:
#   1st_flask_app_2/
#       app.py
#       static/
#           style.css
#       templates/
#           _formhelpers.html
#           first_app.html
#           hello.html

In [None]:
from flask import Flask, render_template, request
from wtforms import Form, TextAreaField, validators
app = Flask(__name__)


class HelloForm(Form):
    """Form with one required text area, `sayhello`."""
    sayhello = TextAreaField('', [validators.DataRequired()])


@app.route('/')
def index():
    """Render the form page."""
    form = HelloForm(request.form)
    return render_template('first_app.html', form=form)


@app.route('/hello', methods=['POST'])
def hello():
    """Greet the submitted name, or show the form again if it is invalid."""
    form = HelloForm(request.form)
    if request.method == 'POST' and form.validate():
        name = request.form['sayhello']
        return render_template('hello.html', name=name)
    # Validation failed: re-render the form so its errors can be shown.
    return render_template('first_app.html', form=form)


if __name__ == '__main__':
    # NOTE(review): debug=True is meant for local development only;
    # confirm it is switched off before deploying.
    app.run(debug=True)

In [None]:
{# Render one WTForms field as a <dt>/<dd> pair inside a surrounding <dl>.
   Extra keyword arguments are passed on to the field's widget; validation
   errors, if any, are listed below the widget.
   The <dt> is closed before the <dd> opens: a <dd> may not be nested
   inside a <dt>. #}
{% macro render_field(field) %}
  <dt>{{ field.label }}</dt>
  <dd>{{ field(**kwargs)|safe }}
  {% if field.errors %}
    <ul class=errors>
    {% for error in field.errors %}
      <li>{{ error }}</li>
    {% endfor %}
    </ul>
  {% endif %}
  </dd>
{% endmacro %}

In [None]:
/* Stylesheet of the second app (linked from the templates as style.css):
   sets the body font size to 2em. */
body {
 font-size: 2em;
}

<!doctype html>
{# Form page of the second app: asks for a name and posts it to /hello. #}
<html>
 <head>
 <title>First app</title>
 <link rel="stylesheet"
 href="{{ url_for('static', filename='style.css') }}">
 </head>
 <body>
 {# render_field is the macro defined in _formhelpers.html #}
 {% from "_formhelpers.html" import render_field %}
 <div>What's your name?</div>
 <form method=post action="/hello">
 <dl>
 {{ render_field(form.sayhello) }}
 </dl>
 <input type=submit value='Say Hello' name='submit_btn'>
 </form>
 </body>
</html>

In [None]:
<!doctype html>
{# Greeting page: shows the `name` value passed in by the hello view. #}
<html>
 <head>
 <title>First app</title>
 <link rel="stylesheet"
 href="{{ url_for('static', filename='style.css') }}">
 </head>
 <body>
 <div>Hello {{ name }}</div>
 </body>
</html>

In [None]:
from flask import Flask, render_template, request
from wtforms import Form, TextAreaField, validators
import pickle
import sqlite3
import os
import numpy as np
# import HashingVectorizer from local dir
from vectorizer import vect
app = Flask(__name__)
######## Preparing the Classifier
cur_dir = os.path.dirname(__file__)
# NOTE: unpickling can execute arbitrary code; only load files you created.
# The with-block closes the file handle once the classifier is loaded.
with open(os.path.join(cur_dir, 'pkl_objects', 'classifier.pkl'),
          'rb') as clf_file:
    clf = pickle.load(clf_file)
# Path of the SQLite database that stores the collected feedback.
db = os.path.join(cur_dir, 'reviews.sqlite')
def classify(document):
    """Predict the sentiment of one review.

    Returns a ``(label, probability)`` pair: the label is 'negative' or
    'positive', the probability is the largest value returned by the
    classifier's ``predict_proba`` for this document.
    """
    features = vect.transform([document])
    predicted = clf.predict(features)[0]
    confidence = np.max(clf.predict_proba(features))
    names = {0: 'negative', 1: 'positive'}
    return names[predicted], confidence
def train(document, y):
    """Update the module-level classifier with one labelled review."""
    features = vect.transform([document])
    clf.partial_fit(features, [y])
def sqlite_entry(path, document, y):
    """Append one review with its label to the ``review_db`` table.

    Parameters
    ----------
    path : str
        Path of the SQLite database file.
    document : str
        Review text to store.
    y : int
        Sentiment label to store.

    Raises
    ------
    sqlite3.Error
        If the insert fails, e.g. because the table does not exist. The
        connection is closed in that case as well.
    """
    conn = sqlite3.connect(path)
    try:
        c = conn.cursor()
        # Parameter binding keeps the review text out of the SQL string.
        c.execute("INSERT INTO review_db (review, sentiment, date)"
                  " VALUES (?, ?, DATETIME('now'))", (document, y))
        conn.commit()
    finally:
        conn.close()


In [None]:
######## Flask
class ReviewForm(Form):
 """Form with a single text area for the movie review."""
 # The field carries two validators: DataRequired and length(min=15).
 moviereview = TextAreaField('',
 [validators.DataRequired(),
 validators.length(min=15)])
@app.route('/')
def index():
    """Render the review form page."""
    review_form = ReviewForm(request.form)
    return render_template('reviewform.html', form=review_form)
@app.route('/results', methods=['POST'])
def results():
    """Classify the submitted review and render the prediction page.

    If the form does not validate, the review form is rendered again so
    the validation errors can be shown.
    """
    form = ReviewForm(request.form)
    if request.method == 'POST' and form.validate():
        review = request.form['moviereview']
        y, proba = classify(review)
        return render_template('results.html',
                               content=review,
                               prediction=y,
                               probability=round(proba*100, 2))
    return render_template('reviewform.html', form=form)
@app.route('/thanks', methods=['POST'])
def feedback():
    """Record the user's verdict on a prediction.

    Reads the pressed button, the review text and the predicted label from
    the posted form, flips the label when the user marked the prediction as
    incorrect, updates the classifier with the resulting label and stores
    the review in the database.
    """
    # Named feedback_choice so the local does not shadow this view function.
    feedback_choice = request.form['feedback_button']
    review = request.form['review']
    prediction = request.form['prediction']

    inv_label = {'negative': 0, 'positive': 1}
    y = inv_label[prediction]
    if feedback_choice == 'Incorrect':
        y = int(not(y))
    # Training and storing happen for both answers, not only 'Incorrect'.
    train(review, y)
    sqlite_entry(db, review, y)
    return render_template('thanks.html')

In [None]:
<!doctype html>
{# Review form: a text area whose content is posted to /results. #}
<html>
  <head>
    <title>Movie Classification</title>
    <link rel="stylesheet"
          href="{{ url_for('static', filename='style.css') }}">
  </head>
  <body>

    <h2>Please enter your movie review:</h2>

    {% from "_formhelpers.html" import render_field %}

    <form method=post action="/results">
      <dl>
        {{ render_field(form.moviereview, cols='30', rows='10') }}
      </dl>
      <div>
        <input type=submit value='Submit review' name='submit_btn'>
      </div>
    </form>

  </body>
</html>

In [None]:
<!doctype html>
{# Prediction page: shows the review, the predicted label and its
   probability, and lets the user mark the prediction as correct or
   incorrect (posted to /thanks). #}
<html>
  <head>
    <title>Movie Classification</title>
    <link rel="stylesheet"
          href="{{ url_for('static', filename='style.css') }}">
  </head>
  <body>

    <h3>Your movie review:</h3>
    <div>{{ content }}</div>

    <h3>Prediction:</h3>
    <div>This movie review is <strong>{{ prediction }}</strong>
    (probability: {{ probability }}%).</div>

    {# class instead of id: the wrapper occurs twice on this page and the
       stylesheet rule is the class selector .button #}
    <div class='button'>
      <form action="/thanks" method="post">
        <input type=submit value='Correct' name='feedback_button'>
        <input type=submit value='Incorrect' name='feedback_button'>
        <input type=hidden value='{{ prediction }}' name='prediction'>
        <input type=hidden value='{{ content }}' name='review'>
      </form>
    </div>

    <div class='button'>
      <form action="/">
        <input type=submit value='Submit another review'>
      </form>
    </div>

  </body>
</html>
/* Stylesheet of the movie classifier app. */
/* Fixed page width. */
body{
 width:600px;
}
/* Class selector: applies to elements with class="button". */
.button{
 padding-top: 20px;
}

In [None]:
<!doctype html>
{# Confirmation page shown after feedback was submitted. #}
<html>
  <head>
    <title>Movie Classification</title>
    <link rel="stylesheet"
          href="{{ url_for('static', filename='style.css') }}">
  </head>
  <body>

    <h3>Thank you for your feedback!</h3>

    {# class instead of id so the stylesheet's .button rule applies #}
    <div class='button'>
      <form action="/">
        <input type=submit value='Submit another review'>
      </form>
    </div>

  </body>
</html>

In [None]:
import pickle
import sqlite3
import numpy as np
import os
# import HashingVectorizer from local dir
from vectorizer import vect
def update_model(db_path, model, batch_size=10000):
    """Train ``model`` incrementally on every row of the ``review_db`` table.

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file.
    model : object
        Estimator with a ``partial_fit(X, y, classes=...)`` method.
    batch_size : int, default 10000
        Number of rows fetched and fitted per iteration.

    Returns
    -------
    object
        The same ``model`` instance, updated in place.
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute('SELECT * from review_db')

        classes = np.array([0, 1])  # label set, identical for every batch
        results = c.fetchmany(batch_size)
        while results:
            # Column 0 of each row is the review text, column 1 the label.
            data = np.array(results)
            X = data[:, 0]
            y = data[:, 1].astype(int)

            X_train = vect.transform(X)
            model.partial_fit(X_train, y, classes=classes)
            results = c.fetchmany(batch_size)
    finally:
        # Close the connection even if fetching or fitting fails.
        conn.close()
    return model
# Load the pickled classifier and update it from the feedback database.
cur_dir = os.path.dirname(__file__)
# NOTE: unpickling can execute arbitrary code; only load files you created.
# The with-block closes the file handle once the classifier is loaded.
with open(os.path.join(cur_dir,
                       'pkl_objects',
                       'classifier.pkl'), 'rb') as clf_file:
    clf = pickle.load(clf_file)
db = os.path.join(cur_dir, 'reviews.sqlite')
clf = update_model(db_path=db, model=clf, batch_size=10000)
# Uncomment the following lines if you are sure that
# you want to update your classifier.pkl file
# permanently.
# pickle.dump(clf, open(os.path.join(cur_dir,
# 'pkl_objects', 'classifier.pkl'), 'wb'),
# protocol=4)

In [None]:
# import update function from local dir
from update import update_model
# Refresh the classifier from the feedback database when the file is run
# as a script.
# NOTE(review): db and clf are not defined in this cell; presumably they
# come from the application module this snippet is added to. Confirm.
if __name__ == '__main__':
 clf = update_model(db_path=db,
 model=clf,
 batch_size=10000)

In [None]:
# Persist the updated classifier, then keep a timestamped backup copy.
# NOTE(review): cur_dir, clf, time and copyfile are not defined or imported
# in this cell; presumably they come from the surrounding module. Confirm.
orig_path = os.path.join(
    cur_dir, 'pkl_objects', 'classifier.pkl')
# The with-block closes and flushes the file before it is copied; an
# unclosed handle could leave the backup incomplete.
with open(orig_path, 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)
timestr = time.strftime("%Y%m%d-%H%M%S")
backup_path = os.path.join(
    cur_dir, 'pkl_objects',
    'classifier_%s.pkl' % timestr)
copyfile(orig_path, backup_path)
