In [1]:
import os

# Install headless OpenJDK 8 quietly, then point JAVA_HOME and PATH at it
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# Print the Java version found on PATH as a sanity check
! java -version

# Install pyspark, pinned to 2.4.4
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP, pinned to 2.5.1
! pip install --ignore-installed spark-nlp==2.5.1

openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)
Collecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 58kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 42.6MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130389 sha256=20923c797d2c6e410c328b695fce3c8ce6ffaa1b3e759afbb542d488e8ad9b58
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d18423005

In [2]:
# Download the aclImdb_v1 review archive from the Stanford AI site
! wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2020-09-14 21:06:30--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-09-14 21:06:33 (37.6 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
# Unpack the downloaded archive; later cells read review files from the aclImdb/ directory
! tar xzf aclImdb_v1.tar.gz

Sentiment analysis is one of the most popular uses of NLP. It leverages data from websites with comments and ratings to learn the relationship between the language used and positive or negative sentiment.

The objective of this study is to build a model that evaluates movie reviews. Many movie reviewers use some quantifiable metric such as thumbs up/down or stars. However, some reviewers use different metrics (a 10-point scale) or no metric at all. It might be better to build a model that looks at the reviews and produces a score based on the text of the review instead of on an ad-hoc score.


In [4]:
# import the necessary libraries

import sparknlp

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

import sparknlp
from sparknlp import DocumentAssembler, Finisher
from sparknlp.annotator import *

In [5]:
# Start Spark through Spark NLP's helper; `spark` is the handle every later cell uses
spark = sparknlp.start()

In [6]:
# Read every review file in each split/sentiment directory as (path, text) pairs.
sc = spark.sparkContext
pos_train = sc.wholeTextFiles('aclImdb/train/pos/')
neg_train = sc.wholeTextFiles('aclImdb/train/neg')
pos_test = sc.wholeTextFiles('aclImdb/test/pos')
neg_test = sc.wholeTextFiles('aclImdb/test/neg')

In [7]:
def _to_labeled_df(reviews, label):
    """Turn one directory's (path, text) pairs into a labelled, persisted DataFrame.

    Args:
        reviews: the (path, text) pairs read for one sentiment directory.
        label: integer class attached to every row (1 = positive, 0 = negative).

    Returns:
        A DataFrame with columns ``path``, ``text`` and ``label``, repartitioned
        into 100 partitions and marked for persistence.
    """
    labeled = spark.createDataFrame(reviews, ['path', 'text'])
    labeled = labeled.repartition(100)
    return labeled.withColumn('label', lit(label)).persist()


# The same recipe applies to all four splits; only the label differs.
pos_train = _to_labeled_df(pos_train, 1)
neg_train = _to_labeled_df(neg_train, 0)
pos_test = _to_labeled_df(pos_test, 1)
neg_test = _to_labeled_df(neg_test, 0)

In [8]:
# Peek at the text of one positive training review.
first_pos_review = pos_train.first()
print(first_pos_review['text'])

Finally got to see this classic TV movie on an unofficial disc recorded from an old VHS, it is a classic piece of horror. Its a pity more of this neglected corner of horror in terms of official releases on DVD and VHS ... the TV horror movie. Recommended for all fans of the 70's TV movie much like trilogy of terror. Those interested should get the book on the subject by David Deal - Television Fright Films of the 70's. Email me for a chance to see it.....its fabulous to see it again.<br /><br />It does have it problems like many TV movies they have to be rather inventive in the effects dept and even at 70 mins it can seem to drag possibly we are all used to more modern editing but still great stuff and far better than many theatrical frights released today.


This seems like a clearly positive review. It is possible to identify a few words that seem like a good signal, like 'classic', 'fabulous' and 'great'.

In [9]:
# Peek at the text of one negative training review.
first_neg_review = neg_train.first()
print(first_neg_review['text'])

Lorne Michaels once again proves that he has absolutely no business producing movies.<br /><br />You'd think that after such dismal flicks "Superstar", "Night at the Roxbury", and "Coneheads", he'd start to get the notion that maybe he doesn't know what he's doing when it comes to movies (and many would argue that he doesn't know what he's doing when it comes to television, as well). Trying to make feature films out of skits that wore out their welcome the third time the were done on SNL makes no sense.<br /><br />I personally like Tim Meadows, and think that he would be great in the right movie. It's a shame to see a talented guy wasted in a film that features unfunny after unfunny situation, and caps it all with a dreadfully bad song and dance scene. Any laughs here will be because the movie is so bad, not because it's funny.<br /><br />Oh well, at least we can be thankful that there are many other tired SNL characters who will never have films done about them. It's just too bad that

Here is an example of a negative review.

There are also some HTML tags that need to be removed.

In [10]:
# corpus size: row count of each split, printed in a fixed order
for split_name, split_df in [('pos_train', pos_train), ('neg_train', neg_train),
                             ('pos_test', pos_test), ('neg_test', neg_test)]:
    print(split_name + ' size', split_df.count())

pos_train size 12500
neg_train size 12500
pos_test size 12500
neg_test size 12500


So there are 50000 documents.

In [11]:
# Summary statistics of the text length of the positive training reviews.
text_lengths = pos_train.selectExpr('length(text) AS text_len').toPandas()
text_lengths.describe()

Unnamed: 0,text_len
count,12500.0
mean,1347.16024
std,1046.747365
min,70.0
25%,695.0
50%,982.0
75%,1651.0
max,13704.0


There is a lot of variation in character lengths.

Text length may seem very low level. 

Longer comments may be more likely to be negative.


# **Building the model**

In [12]:
# Combine positives and negatives into two data sets: train and test.
# DataFrame.unionAll is deprecated since Spark 2.0; union() is the same
# positional, duplicate-preserving operation.
train = pos_train.union(neg_train)
test = pos_test.union(neg_test)

In [13]:
# Preprocessing stages, defined in the order the pipeline runs them.
assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
sentence = (
    SentenceDetector()
    .setInputCols('document')
    .setOutputCol('sentences')
)
tokenizer = (
    Tokenizer()
    .setInputCols('sentences')
    .setOutputCol('tokens')
)
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols('tokens')
    .setOutputCol('lemmas')
)

# Patterns whose matches the normalizer removes from each lemma.
cleanup_patterns = ['[^a-zA-Z.-]+', '^[^a-zA-Z]+', '[^a-zA-Z]+$']
normalizer = (
    Normalizer()
    .setCleanupPatterns(cleanup_patterns)
    .setInputCols('lemmas')
    .setOutputCol('normalized')
    .setLowercase(True)
)

glove = (
    WordEmbeddingsModel.pretrained(name='glove_100d')
    .setInputCols(['document', 'normalized'])
    .setOutputCol('embeddings')
)

# Fit on the training set; nlp_pipeline holds the fitted result of .fit().
nlp_stages = [assembler, sentence, tokenizer, lemmatizer, normalizer, glove]
nlp_pipeline = Pipeline().setStages(nlp_stages).fit(train)

    

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [14]:
# Keep the original columns plus the normalized token results and the embedding vectors.
nlp_output_cols = [
    'path', 'text', 'label',
    'normalized.result AS normalized',
    'embeddings.embeddings',
]

train = nlp_pipeline.transform(train).selectExpr(*nlp_output_cols)
test = nlp_pipeline.transform(test).selectExpr(*nlp_output_cols)


In [15]:
# Save the fitted preprocessing pipeline, overwriting any earlier save at this path.
pipeline_writer = nlp_pipeline.write().overwrite()
pipeline_writer.save('nlp_pipeline_1')

Creating a simple version of doc2vec: computing the average of the word vectors in a document to create a document vector.

In [16]:
import numpy as np
from pyspark.sql.types import *
from pyspark.ml.linalg import DenseVector, VectorUDT

def avg_wordvecs_fun(wordvecs):
  """Average a document's word vectors into a single document vector.

  Args:
    wordvecs: sequence of equal-length numeric vectors, one per token.

  Returns:
    DenseVector holding the element-wise mean across the tokens.
  """
  return DenseVector(np.mean(wordvecs, axis=0))

avg_wordvecs = spark.udf.register('avg_wordvecs', avg_wordvecs_fun, returnType=VectorUDT())

train = train.withColumn('avg_wordvec', avg_wordvecs('embeddings'))
test = test.withColumn('avg_wordvec', avg_wordvecs('embeddings'))
# The per-token embeddings are not needed once averaged. The column name must
# be spelled exactly: drop() silently ignores names that do not exist, so a
# misspelling leaves the column in place before persist().
train = train.drop('embeddings')
test = test.drop('embeddings')
train = train.persist()
test = test.persist()

Featurizing with TF-IDF features

In [17]:
from pyspark.ml.feature import CountVectorizer, IDF

In [18]:
# Term counts over the normalized tokens, followed by IDF weighting of those counts.
tf = CountVectorizer(inputCol='normalized', outputCol='tf')
idf = IDF(inputCol='tf', outputCol='tdifd')

featurizer = Pipeline(stages=[tf, idf])

Building the model using Logistic Regression

In [19]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

In [20]:
# Only 'avg_wordvec' is assembled into 'features', so the classifier trains on the
# averaged embeddings alone; the featurizer stage still runs but its output is not used.
vec_assembler = VectorAssembler(inputCols=['avg_wordvec'], outputCol='features')
log_reg = LogisticRegression(featuresCol='features', labelCol='label')

model_pipeline = Pipeline(stages=[featurizer, vec_assembler, log_reg])

In [21]:
# Fit every stage of the model pipeline on the training set
model = model_pipeline.fit(train)

In [22]:
# Run the fitted model over the training set
train_preds = model.transform(train)

In [23]:
# Run the fitted model over the held-out test set
test_preds = model.transform(test)

# **Model Evaluation**
The model is evaluated by computing the F1 score on the train and test sets.

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
# Evaluator configured to report the F1 metric.
evaluator = MulticlassClassificationEvaluator(metricName='f1')

In [26]:
# F1 on the training-set predictions
evaluator.evaluate(train_preds)

0.8025998329204984

In [27]:
# F1 on the test-set predictions
evaluator.evaluate(test_preds)

0.8015517696374732