In [2]:
# ch09 自然语言处理
# 切词
df = spark.createDataFrame([(1,'I really liked this movie'),
                         (2,'I would recommend this movie to my friends'),
                         (3,'movie was alright but acting was horrible'),
                         (4,'I am never watching that movie ever again')],
                        ['user_id', 'review'])

In [3]:
df.show(4, False)

+-------+------------------------------------------+
|user_id|review                                    |
+-------+------------------------------------------+
|1      |I really liked this movie                 |
|2      |I would recommend this movie to my friends|
|3      |movie was alright but acting was horrible |
|4      |I am never watching that movie ever again |
+-------+------------------------------------------+



In [4]:
# spark引入Tokenizer
from pyspark.ml.feature import Tokenizer
tokenization = Tokenizer(inputCol = 'review', outputCol = 'tokens')
tokenized_df = tokenization.transform(df)
tokenized_df.show(4, False)

+-------+------------------------------------------+---------------------------------------------------+
|user_id|review                                    |tokens                                             |
+-------+------------------------------------------+---------------------------------------------------+
|1      |I really liked this movie                 |[i, really, liked, this, movie]                    |
|2      |I would recommend this movie to my friends|[i, would, recommend, this, movie, to, my, friends]|
|3      |movie was alright but acting was horrible |[movie, was, alright, but, acting, was, horrible]  |
|4      |I am never watching that movie ever again |[i, am, never, watching, that, movie, ever, again] |
+-------+------------------------------------------+---------------------------------------------------+



In [6]:
# 移除停用词
from pyspark.ml.feature import StopWordsRemover
stopword_removal = StopWordsRemover(inputCol='tokens', outputCol='refined_tokens')
refined_df = stopword_removal.transform(tokenized_df)
refined_df.select(['user_id', 'tokens', 'refined_tokens']).show(4, False)

+-------+---------------------------------------------------+----------------------------------+
|user_id|tokens                                             |refined_tokens                    |
+-------+---------------------------------------------------+----------------------------------+
|1      |[i, really, liked, this, movie]                    |[really, liked, movie]            |
|2      |[i, would, recommend, this, movie, to, my, friends]|[recommend, movie, friends]       |
|3      |[movie, was, alright, but, acting, was, horrible]  |[movie, alright, acting, horrible]|
|4      |[i, am, never, watching, that, movie, ever, again] |[never, watching, movie, ever]    |
+-------+---------------------------------------------------+----------------------------------+



In [7]:
# 词袋，计数向量器
from pyspark.ml.feature import CountVectorizer
count_vec = CountVectorizer(inputCol='refined_tokens', outputCol = 'features')
cv_df = count_vec.fit(refined_df).transform(refined_df)
cv_df.select(['user_id', 'refined_tokens', 'features']).show(4, False)
# 向量长度11位（词袋大小，one-hot），第一行数据有三个值，位置是0，4，7.

+-------+----------------------------------+--------------------------------+
|user_id|refined_tokens                    |features                        |
+-------+----------------------------------+--------------------------------+
|1      |[really, liked, movie]            |(11,[0,4,7],[1.0,1.0,1.0])      |
|2      |[recommend, movie, friends]       |(11,[0,1,10],[1.0,1.0,1.0])     |
|3      |[movie, alright, acting, horrible]|(11,[0,5,6,9],[1.0,1.0,1.0,1.0])|
|4      |[never, watching, movie, ever]    |(11,[0,2,3,8],[1.0,1.0,1.0,1.0])|
+-------+----------------------------------+--------------------------------+



In [8]:
# 展示词典
count_vec.fit(refined_df).vocabulary

['movie',
 'horrible',
 'liked',
 'really',
 'watching',
 'alright',
 'friends',
 'recommend',
 'ever',
 'never',
 'acting']

In [9]:
# TF-IDF
from pyspark.ml.feature import HashingTF, IDF
hashing_vec = HashingTF(inputCol='refined_tokens', outputCol='tf_features')
hashing_df = hashing_vec.transform(refined_df)
hashing_df.select(['user_id', 'refined_tokens', 'tf_features']).show(4, False)

+-------+----------------------------------+-------------------------------------------------------+
|user_id|refined_tokens                    |tf_features                                            |
+-------+----------------------------------+-------------------------------------------------------+
|1      |[really, liked, movie]            |(262144,[14,32675,155321],[1.0,1.0,1.0])               |
|2      |[recommend, movie, friends]       |(262144,[129613,155321,222394],[1.0,1.0,1.0])          |
|3      |[movie, alright, acting, horrible]|(262144,[80824,155321,236263,240286],[1.0,1.0,1.0,1.0])|
|4      |[never, watching, movie, ever]    |(262144,[63139,155321,203802,245806],[1.0,1.0,1.0,1.0])|
+-------+----------------------------------+-------------------------------------------------------+



In [10]:
# 计算IDF
tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')
tf_idf = tf_idf_vec.fit(hashing_df).transform(hashing_df)
tf_idf.select('user_id', 'tf_idf_features').show(4, False)

+-------+----------------------------------------------------------------------------------------------------+
|user_id|tf_idf_features                                                                                     |
+-------+----------------------------------------------------------------------------------------------------+
|1      |(262144,[14,32675,155321],[0.9162907318741551,0.9162907318741551,0.0])                              |
|2      |(262144,[129613,155321,222394],[0.9162907318741551,0.0,0.9162907318741551])                         |
|3      |(262144,[80824,155321,236263,240286],[0.9162907318741551,0.0,0.9162907318741551,0.9162907318741551])|
|4      |(262144,[63139,155321,203802,245806],[0.9162907318741551,0.0,0.9162907318741551,0.9162907318741551])|
+-------+----------------------------------------------------------------------------------------------------+



In [12]:
# 使用机器学习进行文本分类
text_df = spark.read.csv('Movie_reviews.csv', inferSchema=True, header=True, sep=',')
text_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [13]:
text_df.count()

7087

In [14]:
# 选取被正确标记的记录
text_df = text_df.filter((text_df.Sentiment=='1')| (text_df.Sentiment=='0'))
text_df.count()

6990

In [15]:
text_df.groupBy('Sentiment').count().show()

+---------+-----+
|Sentiment|count|
+---------+-----+
|        0| 3081|
|        1| 3909|
+---------+-----+



In [17]:
from pyspark.sql.functions import rand
text_df.groupby(rand()).count().show(10, False)

+-------------------------+-----+
|rand(3948031753215614132)|count|
+-------------------------+-----+
|0.177254188557608        |1    |
|0.50533733000834         |1    |
|0.48382393572454807      |1    |
|0.18270550609738534      |1    |
|0.5983687619538715       |1    |
|0.5535985762803357       |1    |
|0.5376104088943879       |1    |
|0.47355665865667795      |1    |
|0.11478891688308612      |1    |
|0.9395946717402214       |1    |
+-------------------------+-----+
only showing top 10 rows



In [19]:
text_df = text_df.withColumn("Label", text_df.Sentiment.cast("float")).drop('Sentiment')
text_df.orderBy(rand()).show(10, False)

+------------------------------------------------------------------------+-----+
|Review                                                                  |Label|
+------------------------------------------------------------------------+-----+
|Is it just me, or does Harry Potter suck?...                            |0.0  |
|and i hate Harry Potter.                                                |0.0  |
|Because I would like to make friends who like the same things I like, an|1.0  |
|Combining the opinion / review from Gary and Gin Zen, The Da Vinci Code |0.0  |
|Which is why i said silent hill turned into reality coz i was hella like|1.0  |
|I, too, like Harry Potter..                                             |1.0  |
|the last stand and Mission Impossible 3 both were awesome movies.       |1.0  |
|Brokeback Mountain was boring.                                          |0.0  |
|meganpenworthy dressed as a character from Harry Potter and the Selfish |0.0  |
|Me, I like the Harry Potter

In [20]:
# 增加一个额外的length列
from pyspark.sql.functions import length
text_df=text_df.withColumn('length', length(text_df['Review']))
text_df.orderBy(rand()).show(10, False)

+------------------------------------------------------------------------+-----+------+
|Review                                                                  |Label|length|
+------------------------------------------------------------------------+-----+------+
|Because I would like to make friends who like the same things I like, an|1.0  |72    |
|man i loved brokeback mountain!                                         |1.0  |31    |
|Then snuck into Brokeback Mountain, which is the most depressing movie I|0.0  |72    |
|I like Mission Impossible movies because you never know who's on the rig|1.0  |72    |
|I either LOVE Brokeback Mountain or think it's great that homosexuality |1.0  |71    |
|"I liked the first "" Mission Impossible."                              |1.0  |42    |
|I hate Harry Potter, it's retarted, gay and stupid and there's only one |0.0  |71    |
|Brokeback Mountain is a beautiful movie..                               |1.0  |41    |
|The Da Vinci Code was absolutel

In [21]:
text_df.groupBy('Label').agg({'Length': 'mean'}).show()

+-----+-----------------+
|Label|      avg(Length)|
+-----+-----------------+
|  1.0|47.61882834484523|
|  0.0|50.95845504706264|
+-----+-----------------+



In [22]:
tokenization = Tokenizer(inputCol='Review', outputCol='tokens')
tokenized_df = tokenization.transform(text_df)
stopword_removal = StopWordsRemover(inputCol='tokens', outputCol='refined_tokens')
refined_text_df = stopword_removal.transform(tokenized_df)

In [23]:
# 捕获评论中的标记数量（分词，去停用词后的数量）
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *

In [24]:
len_udf=udf(lambda s:len(s), IntegerType())

In [25]:
# col函数：根据给定的列名返回一个列。
refined_text_df = refined_text_df.withColumn("token_count", len_udf(col('refined_tokens')))
refined_text_df.orderBy(rand()).show(10)

+--------------------+-----+------+--------------------+--------------------+-----------+
|              Review|Label|length|              tokens|      refined_tokens|token_count|
+--------------------+-----+------+--------------------+--------------------+-----------+
|man i loved broke...|  1.0|    31|[man, i, loved, b...|[man, loved, brok...|          4|
|mission impossibl...|  1.0|    37|[mission, impossi...|[mission, impossi...|          4|
|The Da Vinci Code...|  1.0|    57|[the, da, vinci, ...|[da, vinci, code,...|          7|
|Brokeback Mountai...|  0.0|    30|[brokeback, mount...|[brokeback, mount...|          3|
|I LOVE Harry Pott...|  1.0|    22|[i, love, harry, ...|[love, harry, pot...|          3|
|da vinci code was...|  1.0|    37|[da, vinci, code,...|[da, vinci, code,...|          5|
|Which is why i sa...|  1.0|    72|[which, is, why, ...|[said, silent, hi...|          8|
|The Da Vinci Code...|  0.0|    34|[the, da, vinci, ...|[da, vinci, code,...|          6|
|i loved b

In [32]:
count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
cv_text_df = count_vec.fit(refined_text_df).transform(refined_text_df)
#cv_text_df.head()
cv_text_df.select(['token_count', 'features', 'Label']).show(10, False)

+-----------+----------------------------------------------------------------------------------+-----+
|token_count|features                                                                          |Label|
+-----------+----------------------------------------------------------------------------------+-----+
|5          |(2302,[0,1,4,43,236],[1.0,1.0,1.0,1.0,1.0])                                       |1.0  |
|9          |(2302,[11,51,229,237,275,742,824,1087,1250],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|1.0  |
|5          |(2302,[0,1,4,53,356],[1.0,1.0,1.0,1.0,1.0])                                       |1.0  |
|5          |(2302,[0,1,4,53,356],[1.0,1.0,1.0,1.0,1.0])                                       |1.0  |
|8          |(2302,[0,1,4,53,655,1339,1427,1449],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])            |1.0  |
|6          |(2302,[46,229,271,1150,1990,2203],[1.0,1.0,1.0,1.0,1.0,1.0])                      |1.0  |
|8          |(2302,[0,1,22,30,111,219,389,535],[1.0,1.0,1.0,1.0,1.0,1.0,1

In [33]:
model_text_df = cv_text_df.select(['features', 'token_count', 'Label'])
# 使用VectorAssembler 创建特征
from pyspark.ml.feature import VectorAssembler
df_assembler = VectorAssembler(inputCols=['features', 'token_count'], outputCol = 'features_vec')
model_text_df=df_assembler.transform(model_text_df)
model_text_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- token_count: integer (nullable = true)
 |-- Label: float (nullable = true)
 |-- features_vec: vector (nullable = true)



In [34]:
# 使用LR分类器
from pyspark.ml.classification import LogisticRegression
# 区分训练集
train_df, test_df = model_text_df.randomSplit([0.75, 0.25])
train_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 2910|
|  0.0| 2358|
+-----+-----+



In [35]:
test_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0|  999|
|  0.0|  723|
+-----+-----+



In [36]:
log_reg = LogisticRegression(featuresCol='features_vec', labelCol='Label').fit(train_df)

In [37]:
results=log_reg.evaluate(test_df).predictions
results.show()

+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|            features|token_count|Label|        features_vec|       rawPrediction|         probability|prediction|
+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|(2302,[0,1,4,5,89...|          9|  1.0|(2303,[0,1,4,5,89...|[-16.546253522916...|[6.51708477704275...|       1.0|
|(2302,[0,1,4,5,30...|          5|  1.0|(2303,[0,1,4,5,30...|[-24.003411758523...|[3.76227664317585...|       1.0|
|(2302,[0,1,4,5,44...|          5|  1.0|(2303,[0,1,4,5,44...|[-22.822384754074...|[1.22564370045345...|       1.0|
|(2302,[0,1,4,5,65...|          5|  1.0|(2303,[0,1,4,5,65...|[-15.815520954707...|[1.35333857882513...|       1.0|
|(2302,[0,1,4,5,82...|          6|  1.0|(2303,[0,1,4,5,82...|[-15.915751409526...|[1.22426922994062...|       1.0|
|(2302,[0,1,4,10,1...|         10|  0.0|(2303,[0,1,4,10,1...|[34.1923988502912..

In [40]:
# 评估
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# tp 预测真，真实也真
true_postives = results[(results.Label == 1) & (results.prediction ==1)].count()
# fp 预测真，真实假
false_postives = results[(results.Label == 0) & (results.prediction == 1)].count()
# tn
true_negatives = results[(results.Label == 0) & (results.prediction == 0)].count()
# fn
false_negatives = results[(results.Label == 1) & (results.prediction == 0)].count()

In [41]:
recall = float(true_postives) / (float(true_postives) + float(false_negatives))
print(recall)

0.98998998998999


In [43]:
# 精确度
precision = float(true_postives) / (true_postives + false_postives)
print(precision)

0.9696078431372549


In [44]:
# 准确率
acccuracy = float((true_postives + true_negatives)/ results.count())
print(acccuracy)

0.9761904761904762
