In [1]:
# Create (or reuse) the SparkSession used by every cell below.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [2]:
# Four toy reviews used to demonstrate each text-processing step.
df = spark.createDataFrame(
    data=[
        (1, 'I really liked this movie'),
        (2, 'I would recommend this movie to my friends'),
        (3, 'movie was alright but acting was horrible'),
        (4, 'I am never watching that movie ever again'),
    ],
    schema=['user_id', 'review'],
)

In [5]:
# Print up to 10 rows without truncating the review text.
df.show(n=10, truncate=False)

+-------+------------------------------------------+
|user_id|review                                    |
+-------+------------------------------------------+
|1      |I really liked this movie                 |
|2      |I would recommend this movie to my friends|
|3      |movie was alright but acting was horrible |
|4      |I am never watching that movie ever again |
+-------+------------------------------------------+



### 分词（Tokenization）

In [6]:
from pyspark.ml.feature import Tokenizer

In [7]:
# Split each review into a list of tokens stored in a new 'tokens' column.
tokenization = Tokenizer(
    inputCol='review',
    outputCol='tokens',
)
tokenized_df = tokenization.transform(df)

In [8]:
# Inspect the tokenized reviews (up to 5 rows, untruncated).
tokenized_df.show(n=5, truncate=False)

+-------+------------------------------------------+---------------------------------------------------+
|user_id|review                                    |tokens                                             |
+-------+------------------------------------------+---------------------------------------------------+
|1      |I really liked this movie                 |[i, really, liked, this, movie]                    |
|2      |I would recommend this movie to my friends|[i, would, recommend, this, movie, to, my, friends]|
|3      |movie was alright but acting was horrible |[movie, was, alright, but, acting, was, horrible]  |
|4      |I am never watching that movie ever again |[i, am, never, watching, that, movie, ever, again] |
+-------+------------------------------------------+---------------------------------------------------+



### 停用词

In [9]:
from pyspark.ml.feature import StopWordsRemover

In [10]:
# No custom stop-word list is passed, so the remover's default list applies.
stopword_removal = StopWordsRemover(
    inputCol='tokens',
    outputCol='refined_tokens',
)

In [11]:
# Add the 'refined_tokens' column: the tokens with stop words removed.
refined_df = stopword_removal.transform(tokenized_df)

In [12]:
# Compare the raw tokens with the stop-word-filtered tokens side by side.
(
    refined_df
    .select('user_id', 'tokens', 'refined_tokens')
    .show(n=10, truncate=False)
)

+-------+---------------------------------------------------+----------------------------------+
|user_id|tokens                                             |refined_tokens                    |
+-------+---------------------------------------------------+----------------------------------+
|1      |[i, really, liked, this, movie]                    |[really, liked, movie]            |
|2      |[i, would, recommend, this, movie, to, my, friends]|[recommend, movie, friends]       |
|3      |[movie, was, alright, but, acting, was, horrible]  |[movie, alright, acting, horrible]|
|4      |[i, am, never, watching, that, movie, ever, again] |[never, watching, movie, ever]    |
+-------+---------------------------------------------------+----------------------------------+



### 计数向量器

In [13]:
from pyspark.ml.feature import CountVectorizer

In [14]:
# Configure the count vectorizer only; fitting and transforming happen in
# separate later cells so the learned vocabulary can be inspected in between.
count_vec = CountVectorizer(
    inputCol='refined_tokens',
    outputCol='features',
)

In [22]:
# Fit the vectorizer on the refined tokens. NOTE: despite the `_df` suffix,
# `cv_df` is the fitted model (later cells read `.vocabulary` from it and call
# `.transform` on it), not a DataFrame.
cv_df=count_vec.fit(refined_df)

In [23]:
# Display the vocabulary learned by the fitted count-vectorizer model; a
# term's position in this list is its index in the 'features' vectors.
cv_df.vocabulary

['movie',
 'horrible',
 'liked',
 'alright',
 'friends',
 'recommend',
 'acting',
 'never',
 'really',
 'watching',
 'ever']

In [24]:
# Apply the fitted model and show the resulting sparse count vectors.
(
    cv_df.transform(refined_df)
    .select('user_id', 'refined_tokens', 'features')
    .show(n=10, truncate=False)
)

+-------+----------------------------------+---------------------------------+
|user_id|refined_tokens                    |features                         |
+-------+----------------------------------+---------------------------------+
|1      |[really, liked, movie]            |(11,[0,2,8],[1.0,1.0,1.0])       |
|2      |[recommend, movie, friends]       |(11,[0,4,5],[1.0,1.0,1.0])       |
|3      |[movie, alright, acting, horrible]|(11,[0,1,3,6],[1.0,1.0,1.0,1.0]) |
|4      |[never, watching, movie, ever]    |(11,[0,7,9,10],[1.0,1.0,1.0,1.0])|
+-------+----------------------------------+---------------------------------+



### TF-IDF

In [25]:
from pyspark.ml.feature import HashingTF,IDF

In [38]:
# Term-frequency vectors via feature hashing into a 100-dimensional space.
hashing_vec = HashingTF(
    inputCol='refined_tokens',
    outputCol='tf_features',
    numFeatures=100,
)

In [39]:
# HashingTF needs no fitting; transform directly to add 'tf_features'.
hashing_df = hashing_vec.transform(refined_df)

In [40]:
# Show the hashed term-frequency vector for each review.
(
    hashing_df
    .select('user_id', 'refined_tokens', 'tf_features')
    .show(n=4, truncate=False)
)

+-------+----------------------------------+-------------------------------------+
|user_id|refined_tokens                    |tf_features                          |
+-------+----------------------------------+-------------------------------------+
|1      |[really, liked, movie]            |(100,[12,39,88],[1.0,1.0,1.0])       |
|2      |[recommend, movie, friends]       |(100,[16,39,99],[1.0,1.0,1.0])       |
|3      |[movie, alright, acting, horrible]|(100,[5,23,39,66],[1.0,1.0,1.0,1.0]) |
|4      |[never, watching, movie, ever]    |(100,[39,75,81,94],[1.0,1.0,1.0,1.0])|
+-------+----------------------------------+-------------------------------------+



In [41]:
# IDF estimator that rescales the hashed term frequencies.
tf_idf_vec = IDF(
    inputCol='tf_features',
    outputCol='tf_idf_features',
)

In [42]:
# Fit the IDF weights on the hashed frame, then apply them to that same frame.
tf_idf_df = (
    tf_idf_vec
    .fit(hashing_df)
    .transform(hashing_df)
)

In [43]:
# Show the TF-IDF weighted vectors.
(
    tf_idf_df
    .select('user_id', 'tf_idf_features')
    .show(n=4, truncate=False)
)

+-------+----------------------------------------------------------------------------------+
|user_id|tf_idf_features                                                                   |
+-------+----------------------------------------------------------------------------------+
|1      |(100,[12,39,88],[0.9162907318741551,0.0,0.9162907318741551])                      |
|2      |(100,[16,39,99],[0.9162907318741551,0.0,0.9162907318741551])                      |
|3      |(100,[5,23,39,66],[0.9162907318741551,0.9162907318741551,0.0,0.9162907318741551]) |
|4      |(100,[39,75,81,94],[0.0,0.9162907318741551,0.9162907318741551,0.9162907318741551])|
+-------+----------------------------------------------------------------------------------+



## 使用机器学习进行分类

In [44]:
# Load the labelled movie reviews from a comma-separated file with a header
# row, letting Spark infer the column types.
text_df = spark.read.csv(
    'Movie_reviews.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

In [45]:
# Print the column names and the types Spark inferred for the loaded CSV.
text_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [46]:
# Row count followed by column count of the raw data.
print(
    text_df.count(),
    len(text_df.columns),
)

7087 2


In [47]:
from pyspark.sql.functions import rand 

In [48]:
# Peek at 10 rows in random order; rand() is unseeded, so the rows shown
# differ from run to run.
(
    text_df
    .orderBy(rand())
    .show(n=10, truncate=False)
)

+------------------------------------------------------------------------+---------+
|Review                                                                  |Sentiment|
+------------------------------------------------------------------------+---------+
|I like Mission Impossible movies because you never know who's on the rig|1        |
|Brokeback Mountain is a beautiful movie...                              |1        |
|I love Brokeback Mountain.                                              |1        |
|we're gonna like watch Mission Impossible or Hoot.(                     |1        |
|Now, I am the first person to say that The Da Vinci Code sucks, but hell|0        |
|the da vinci code is awesome!                                           |1        |
|loved the preview for mission impossible III.                           |1        |
|Then snuck into Brokeback Mountain, which is the most depressing movie I|0        |
|Always knows what I want, not guy crazy, hates Harry Potter..   

In [49]:
# Count rows per Sentiment value; anything other than '0'/'1' in the output
# indicates rows that need to be filtered out before modelling.
(
    text_df
    .groupBy("Sentiment")
    .count()
    .show()
)

+--------------------+-----+
|           Sentiment|count|
+--------------------+-----+
|                  ,0|    1|
|. but "" Angel an...|    1|
|                   0| 3081|
| "" you see Demen...|    1|
| but due to the s...|    1|
| the story of "" ...|    1|
| and not because ...|    1|
|            oddly e"|    1|
|   but I still feel"|    1|
|              my God|    1|
| I decided to wri...|    1|
| but it was reall...|    1|
|  but I hate the Da"|    1|
|                   1| 3909|
| but immensely we...|    1|
|             with f"|    1|
|               also"|   80|
|      or how I love"|    1|
|                 Joe|    1|
| which was really...|    1|
+--------------------+-----+



### 数据筛选

In [50]:
# Keep only the rows whose Sentiment is exactly the string '1' or '0'.
text_df = text_df.filter(text_df.Sentiment.isin('1', '0'))

In [51]:
# (rows, columns) remaining after the Sentiment filter.
print(
    (
        text_df.count(),
        len(text_df.columns),
    )
)

(6990, 2)


In [52]:
# Add a float 'Label' column cast from the string Sentiment, then drop the
# original Sentiment column.
text_df = (
    text_df
    .withColumn("Label", text_df['Sentiment'].cast('float'))
    .drop('Sentiment')
)

In [53]:
# Spot-check 10 randomly ordered rows after the label conversion (unseeded,
# so the sample changes on every run).
(
    text_df
    .orderBy(rand())
    .show(n=10, truncate=False)
)

+------------------------------------------------------------------------+-----+
|Review                                                                  |Label|
+------------------------------------------------------------------------+-----+
|i heard da vinci code sucked soo much only 2.5 stars:                   |0.0  |
|friday hung out with kelsie and we went and saw The Da Vinci Code SUCKED|0.0  |
|I hate it though, because I really like his Mission Impossible films, so|1.0  |
|And they all involved Harry Potter * is lame *..                        |0.0  |
|Brokeback Mountain was boring.                                          |0.0  |
|I hate Harry Potter.                                                    |0.0  |
|The Da Vinci Code was absolutely AWESOME!                               |1.0  |
|i just love Da Vinci Code so much!                                      |1.0  |
|Brokeback Mountain was boring.                                          |0.0  |
|As I sit here, watching the

### 数据分词并去除停用词

In [54]:
# Tokenizer for the real data set; the text column here is 'Review'.
tokenization = Tokenizer(
    inputCol='Review',
    outputCol='tokens',
)

In [55]:
# Tokenize every review in the filtered, labelled frame.
tokenized_df = tokenization.transform(text_df)

In [58]:
# Preview the tokenized reviews using show()'s defaults, spelled out.
tokenized_df.show(n=20, truncate=True)

+--------------------+-----+--------------------+
|              Review|Label|              tokens|
+--------------------+-----+--------------------+
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|this was the firs...|  1.0|[this, was, the, ...|
|i liked the Da Vi...|  1.0|[i, liked, the, d...|
|i liked the Da Vi...|  1.0|[i, liked, the, d...|
|I liked the Da Vi...|  1.0|[i, liked, the, d...|
|that's not even a...|  1.0|[that's, not, eve...|
|I loved the Da Vi...|  1.0|[i, loved, the, d...|
|i thought da vinc...|  1.0|[i, thought, da, ...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|I thought the Da ...|  1.0|[i, thought, the,...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|then I turn on th...|  1.0|[then, i, turn, o...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|i love da vinci c...|  1.0|[i, love, da, vin...|
|i loved da vinci ...|  1.0|[i, loved, da, vi...|
|TO NIGHT:: THE DA...|  1.0|[to, night::, the...|


In [59]:
# Stop-word remover for the review tokens (default stop-word list).
stopword_removal = StopWordsRemover(
    inputCol='tokens',
    outputCol='refined_tokens',
)

In [60]:
# Add 'refined_tokens' (tokens minus stop words) to the tokenized reviews.
refined_text_df = stopword_removal.transform(tokenized_df)

In [61]:
# Preview the result using show()'s defaults, spelled out.
refined_text_df.show(n=20, truncate=True)

+--------------------+-----+--------------------+--------------------+
|              Review|Label|              tokens|      refined_tokens|
+--------------------+-----+--------------------+--------------------+
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|[da, vinci, code,...|
|this was the firs...|  1.0|[this, was, the, ...|[first, clive, cu...|
|i liked the Da Vi...|  1.0|[i, liked, the, d...|[liked, da, vinci...|
|i liked the Da Vi...|  1.0|[i, liked, the, d...|[liked, da, vinci...|
|I liked the Da Vi...|  1.0|[i, liked, the, d...|[liked, da, vinci...|
|that's not even a...|  1.0|[that's, not, eve...|[even, exaggerati...|
|I loved the Da Vi...|  1.0|[i, loved, the, d...|[loved, da, vinci...|
|i thought da vinc...|  1.0|[i, thought, da, ...|[thought, da, vin...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|[da, vinci, code,...|
|I thought the Da ...|  1.0|[i, thought, the,...|[thought, da, vin...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|[da, vinci, code,...|
|The D

### 计数向量器

In [62]:
# Learn the vocabulary from the refined review tokens and encode each review
# as a count vector in the 'features' column.
count_vec = CountVectorizer(
    inputCol='refined_tokens',
    outputCol='features',
)
cv_text_df = (
    count_vec
    .fit(refined_text_df)
    .transform(refined_text_df)
)

In [64]:
# Preview tokens, count vectors and labels for the first 10 rows.
(
    cv_text_df
    .select('refined_tokens', 'features', 'Label')
    .show(n=10)
)

+--------------------+--------------------+-----+
|      refined_tokens|            features|Label|
+--------------------+--------------------+-----+
|[da, vinci, code,...|(2302,[0,1,4,43,2...|  1.0|
|[first, clive, cu...|(2302,[11,51,229,...|  1.0|
|[liked, da, vinci...|(2302,[0,1,4,53,3...|  1.0|
|[liked, da, vinci...|(2302,[0,1,4,53,3...|  1.0|
|[liked, da, vinci...|(2302,[0,1,4,53,6...|  1.0|
|[even, exaggerati...|(2302,[46,229,271...|  1.0|
|[loved, da, vinci...|(2302,[0,1,22,30,...|  1.0|
|[thought, da, vin...|(2302,[0,1,4,228,...|  1.0|
|[da, vinci, code,...|(2302,[0,1,4,33,2...|  1.0|
|[thought, da, vin...|(2302,[0,1,4,223,...|  1.0|
+--------------------+--------------------+-----+
only showing top 10 rows



### 建模数据

In [65]:
# Keep only the columns the classifier needs: the features and the label.
model_text_df = cv_text_df.select('features', 'Label')

In [66]:
from pyspark.ml.feature import VectorAssembler

In [67]:
# Assemble the single 'features' column into 'features_vec', the column the
# logistic-regression cell below reads as its feature input.
df_assembler = VectorAssembler(
    inputCols=['features'],
    outputCol='features_vec',
)
model_text_df = df_assembler.transform(model_text_df)

In [68]:
# Print the schema of the modelling frame to confirm its columns and types.
model_text_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Label: float (nullable = true)
 |-- features_vec: vector (nullable = true)



### 逻辑回归建模

In [69]:
from pyspark.ml.classification import LogisticRegression

In [70]:
# Split 75% / 25% into training and test sets. A fixed seed keeps the split,
# and therefore the accuracy reported below, repeatable across runs on the
# same input data; without it every run evaluates on a different test set.
training_df, test_df = model_text_df.randomSplit([0.75, 0.25], seed=42)

In [71]:
# Train a logistic-regression classifier on the training split; `log_reg`
# holds the fitted model returned by fit().
log_reg = LogisticRegression(
    featuresCol='features_vec',
    labelCol='Label',
).fit(training_df)

In [72]:
# Evaluate on the held-out split and keep the predictions frame.
results = (
    log_reg
    .evaluate(test_df)
    .predictions
)

In [73]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [74]:
# Accuracy on the test predictions, compared against the 'Label' column
# (the evaluator's prediction column is left at its default).
accuracy = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Label',
).evaluate(results)

In [75]:
# Display the test-set accuracy computed in the previous cell.
accuracy

0.9775219298245614