In [1]:
from pyspark.sql.functions import rand 
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StopWordsRemover

In [2]:
spark=SparkSession.builder.appName('word2vec').getOrCreate()

### 导入数据

In [3]:
text_df=spark.read.csv('Movie_reviews.csv',inferSchema=True,header=True,sep=',')

In [4]:
#数据清洗
text_df=text_df.filter(((text_df.Sentiment =='1') | (text_df.Sentiment =='0')))

In [5]:
print((text_df.count(),len(text_df.columns)))

(6990, 2)


In [6]:
#标签转换
text_df = text_df.withColumn("Label", text_df.Sentiment.cast('float')).drop('Sentiment')

In [15]:
text_df.orderBy(rand()).show(10,False)

+------------------------------------------------------------------------+-----+
|Review                                                                  |Label|
+------------------------------------------------------------------------+-----+
|by the way, the Da Vinci Code sucked, just letting you know...          |0.0  |
|Brokeback Mountain was an AWESOME movie.                                |1.0  |
|DA VINCI CODE IS AWESOME!!                                              |1.0  |
|the people who are worth it know how much i love the da vinci code.     |1.0  |
|Brokeback Mountain is fucking horrible..                                |0.0  |
|I love Brokeback Mountain.                                              |1.0  |
|Mission Impossible 3 was excellent.                                     |1.0  |
|mission impossible 2 rocks!!....                                        |1.0  |
|Murderball-Immediately after Crash won the Oscar, there was a lot of bul|0.0  |
|I heard Da Vinci Code sucke

### 数据清洗

In [9]:
from pyspark.sql.functions import regexp_replace,col

In [29]:
df=text_df.withColumn('Review', regexp_replace(col('Review'), "\\.", "")).withColumn('Review', regexp_replace(col('Review'), "\\!", ""))

In [30]:
df.orderBy(rand()).show(10,False)

+------------------------------------------------------------------------+-----+
|Review                                                                  |Label|
+------------------------------------------------------------------------+-----+
|As I sit here, watching the MTV Movie Awards, I am reminded of how much |0.0  |
|I am a Christian and I absolutely HATE the Da Vinci Code                |0.0  |
|* Mission Impossible III is an absolutely awesome possum movie          |1.0  |
|I, too, like Harry Potter                                               |1.0  |
|I hated The Da Vinci Code                                               |0.0  |
|I love The Da Vinci Code                                                |1.0  |
|I am going to start reading the Harry Potter series again because that i|1.0  |
|The Da Vinci Code was awesome, I can't wait to read it                  |1.0  |
|Because I would like to make friends who like the same things I like, an|1.0  |
|Always knows what I want, n

### 数据标记

In [31]:
tokenization=Tokenizer(inputCol='Review',outputCol='tokens')
tokenized_df=tokenization.transform(df)

In [32]:
tokenized_df.show()

+--------------------+-----+--------------------+
|              Review|Label|              tokens|
+--------------------+-----+--------------------+
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|this was the firs...|  1.0|[this, was, the, ...|
|i liked the Da Vi...|  1.0|[i, liked, the, d...|
|i liked the Da Vi...|  1.0|[i, liked, the, d...|
|I liked the Da Vi...|  1.0|[i, liked, the, d...|
|that's not even a...|  1.0|[that's, not, eve...|
|I loved the Da Vi...|  1.0|[i, loved, the, d...|
|i thought da vinc...|  1.0|[i, thought, da, ...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|I thought the Da ...|  1.0|[i, thought, the,...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|then I turn on th...|  1.0|[then, i, turn, o...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|
|i love da vinci code|  1.0|[i, love, da, vin...|
|i loved da vinci ...|  1.0|[i, loved, da, vi...|
|TO NIGHT:: THE DA...|  1.0|[to, night::, the...|


### 创建Word2Vec

In [36]:
word2Vec = Word2Vec(vectorSize=20, minCount=0, inputCol="tokens", outputCol="Vec")
model = word2Vec.fit(tokenized_df)

### 查看向量

In [37]:
vectors = model.getVectors()
vectors.show()

+----------+--------------------+
|      word|              vector|
+----------+--------------------+
| forgotten|[0.06974968314170...|
|   speaker|[0.02460335753858...|
|  terrible|[0.08723274618387...|
|     mpreg|[-0.0167238153517...|
|     looks|[0.07572395354509...|
|   firstly|[0.02391334623098...|
|      movi|[0.00823700334876...|
|  scenario|[-0.0231255926191...|
|     ideas|[-0.0085466019809...|
|    esther|[0.01806709542870...|
|      used|[-0.0273626651614...|
|       eye|[0.00155626796185...|
|     bikes|[0.01246209908276...|
| reference|[-0.0011805207468...|
| beautiful|[0.11400406062602...|
|"christmas|[0.00261365668848...|
|    playin|[0.02212674915790...|
|    sunday|[-0.0196329616010...|
|     funny|[-0.0205662474036...|
| precious,|[-0.0075996569357...|
+----------+--------------------+
only showing top 20 rows



In [38]:
result = model.transform(tokenized_df)
result.show()

+--------------------+-----+--------------------+--------------------+
|              Review|Label|              tokens|                 Vec|
+--------------------+-----+--------------------+--------------------+
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|[-0.4523605648428...|
|this was the firs...|  1.0|[this, was, the, ...|[-0.0664692324732...|
|i liked the Da Vi...|  1.0|[i, liked, the, d...|[-0.3672762550413...|
|i liked the Da Vi...|  1.0|[i, liked, the, d...|[-0.3672762550413...|
|I liked the Da Vi...|  1.0|[i, liked, the, d...|[-0.2843162438521...|
|that's not even a...|  1.0|[that's, not, eve...|[0.02557325368564...|
|I loved the Da Vi...|  1.0|[i, loved, the, d...|[-0.1331588972492...|
|i thought da vinc...|  1.0|[i, thought, da, ...|[-0.4282118374566...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|[-0.3809949652188...|
|I thought the Da ...|  1.0|[i, thought, the,...|[-0.3003517504442...|
|The Da Vinci Code...|  1.0|[the, da, vinci, ...|[-0.3208075933424...|
|The D

### 构建模型数据

In [40]:
model_text_df=result.select(['Vec','Label'])

In [41]:
df_assembler = VectorAssembler(inputCols=['Vec'],outputCol='features_vec')
model_text_df = df_assembler.transform(model_text_df)

In [42]:
model_text_df.printSchema()

root
 |-- Vec: vector (nullable = true)
 |-- Label: float (nullable = true)
 |-- features_vec: vector (nullable = true)



### 建模预测

In [43]:
training_df,test_df=model_text_df.randomSplit([0.75,0.25])
log_reg=LogisticRegression(featuresCol='features_vec',labelCol='Label').fit(training_df)
results=log_reg.evaluate(test_df).predictions

In [44]:
accuracy=MulticlassClassificationEvaluator(labelCol='Label',metricName='accuracy').evaluate(results)

In [45]:
accuracy

0.9440885264997088

## 调参

In [50]:
def pipeline(i,model,data):
    word2Vec_2 = Word2Vec(vectorSize=i, minCount=0, inputCol="tokens", outputCol="result")
    result = word2Vec_2.fit(data).transform(data)
    model_text_df=result.select(['result','Label'])
    df_assembler = VectorAssembler(inputCols=['result'],outputCol='features_vec')
    model_text_df = df_assembler.transform(model_text_df)
    training_df,test_df=model_text_df.randomSplit([0.75,0.25])
    log_reg=model(featuresCol='features_vec',labelCol='Label').fit(training_df)
    results=log_reg.evaluate(test_df).predictions
    accuracy=MulticlassClassificationEvaluator(labelCol='Label',metricName='accuracy').evaluate(results)
    print(i,accuracy)

In [51]:
for i in range(20,101,10):
    pipeline(i,LogisticRegression,tokenized_df)

20 0.9556926528323051
30 0.952755905511811
40 0.9634626194491287
50 0.9592439456585942
60 0.9554540262707024
70 0.953016241299304
80 0.968271954674221
90 0.9595278246205734
100 0.9607508532423208


In [49]:
for i in range(20,101,10):
    word2Vec_2 = Word2Vec(vectorSize=i, minCount=0, inputCol="tokens", outputCol="result")
    result = word2Vec_2.fit(tokenized_df).transform(tokenized_df)
    model_text_df=result.select(['result','Label'])
    df_assembler = VectorAssembler(inputCols=['result'],outputCol='features_vec')
    model_text_df = df_assembler.transform(model_text_df)
    training_df,test_df=model_text_df.randomSplit([0.75,0.25])
    log_reg=LogisticRegression(featuresCol='features_vec',labelCol='Label').fit(training_df)
    results=log_reg.evaluate(test_df).predictions
    accuracy=MulticlassClassificationEvaluator(labelCol='Label',metricName='accuracy').evaluate(results)
    print(i,accuracy)

20 0.9552322327923894
30 0.9589595375722544
40 0.96
50 0.9654566744730679
60 0.961560203504805
70 0.9585253456221198
80 0.964712578258395
90 0.9631171921475312
100 0.9514779698828778


### 停用词

In [57]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')
refined_df=stopword_removal.transform(tokenized_df)

In [60]:
def pipeline2(i,model,data):
    word2Vec_2 = Word2Vec(vectorSize=i, minCount=0, inputCol="refined_tokens", outputCol="result")
    result = word2Vec_2.fit(data).transform(data)
    model_text_df=result.select(['result','Label'])
    df_assembler = VectorAssembler(inputCols=['result'],outputCol='features_vec')
    model_text_df = df_assembler.transform(model_text_df)
    training_df,test_df=model_text_df.randomSplit([0.75,0.25])
    log_reg=model(featuresCol='features_vec',labelCol='Label').fit(training_df)
    results=log_reg.evaluate(test_df).predictions
    accuracy=MulticlassClassificationEvaluator(labelCol='Label',metricName='accuracy').evaluate(results)
    print(i,accuracy)

In [65]:
for i in range(20,101,10):
    pipeline(i,LogisticRegression,refined_df)

20 0.9464594127806563
30 0.9586636466591166
40 0.9636982416335791
50 0.9536878216123499
60 0.9547255234861347
70 0.9580941446613088
80 0.952018724400234
90 0.95475910693302
100 0.9651094027202839
