# Creating word vectors using word2vec: implementation in Spark MLlib

## PREPARE DATA: Download data and create input output directories

### Here we show how to use Spark MLlib Word2Vec for generating word-features 
#### The data being used is the attack comments text data

This notebook will take take about 1-2 mins to finish on a Python 3 Spark kernel on a DSVM with Spark

MLlib Word2Vec: https://spark.apache.org/docs/2.2.0/mllib-feature-extraction.html#word2vec

## Set directory path for input data 
#### Input data is downloaded locally to a DSVM

In [1]:
# 1. Location of training data on 
text_file = "/home/remoteuser/notebooks/Strata2018/text_classification/text_data/attack_data.csv"
import os
from pathlib import Path
my_file = Path(text_file)

download_file = 1
if my_file.exists():
    download_file = 0

if download_file == 1:
    !wget https://activelearning.blob.core.windows.net/activelearningdemo/text_data.zip
    !unzip text_data.zip

## Set spark context and import necessary libraries

In [2]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession, DataFrame, SparkSession
from pyspark.sql.functions import UserDefinedFunction, regexp_replace, trim, col, lower, lit, udf, monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.feature import Word2Vec, Word2VecModel, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.ml.linalg import Vectors

import numpy as np
import datetime
import pandas as pd

sqlContext = SQLContext(sc)

## Data ingestion: Read in attack text data from .csv file

In [3]:
## READ IN DATA AND CREATE SPARK DATAFRAME FROM A CSV & AND MATERIALIZE IN MEMORY
text_df = spark.read.csv(path=text_file, header=True, inferSchema=True, sep=",")
text_df.cache();

## REGISTER DATA-FRAME AS A TEMP-TABLE IN SQL-CONTEXT
text_df.createOrReplaceTempView("text_table")

# COUNT NUMBER OF ROWS IN DATA FRAME
text_df.count()

115864

In [4]:
# SELECT ONE ROW FROM THE SPARK DATAFRAME
text_df.head(1)

[Row(rev_id=37675, comment="`-NEWLINE_TOKENThis is not ``creative``.  Those are the dictionary definitions of the terms ``insurance`` and ``ensurance`` as properly applied to ``destruction``.  If you don't understand that, fine, legitimate criticism, I'll write up ``three man cell`` and ``bounty hunter`` and then it will be easy to understand why ``ensured`` and ``insured`` are different - and why both differ from ``assured``.NEWLINE_TOKENNEWLINE_TOKENThe sentence you quote is absolutely neutral.  You just aren't familiar with the underlying theory of strike-back (e.g. submarines as employed in nuclear warfare) guiding the insurance, nor likely the three man cell structure that kept the IRA from being broken by the British.  If that's my fault, fine, I can fix that to explain.  But ther'es nothing ``personal`` or ``creative`` about it.NEWLINE_TOKENNEWLINE_TOKENI'm tired of arguing with you.  Re: the other article, ``multi-party`` turns up plenty, and there is more use of ``mutually`` t

In [5]:
### SELECT A FEW COLUMNS BASED ON WHICH WE FILTER
sqlStatement = """ SELECT distinct ns, split 
            FROM text_table 
            where split in ('train', 'test', 'dev') 
            order by ns, split """
spark.sql(sqlStatement).show()

+-------+-----+
|     ns|split|
+-------+-----+
|article|  dev|
|article| test|
|article|train|
+-------+-----+



## Select and filter data set (only training articles)

In [6]:
### SELCT ONLY REV_ID AND COMMENT FIELDS, AND FILTER FOR TRAINING DATA AND ARTILES ONLY
sqlStatement = """ SELECT rev_id, comment 
            FROM text_table 
            where ns = 'article' and split = 'train' """
text_filtered_df = spark.sql(sqlStatement)

## CACHE NEW DATAFRAME IN MEMORY AND CREATE TEMPORARY TABLE
text_filtered_df.cache(); 
text_filtered_df.createOrReplaceTempView("text_filtered_table")

## COUNT NUMBER OF ROWS IN DATAFRAME AFTER FILTERING
text_filtered_df.count()

31253

## Lowercase COMMENT and remove some irrelevant words and punctuations

In [7]:
text_filtered_df2 = text_filtered_df.withColumn("comment1", lower(col("comment"))).\
    withColumn("comment2", regexp_replace("comment1", '-newline_token', "")).\
    withColumn("comment3", regexp_replace("comment2", 'newline_token', "")).\
    withColumn("comment_final", regexp_replace("comment3", '[^\w-_ ]', "")).\
    select('rev_id', 'comment_final')
    

# SELECT ONE ROW OF DATAFRAME AFTER 
text_filtered_df2.head(1)

[Row(rev_id=37675, comment_final='this is not creative  those are the dictionary definitions of the terms insurance and ensurance as properly applied to destruction  if you dont understand that fine legitimate criticism ill write up three man cell and bounty hunter and then it will be easy to understand why ensured and insured are different - and why both differ from assuredthe sentence you quote is absolutely neutral  you just arent familiar with the underlying theory of strike-back eg submarines as employed in nuclear warfare guiding the insurance nor likely the three man cell structure that kept the ira from being broken by the british  if thats my fault fine i can fix that to explain  but theres nothing personal or creative about itim tired of arguing with you  re the other article multi-party turns up plenty and there is more use of mutually than mutual  if i were to apply your standard id be moving mutual assured destruction to talk for not appealing to a reagan voters biases abo

## Tokenize COMMENT and remove stopwords

In [8]:
## DEFINE TOKENIZER AND STOPWORD REMOVER
tokenizer = Tokenizer(inputCol="comment_final", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtWords")

## TRANSFORM DATASET TO TOKENIZE AND REMOVE STOPWORDS
text_filtered_df3 = tokenizer.transform(text_filtered_df2)
text_filtered_df4 = remover.transform(text_filtered_df3)

## MATERIALIZE DATAFRAME IN MEMORY
text_filtered_df4.cache(); text_filtered_df4.count();
text_filtered_df4.show(5)

+------+--------------------+--------------------+--------------------+
|rev_id|       comment_final|               words|           filtWords|
+------+--------------------+--------------------+--------------------+
| 37675|this is not creat...|[this, is, not, c...|[creative, , dict...|
| 44816| the term standar...|[, the, term, sta...|[, term, standard...|
| 49851|true or false the...|[true, or, false,...|[true, false, sit...|
| 93890|this page will ne...|[this, page, will...|[page, need, disa...|
|103624|i removed the fol...|[i, removed, the,...|[removed, followi...|
+------+--------------------+--------------------+--------------------+
only showing top 5 rows



## DEFINE AND RUN WORD2VEC ON COMMENTS

MLlib Word2Vec parameters: https://spark.apache.org/docs/2.2.0/api/scala/index.html#org.apache.spark.mllib.feature.Word2Vec

In [9]:
model = None
window_size = 5
vector_size = 50
min_count = 5

## DEFINE WORD2VEC TRANSFORMER
word2Vec = Word2Vec(windowSize = window_size, vectorSize = vector_size, minCount = min_count, inputCol="filtWords", outputCol="result")

## FIT TRANSFORMER TO GENERATE FEATURES
model = word2Vec.fit(text_filtered_df4)

## Examine some words, and other words close to them from these feature neighborhood

In [10]:
model.findSynonyms("good", 10).select("word").head(3)

[Row(word='wikipediaassume'), Row(word='bad'), Row(word='assuming')]

## Examine how the vector features look like

In [11]:
word2vec_features = model.getVectors().select("*")
word2vec_features.head(1)

[Row(word='quotient', vector=DenseVector([0.0241, 0.0267, 0.0303, 0.0178, 0.0267, -0.0065, 0.0194, -0.015, -0.0127, 0.0356, -0.0137, -0.0397, -0.0015, -0.0369, 0.0073, 0.0071, 0.0129, -0.0115, -0.0495, 0.0116, 0.0009, 0.0001, -0.0196, 0.0128, 0.001, -0.0275, -0.0035, -0.0336, -0.0086, -0.019, 0.0171, 0.0197, 0.0243, -0.0021, -0.0106, 0.0371, -0.0165, -0.0146, -0.0174, -0.0169, -0.0242, 0.0485, 0.023, 0.0443, -0.0264, -0.023, -0.0117, 0.0207, -0.0054, -0.0321]))]

## Convert Spark DF to Pandas DF for output into a CSV file

In [12]:
word2vec_features_pdf = word2vec_features.toPandas()
word2vec_features_pdf.head(3)

Unnamed: 0,word,vector
0,quotient,"[0.0241195745766, 0.0266819447279, 0.030338654..."
1,incident,"[-0.0359971560538, -0.0717884227633, -0.019297..."
2,serious,"[-0.119405440986, 0.139066547155, -0.005218368..."


## Get comment-level vectors from word-level vectors (averaging)

In [13]:
comment_vectors_df = model.transform(text_filtered_df4).select('rev_id','result')
comment_vectors_df.head(2)

[Row(rev_id=37675, result=DenseVector([-0.0716, 0.0201, -0.0047, -0.0101, 0.0163, 0.0353, -0.0321, 0.0066, 0.0358, -0.0455, 0.0365, 0.0258, 0.0202, -0.0069, -0.0129, -0.0126, 0.0236, -0.0205, 0.0011, -0.0335, -0.0634, -0.0498, -0.0149, 0.0159, 0.032, -0.0405, -0.0664, -0.0454, 0.0159, -0.0056, 0.0004, -0.0303, -0.0111, -0.0887, 0.0152, 0.019, -0.0322, 0.0274, -0.011, 0.0521, 0.0024, -0.0014, 0.0465, 0.0481, -0.0293, 0.0144, 0.0154, -0.0254, -0.0259, -0.0688])),
 Row(rev_id=44816, result=DenseVector([-0.0683, 0.01, 0.0187, -0.0095, 0.036, 0.0405, -0.0142, -0.0076, 0.0356, -0.0509, 0.0273, 0.048, -0.0024, 0.0093, 0.0008, 0.0045, 0.0187, -0.012, -0.0089, -0.0321, -0.0453, -0.0645, -0.0229, -0.0065, 0.0432, -0.0698, -0.0519, -0.0344, -0.0015, 0.016, -0.004, -0.0377, -0.0139, -0.1092, 0.0075, 0.0397, -0.0181, 0.0025, -0.0269, 0.0353, 0.0087, -0.0119, 0.0386, 0.0674, -0.0193, 0.0199, -0.0117, -0.0308, -0.0277, -0.0986]))]

## SAVE FEATURES in CSV file for subsequent steps

In [14]:
word2vec_features_pdf.to_csv("/home/remoteuser/notebooks/Strata2018/text_classification/Word2Vec-Features.csv")