## Import modules and create spark session

In [1]:
#import modules
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

# Create the Spark context and session used by the rest of the notebook.
appName = "Sentiment Analysis in Spark"
# Pass the variable, not the literal string "appName", so the configured
# application name matches the one given to the session builder below.
conf = SparkConf().setAppName(appName)

# Hand the configuration to the context; without it `conf` is never applied.
sc = SparkContext.getOrCreate(conf)
# "spark.some.config.option" is a placeholder key/value kept from the original cell.
spark = SparkSession \
    .builder \
    .appName(appName) \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

23/01/19 19:30:55 WARN Utils: Your hostname, mecha resolves to a loopback address: 127.0.1.1; using 192.168.1.161 instead (on interface wlp8s0)
23/01/19 19:30:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/19 19:30:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read data file into Spark dataFrame

In [2]:
# Read the CSV file into a DataFrame using an explicit schema (nothing is
# inferred). The file has no header row, so column names come from the schema.
schema = StructType([
    StructField("Target", IntegerType(), True),
    # Tweet IDs can exceed the signed 32-bit range, which IntegerType cannot
    # represent; LongType holds the full 64-bit value.
    StructField("ID", LongType(), True),
    StructField("Date", StringType(), True),
    StructField("Query", StringType(), True),
    StructField("User", StringType(), True),
    StructField("Text", StringType(), True)])

tweets_csv = spark.read.csv('dataset/tweets.csv', schema=schema, header=False)
# Preview the first three rows without truncating long tweets.
tweets_csv.show(truncate=False, n=3)

+------+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|Target|ID        |Date                        |Query   |User           |Text                                                                                                               |
+------+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|0     |1467810369|Mon Apr 06 22:19:45 PDT 2009|NO_QUERY|_TheSpecialOne_|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|0     |1467810672|Mon Apr 06 22:19:49 PDT 2009|NO_QUERY|scotthamilton  |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |
|0     |1467810917|Mon Apr 06 22:19:53 PDT 2009|NO

                                                                                

## Select the related data

In [3]:
# Keep only the tweet text and its sentiment target.
# "Target" is exposed under the name "label"; no cast is applied here
# because the schema already declares it as an integer.
data = tweets_csv.select(
    col("Text"),
    col("Target").alias("label"),
)
data.show(n=5, truncate=False)

+-------------------------------------------------------------------------------------------------------------------+-----+
|Text                                                                                                               |label|
+-------------------------------------------------------------------------------------------------------------------+-----+
|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|0    |
|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |0    |
|@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                          |0    |
|my whole body feels itchy and like its on fire                                                                     |0    |
|@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.     |0    |
+-------

## Trim White Space 

In [4]:
from pyspark.sql.functions import trim
# Replace "Text" with a copy that has the spaces at both ends removed.
data = data.withColumn("Text", trim(data["Text"]))
data.show(n=5, truncate=False)

+-------------------------------------------------------------------------------------------------------------------+-----+
|Text                                                                                                               |label|
+-------------------------------------------------------------------------------------------------------------------+-----+
|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|0    |
|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |0    |
|@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                          |0    |
|my whole body feels itchy and like its on fire                                                                     |0    |
|@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.     |0    |
+-------

## Divide data into training and testing data

In [5]:
# Divide the data: 70% for training, 30% for testing.
# A fixed seed makes the split (and therefore every downstream count and
# accuracy figure) reproducible across kernel restarts.
dividedData = data.randomSplit([0.7, 0.3], seed=42)
trainingData = dividedData[0]  # index 0 = training split
testingData = dividedData[1]   # index 1 = testing split
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)



Training data rows: 1119940 ; Testing data rows: 480060


                                                                                

## Prepare training data

Separate the "Text" column into individual words using a tokenizer

In [6]:
# Split each tweet in "Text" into a list of tokens stored in "SentimentWords".
# The same tokenizer object is reused later for the testing data.
tokenizer = Tokenizer(
    outputCol="SentimentWords",
    inputCol="Text",
)
tokenizedTrain = tokenizer.transform(trainingData)
tokenizedTrain.show(n=5, truncate=False)

[Stage 9:>                                                          (0 + 1) / 1]

+---------------------------------------------------------------------------------------------------------------------------+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+
|Text                                                                                                                       |label|SentimentWords                                                                                                                                     |
+---------------------------------------------------------------------------------------------------------------------------+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+
|!! @JordanisCreativ ...nice seeing you out! too bad we didn't get to chat!                                                 |0    |[!!, @jordaniscreativ, ...nic

                                                                                

Remove stop words (common words that are not useful as features)

In [7]:
# Filter stop words out of the tokenizer's output column; the remaining
# tokens are written to "MeaningfulWords".
swr = StopWordsRemover(
    outputCol="MeaningfulWords",
    inputCol=tokenizer.getOutputCol(),
)
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(n=5, truncate=False)

[Stage 10:>                                                         (0 + 1) / 1]

+---------------------------------------------------------------------------------------------------------------------------+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+
|Text                                                                                                                       |label|SentimentWords                                                                                                                                     |MeaningfulWords                                                                                                  |
+---------------------------------------------------------------------------------------------------------------------------+-----+---------------------------------------------------------------------------------

                                                                                

Convert the word features into numerical features. In Spark 2.2.1, this is implemented by the HashingTF function using Austin Appleby's MurmurHash 3 algorithm.

In [8]:
# Hash the stop-word-free tokens into a term-frequency vector column named
# "features", then keep only the columns needed for training.
hashTF = HashingTF(outputCol="features", inputCol=swr.getOutputCol())
hashedTrain = hashTF.transform(SwRemovedTrain)
numericTrainData = hashedTrain.select('label', 'MeaningfulWords', 'features')
numericTrainData.show(n=3, truncate=False)

[Stage 11:>                                                         (0 + 1) / 1]

+-----+-----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|MeaningfulWords                                                                                                  |features                                                                                                                                                                     |
+-----+-----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[!!, @jordaniscreativ, ...nice, seeing, out!, bad, get, chat!]                                           

                                                                                

## Train our classifier model using training data

In [9]:
# Configure a logistic-regression estimator that reads "features" and
# "label", then fit it on the prepared training data.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.01,
    maxIter=10,
)
model = lr.fit(numericTrainData)
print("Training is done!")

                                                                                

23/01/19 19:31:37 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


                                                                                

23/01/19 19:31:59 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/01/19 19:31:59 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

Training is done!


## Save the Model

In [10]:
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running on another machine.
basePath = "/home/mecha/Documents/ml_models/sentiment_analyzer"
# Use the writer with overwrite() so re-running the notebook replaces the
# previously saved model instead of failing because the path already exists.
model.write().overwrite().save(basePath + "/modeliter11Demo")


23/01/19 19:32:51 WARN TaskSetManager: Stage 37 contains a task of very large size (10455 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

## Prepare testing data

In [11]:
# Run the testing split through the same tokenizer, stop-word remover and
# hashing stage that were applied to the training data.
tokenizedTest = tokenizer.transform(testingData)
SwRemovedTest = swr.transform(tokenizedTest)
# Select 'label' with the exact casing the column was created with, so the
# lookup does not depend on Spark's case-insensitive column resolution.
numericTest = hashTF.transform(SwRemovedTest).select(
    'label', 'MeaningfulWords', 'features')
numericTest.show(truncate=False, n=2)


[Stage 40:>                                                         (0 + 1) / 1]

+-----+---------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
|Label|MeaningfulWords                                                                                                |features                                                                                                              |
+-----+---------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
|0    |[!@#$, tomorrow's, monday]                                                                                     |(262144,[194004,222966,229020],[1.0,1.0,1.0])                                                                         |
|0    |[#, nyc, celebrity, street, vendors, 

                                                                                

## Predict on the testing data and calculate the model accuracy

In [12]:
# Score the testing data with the trained model and keep the columns needed
# to compare each prediction with its true label.
prediction = model.transform(numericTest)
predictionFinal = prediction.select("MeaningfulWords", "prediction", "Label")
predictionFinal.show(truncate=False, n=4)

# Accuracy = rows whose predicted class equals the label / all scored rows.
isCorrect = predictionFinal['prediction'] == predictionFinal['Label']
correctPrediction = predictionFinal.filter(isCorrect).count()
totalData = predictionFinal.count()
accuracy = correctPrediction / totalData
print("correct prediction:", correctPrediction, ", total data:", totalData,
      ", accuracy:", accuracy)

23/01/19 19:32:55 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB


                                                                                

+---------------------------------------------------------------------------------------------------------------+----------+-----+
|MeaningfulWords                                                                                                |prediction|Label|
+---------------------------------------------------------------------------------------------------------------+----------+-----+
|[!@#$, tomorrow's, monday]                                                                                     |0.0       |0    |
|[#, nyc, celebrity, street, vendors, &gt;, #, poa, celebrities, , http://streetvendor.org/media/pdfs/side2.pdf]|4.0       |0    |
|[#3breakupwords, still, love, dumb, ass!]                                                                      |0.0       |0    |
|[#3turnoffwords, &quot;scary, movie, 5&quot;]                                                                  |4.0       |0    |
+----------------------------------------------------------------------------------



correct prediction: 348192 , total data: 480060 , accuracy: 0.7253093363329584


                                                                                