# Twitter Sentiment Classifier

In [1]:
import org.apache.spark.sql.types._

// Column layout of the CSV: two integer columns followed by the raw tweet text.
// Every field is declared non-nullable here, yet the `data.schema` output recorded
// further down reports all three columns as nullable on the loaded DataFrame.
val schema = StructType(Seq(
    StructField("ItemID", IntegerType, nullable = false),
    StructField("Sentiment", IntegerType, nullable = false),
    StructField("SentimentText", StringType, nullable = false)
))

// Load the tweets with the explicit schema; the first line of the file is a header row.
val data = spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load("twitter_sentiment_data.csv")

Intitializing Scala interpreter ...

Spark Web UI available at http://0d5d40a6a37e:4040
SparkContext available as 'sc' (version = 3.2.0, master = local[*], app id = local-1636943122570)
SparkSession available as 'spark'


import org.apache.spark.sql.types._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(ItemID,IntegerType,false), StructField(Sentiment,IntegerType,false), StructField(SentimentText,StringType,false))
data: org.apache.spark.sql.DataFrame = [ItemID: int, Sentiment: int ... 1 more field]


In [2]:
// Preview the loaded tweets; the recorded output below lists the top 20 rows.
data.show()

+------+---------+--------------------+
|ItemID|Sentiment|       SentimentText|
+------+---------+--------------------+
|     1|        0|                 ...|
|     2|        0|                 ...|
|     3|        1|              omg...|
|     4|        0|          .. Omga...|
|     5|        0|         i think ...|
|     6|        0|         or i jus...|
|     7|        1|       Juuuuuuuuu...|
|     8|        0|       Sunny Agai...|
|     9|        1|      handed in m...|
|    10|        1|      hmmmm.... i...|
|    11|        0|      I must thin...|
|    12|        1|      thanks to a...|
|    13|        0|      this weeken...|
|    14|        0|     jb isnt show...|
|    15|        0|     ok thats it ...|
|    16|        0|    &lt;-------- ...|
|    17|        0|    awhhe man.......|
|    18|        1|    Feeling stran...|
|    19|        0|    HUGE roll of ...|
|    20|        0|    I just cut my...|
+------+---------+--------------------+
only showing top 20 rows



In [3]:
// Inspect the schema actually attached to the loaded DataFrame
// (compare its nullability flags with the schema declared when reading the CSV).
data.schema

res1: org.apache.spark.sql.types.StructType = StructType(StructField(ItemID,IntegerType,true), StructField(Sentiment,IntegerType,true), StructField(SentimentText,StringType,true))


### Transform Tweets

In [4]:
import org.apache.spark.sql.functions.udf

// Normalises a tweet: every run of a repeated character is collapsed to a single
// occurrence, then the text is trimmed and lower-cased. This folds elongated words
// together ("Juuuust" -> "just") but also shortens legitimate double letters
// ("missed" -> "mised"), as the recorded output below shows.
// SentimentText is nullable in the loaded DataFrame, so a null tweet is returned
// as null instead of throwing a NullPointerException inside the UDF.
val dropRepetitive = udf { str: String =>
    Option(str)
        .map(_.replaceAll("(.)\\1+", "$1").trim.toLowerCase())
        .orNull
}

// Keep the original text and add the normalised form as a new "Collapsed" column.
val noRepetitiveCharsData = data.withColumn("Collapsed", dropRepetitive(data("SentimentText")))

import org.apache.spark.sql.functions.udf
dropRepetitive: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$3743/0x000000084156f840@14b98219,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
noRepetitiveCharsData: org.apache.spark.sql.DataFrame = [ItemID: int, Sentiment: int ... 2 more fields]


In [5]:
// Compare the raw tweet text with its collapsed, lower-cased form side by side.
noRepetitiveCharsData.show()

+------+---------+--------------------+--------------------+
|ItemID|Sentiment|       SentimentText|           Collapsed|
+------+---------+--------------------+--------------------+
|     1|        0|                 ...|is so sad for my ...|
|     2|        0|                 ...|i mised the new m...|
|     3|        1|              omg...|omg its already 7...|
|     4|        0|          .. Omga...|. omgaga. im so i...|
|     5|        0|         i think ...|i think mi bf is ...|
|     6|        0|         or i jus...|or i just wory to...|
|     7|        1|       Juuuuuuuuu...|        just chilin!|
|     8|        0|       Sunny Agai...|suny again work t...|
|     9|        1|      handed in m...|handed in my unif...|
|    10|        1|      hmmmm.... i...|hm. i wonder how ...|
|    11|        0|      I must thin...|i must think abou...|
|    12|        1|      thanks to a...|thanks to al the ...|
|    13|        0|      this weeken...|this wekend has s...|
|    14|        0|     j

### Tokenize Tweets

In [6]:
import org.apache.spark.ml.feature.{HashingTF, IDF, RegexTokenizer}

// Feature-extraction stages; these same objects are reused later as pipeline stages.

// Splits the collapsed tweet text on runs of whitespace.
val tokenizer = new RegexTokenizer()
    .setInputCol("Collapsed")
    .setOutputCol("tokens")
    .setPattern("\\s+")

// Hashes each token list into a 200000-dimensional term-frequency vector.
val hashingTF = new HashingTF()
    .setInputCol("tokens")
    .setOutputCol("tf")
    .setNumFeatures(200000)

// Inverse-document-frequency weighting; it is an estimator and has to be fitted first.
val idf = new IDF()
    .setInputCol("tf")
    .setOutputCol("tfidf")

// Apply the two transformers in sequence to get term frequencies.
val tokenized = tokenizer.transform(noRepetitiveCharsData)
val tf = hashingTF.transform(tokenized)

// Fit IDF on those term frequencies, then weight the very same rows with the fitted model.
val idfModel = idf.fit(tf)
val tfidf = idfModel.transform(tf)

import org.apache.spark.ml.feature.{HashingTF, IDF, RegexTokenizer}
tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_e7b20beeb502, minTokenLength=1, gaps=true, pattern=\s+, toLowercase=true
hashingTF: org.apache.spark.ml.feature.HashingTF = HashingTF: uid=hashingTF_3458dd4d0b05, binary=false, numFeatures=200000
idf: org.apache.spark.ml.feature.IDF = idf_86c1af0be925
tokenized: org.apache.spark.sql.DataFrame = [ItemID: int, Sentiment: int ... 3 more fields]
tf: org.apache.spark.sql.DataFrame = [ItemID: int, Sentiment: int ... 4 more fields]
idfModel: org.apache.spark.ml.feature.IDFModel = IDFModel: uid=idf_86c1af0be925, numDocs=100000, numFeatures=200000
tfidf: org.apache.spark.sql.DataFrame = [ItemID: int, Sentiment: int ... 5 more fields]


In [7]:
// Keep only what the classifier needs: the label column and the TF-IDF feature vector.
val datafortraining = tfidf.select("Sentiment", "tfidf")
datafortraining.show()

+---------+--------------------+
|Sentiment|               tfidf|
+---------+--------------------+
|        0|(200000,[36403,48...|
|        0|(200000,[16017,26...|
|        1|(200000,[24159,40...|
|        0|(200000,[3987,761...|
|        0|(200000,[27018,55...|
|        0|(200000,[80028,10...|
|        1|(200000,[100307,1...|
|        0|(200000,[3389,339...|
|        1|(200000,[4338,748...|
|        1|(200000,[10295,15...|
|        0|(200000,[112335,1...|
|        1|(200000,[6817,775...|
|        0|(200000,[24618,30...|
|        0|(200000,[28697,32...|
|        0|(200000,[4338,139...|
|        0|(200000,[6166,160...|
|        0|(200000,[23690,23...|
|        1|(200000,[12314,48...|
|        0|(200000,[26265,37...|
|        0|(200000,[12759,16...|
+---------+--------------------+
only showing top 20 rows



datafortraining: org.apache.spark.sql.DataFrame = [Sentiment: int, tfidf: vector]


### Create Logistic Regression Model

In [8]:
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier, LogisticRegression}

// Logistic regression reading the TF-IDF vectors as features and "Sentiment" as the label;
// all other settings are left at the estimator's defaults.
val lr = new LogisticRegression()
    .setLabelCol("Sentiment")
    .setFeaturesCol("tfidf")

// Fit on the full labelled set (no hold-out split is made in this cell).
val lrModel = lr.fit(datafortraining)

import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier, LogisticRegression}
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_0e6b821fba18
lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid=logreg_0e6b821fba18, numClasses=2, numFeatures=200000


#### Testing on dummy data

In [9]:
// Two hand-written sentences pushed through the same steps as the training data:
// collapse -> tokenize -> term frequencies -> fitted IDF -> fitted logistic regression.
val testData = Seq("That is great", "That is awful").toDF
val testTokenized = tokenizer.transform(
    testData.withColumn("Collapsed", dropRepetitive(testData("value")))
)
val testFeatures = idfModel.transform(hashingTF.transform(testTokenized))
val testLabeled = lrModel.transform(testFeatures)

// Show each sentence next to its predicted label.
testLabeled.select("value", "prediction").show()

+-------------+----------+
|        value|prediction|
+-------------+----------+
|That is great|       1.0|
|That is awful|       0.0|
+-------------+----------+



testData: org.apache.spark.sql.DataFrame = [value: string]
testTokenized: org.apache.spark.sql.DataFrame = [value: string, Collapsed: string ... 1 more field]
testFeatures: org.apache.spark.sql.DataFrame = [value: string, Collapsed: string ... 3 more fields]
testLabeled: org.apache.spark.sql.DataFrame = [value: string, Collapsed: string ... 6 more fields]


### Assembling the Pipeline

In [10]:
import org.apache.spark.ml.Pipeline

// Chain the tokenizer, term-frequency hasher, (unfitted) IDF and logistic regression
// into one estimator, so a single fit/transform call covers every step after collapsing.
val pipe = new Pipeline().setStages(Array(tokenizer, hashingTF, idf, lr))
// Fit all stages end to end on the collapsed tweets.
val model = pipe.fit(noRepetitiveCharsData)

import org.apache.spark.ml.Pipeline
pipe: org.apache.spark.ml.Pipeline = pipeline_5d1a3ad856fb
model: org.apache.spark.ml.PipelineModel = pipeline_5d1a3ad856fb


#### Testing on dummy data

In [11]:
// Negated variants of the earlier dummy sentences. Only the "Collapsed" column has to be
// prepared by hand; the fitted pipeline handles tokenizing, hashing, IDF and prediction.
val testData2 = {
    val sentences = Seq("That is not great", "That is not awful").toDF
    sentences.withColumn("Collapsed", dropRepetitive(sentences("value")))
}
val testLabeled2 = model.transform(testData2)

// In the recorded output the negated sentences receive the same labels as the
// un-negated ones did earlier.
testLabeled2.select("value", "prediction").show()

+-----------------+----------+
|            value|prediction|
+-----------------+----------+
|That is not great|       1.0|
|That is not awful|       0.0|
+-----------------+----------+



testData2: org.apache.spark.sql.DataFrame = [value: string, Collapsed: string]
testLabeled2: org.apache.spark.sql.DataFrame = [value: string, Collapsed: string ... 6 more fields]


### Tune parameters

In [12]:
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

// Search space for cross-validation: 3 convergence tolerances x 3 iteration caps,
// i.e. 9 parameter combinations for the logistic regression stage.
// NOTE(review): only optimiser stopping criteria are searched here; regParam is not
// part of the grid, so it keeps whatever value `lr` already has — confirm this is intended.
val paramGrid = {
    val tolerances = Array(1e-20, 1e-10, 1e-5)
    val iterationCaps = Array(100, 200, 300)
    new ParamGridBuilder()
        .addGrid(lr.tol, tolerances)
        .addGrid(lr.maxIter, iterationCaps)
        .build()
}

import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_0e6b821fba18-maxIter: 100,
	logreg_0e6b821fba18-tol: 1.0E-20
}, {
	logreg_0e6b821fba18-maxIter: 100,
	logreg_0e6b821fba18-tol: 1.0E-10
}, {
	logreg_0e6b821fba18-maxIter: 100,
	logreg_0e6b821fba18-tol: 1.0E-5
}, {
	logreg_0e6b821fba18-maxIter: 200,
	logreg_0e6b821fba18-tol: 1.0E-20
}, {
	logreg_0e6b821fba18-maxIter: 200,
	logreg_0e6b821fba18-tol: 1.0E-10
}, {
	logreg_0e6b821fba18-maxIter: 200,
	logreg_0e6b821fba18-tol: 1.0E-5
}, {
	logreg_0e6b821fba18-maxIter: 300,
	logreg_0e6b821fba18-tol: 1.0E-20
}, {
	logreg_0e6b821fba18-maxIter: 300,
	logreg_0e6b821fba18-tol: 1.0E-10
}, {
	logreg_0e6b821fba18-maxIter: 300,
	logreg_0e6b821fba18-tol: 1.0E-5
})


In [13]:
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// 3-fold cross-validation over `paramGrid`, fitting the whole pipeline for each
// candidate and evaluating up to 2 candidates in parallel.
//
// The evaluator ranks candidates with a ROC-based metric, which needs the model's
// continuous scores. It therefore reads the "rawPrediction" column produced by
// logistic regression; feeding it the hard 0/1 "prediction" column would reduce the
// ROC curve to a single operating point and make candidates hard to tell apart.
val cv = new CrossValidator()
    .setEstimator(pipe)
    .setEvaluator(
        new BinaryClassificationEvaluator()
            .setRawPredictionCol("rawPrediction")
            .setLabelCol("Sentiment")
    )
    .setEstimatorParamMaps(paramGrid)
    .setNumFolds(3)
    .setParallelism(2)

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
cv: org.apache.spark.ml.tuning.CrossValidator = cv_9019dd6e7e13


In [14]:
// Run the cross-validated grid search on the collapsed tweets.
// Note: this rebinds `model`, which until now referred to the plain fitted pipeline.
val model = cv.fit(noRepetitiveCharsData)

model: org.apache.spark.ml.tuning.CrossValidatorModel = CrossValidatorModel: uid=cv_9019dd6e7e13, bestModel=pipeline_5d1a3ad856fb, numFolds=3


In [15]:
// Apply the cross-validated model back to the same DataFrame it was fitted on,
// so these predictions are in-sample rather than a held-out evaluation.
val result = model.transform(noRepetitiveCharsData)

result: org.apache.spark.sql.DataFrame = [ItemID: int, Sentiment: int ... 8 more fields]


In [16]:
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.LogisticRegressionModel

// Report the regularisation parameter of the logistic regression inside the best
// model chosen by cross-validation. The stage is located by type rather than by a
// hard-coded index with unchecked casts, so reordering the pipeline stages cannot
// silently pick the wrong stage or fail with a ClassCastException.
println(model.bestModel match {
    case best: PipelineModel =>
        best.stages
            .collectFirst { case fitted: LogisticRegressionModel => s"RegParam ${fitted.getRegParam}" }
            .getOrElse("RegParam unavailable: bestModel has no LogisticRegressionModel stage")
    case other =>
        s"RegParam unavailable: bestModel is a ${other.getClass.getName}, not a PipelineModel"
})

RegParam 0.0


import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.LogisticRegressionModel
