# Accuracy Test using Logistic Regression

## 1. Install findspark library

In [1]:
pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Note: you may need to restart the kernel to use updated packages.


## 2. Import pyspark along with sql context and findspark

In [13]:
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext

## 3. Create spark context 

In [14]:
try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
    sc = ps.SparkContext('local[1]')
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")



## 4. Load the accuracy_ready_data.csv 

In [22]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('accuracy_ready_data.csv')
type(df)

pyspark.sql.dataframe.DataFrame

In [23]:
df.show(5)

+--------------------+---------------------+----------------+-------------------+
|          News_title|Sentiment(MaxEntropy)|Sentiment(VADER)|Sentiment(SparkNLP)|
+--------------------+---------------------+----------------+-------------------+
|MSMEs call for mo...|                    2|               2|                  0|
|Rethinking touris...|                    2|               2|                  1|
|Why Are We Spendi...|                    2|               2|                  0|
|Why Are We Spendi...|                    2|               2|                  0|
|Issue of film ind...|                    2|               2|                  1|
+--------------------+---------------------+----------------+-------------------+
only showing top 5 rows



## 5. Split the data into 80% train, 10% test and 10% validation

In [24]:
(train_set, val_set, test_set) = df.randomSplit([0.80, 0.10, 0.10], seed = 200)

## 6. Implementing pipeline and logistic regression for accuracy

### 1) VADER

In [25]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

#tokenizer each word in the text
tokenizer = Tokenizer(inputCol="News_title", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

#select target sentiment as label data for classification
label_stringIdx = StringIndexer(inputCol = "Sentiment(VADER)", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)# fitting the training data on pipeline
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)


In [26]:
#Applying Logistic Regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(predictions)


0.7802893366972183

### 2) MaxEntropy

In [27]:
tokenizer = Tokenizer(inputCol="News_title", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

#select target sentiment as label data for classification
label_stringIdx = StringIndexer(inputCol = "Sentiment(MaxEntropy)", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)# fitting the training data on pipeline
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)


In [28]:
#Applying Logistic Regression
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(predictions)


0.7056808449900085

### 3) Spark NLP

In [29]:
tokenizer = Tokenizer(inputCol="News_title", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

#select target sentiment as label data for classification
label_stringIdx = StringIndexer(inputCol = "Sentiment(SparkNLP)", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)# fitting the training data on pipeline
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)


In [30]:
#Applying Logistic Regression
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(predictions)


0.8434440490864795