# Setup

In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u382-ga-1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.sql import SQLContext
from pyspark import SparkContext

Initialize the Spark context and session

In [None]:
# Serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one that is already running.
# Constructing SparkContext directly raises ValueError when this cell is
# executed a second time in the same kernel; getOrCreate makes the cell re-runnable.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

ValueError: ignored

In [None]:
# SQLContext takes a SparkContext as its first argument (and optionally the
# SparkSession to wrap); the original passed the SparkSession in the
# SparkContext position.
sqlContext = SQLContext(spark.sparkContext, spark)



Read in the data

In [None]:
# Load the labelled and unlabelled splits. The first CSV row supplies the
# column names and column types are inferred from the data.
train = spark.read.options(header=True, inferSchema=True).csv('/content/train.csv')
test = spark.read.options(header=True, inferSchema=True).csv('/content/test.csv')

# Prepare the Data

In [None]:
# Column names/types of the training frame, followed by a 10-row preview.
train.printSchema()
train.show(n=10)

root
 |-- ItemID: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)
 |-- SentimentText: string (nullable = true)

+------+---------+--------------------+
|ItemID|Sentiment|       SentimentText|
+------+---------+--------------------+
|     1|        0|                 ...|
|     2|        0|                 ...|
|     3|        1|              omg...|
|     4|        0|          .. Omga...|
|     5|        0|         i think ...|
|     6|        0|         or i jus...|
|     7|        1|       Juuuuuuuuu...|
|     8|        0|       Sunny Agai...|
|     9|        1|      handed in m...|
|    10|        1|      hmmmm.... i...|
+------+---------+--------------------+
only showing top 10 rows



In [None]:
# Print the column names and types of the test DataFrame for comparison with train.
test.printSchema()

root
 |-- ItemID: integer (nullable = true)
 |-- SentimentText: string (nullable = true)



In [None]:
from pyspark.sql.functions import col

# Class balance: number of training rows per Sentiment value, largest group first.
(
    train
    .groupBy("Sentiment")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+---------+-----+
|Sentiment|count|
+---------+-----+
|        1|56462|
|        0|43538|
+---------+-----+



In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
# Tokenizer: split the raw tweet text on non-word characters.
regexTokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="SentimentText",
    outputCol="words",
)

# Stop-word filter with a small custom list (URL fragments, retweet markers, "the").
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=["http", "https", "amp", "rt", "t", "c", "the"],
)

# Hash the filtered tokens into a fixed-width term-frequency vector of 1000 buckets.
hashingTF = HashingTF(numFeatures=1000, inputCol="filtered", outputCol="features")

# Inverse-document-frequency rescaling of the hashed vector.
# NOTE(review): this stage reads "features" and writes "rawfeatures", and it is
# not referenced by the pipeline visible in this notebook chunk; confirm whether
# it was meant to be a pipeline stage (and whether the column names are swapped).
idf = IDF(minDocFreq=5, inputCol="features", outputCol="rawfeatures")

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# Turn the Sentiment column into the numeric "label" column expected by the classifier.
# StringIndexer's default ordering gives index 0.0 to the most frequent value, which
# here mapped Sentiment 1 -> label 0.0 and Sentiment 0 -> label 1.0 (inverted classes).
# Ordering the values alphabetically keeps label equal to Sentiment (0 -> 0.0, 1 -> 1.0).
label_stringIdx = StringIndexer(
    inputCol="Sentiment",
    outputCol="label",
    stringOrderType="alphabetAsc",
)

# Tokenize -> drop stop words -> hash to term frequencies -> attach the label.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, label_stringIdx])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(train)
dataset = pipelineFit.transform(train)
dataset.show(5)

+------+---------+--------------------+--------------------+--------------------+--------------------+-----+
|ItemID|Sentiment|       SentimentText|               words|            filtered|            features|label|
+------+---------+--------------------+--------------------+--------------------+--------------------+-----+
|     1|        0|                 ...|[is, so, sad, for...|[is, so, sad, for...|(1000,[21,209,344...|  1.0|
|     2|        0|                 ...|[i, missed, the, ...|[i, missed, new, ...|(1000,[208,415,58...|  1.0|
|     3|        1|              omg...|[omg, its, alread...|[omg, its, alread...|(1000,[15,86,178,...|  0.0|
|     4|        0|          .. Omga...|[omgaga, im, sooo...|[omgaga, im, sooo...|(1000,[102,162,28...|  1.0|
|     5|        0|         i think ...|[i, think, mi, bf...|[i, think, mi, bf...|(1000,[18,46,209,...|  1.0|
+------+---------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 

In [None]:
# Logistic regression with an L2 penalty only (elasticNetParam=0), capped at 20 iterations.
lr = LogisticRegression(elasticNetParam=0, regParam=0.3, maxIter=20)
lrModel = lr.fit(dataset)

# NOTE(review): the model is scored on the same rows it was fitted on, so this
# output is a sanity check rather than a held-out evaluation.
predictions = lrModel.transform(dataset)

# Show the 100 rows predicted as class 0, ordered by the probability column descending.
(
    predictions
    .where(predictions["prediction"] == 0)
    .select("SentimentText", "Sentiment", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(100, truncate=30)
)

+------------------------------+---------+------------------------------+-----+----------+
|                 SentimentText|Sentiment|                   probability|label|prediction|
+------------------------------+---------+------------------------------+-----+----------+
|@ashleytisdale you are a go...|        1|[0.9387326400548155,0.06126...|  0.0|       0.0|
| homeish lol..hella fun day...|        1|[0.9384767954529726,0.06152...|  0.0|       0.0|
|@aaddiiss hahaha yea you're...|        1|[0.9361693844028244,0.06383...|  0.0|       0.0|
|@11Locs @Cortnee4Christ You...|        1|[0.9344608776462744,0.06553...|  0.0|       0.0|
|@_CrC_ Happy birthday to yo...|        1|[0.9315554002482325,0.06844...|  0.0|       0.0|
|@_jaimemarie Oh, thank you,...|        1|[0.9306019157554524,0.06939...|  0.0|       0.0|
|@AlexAllTimeLow I'd do the ...|        1|[0.9285579230950202,0.07144...|  0.0|       0.0|
|&quot;There are only 2 kind...|        1|[0.9221225158522522,0.07787...|  0.0|       0.0|