In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF,Tokenizer,CountVectorizer
from pyspark.ml.classification import LogisticRegression

In [3]:
# Start (or reuse) a Spark session running in local mode with the 'local[*]' master.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Twitter Sentiment Analyser')
    .getOrCreate()
)

In [4]:
# Read the cleaned tweets CSV: the first row is treated as a header and
# column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load('./clean_tweet.csv')
)

In [5]:
# Peek at the first five rows to check the columns that were loaded.
df.show(n=5)

+---+--------------------+------+
|_c0|                text|target|
+---+--------------------+------+
|  0|awww that bummer ...|     0|
|  1|is upset that he ...|     0|
|  2|dived many times ...|     0|
|  3|my whole body fee...|     0|
|  4|no it not behavin...|     0|
+---+--------------------+------+
only showing top 5 rows



In [6]:
# Row count of the frame as loaded, before any rows are dropped.
df.count()

1600000

In [7]:
# Discard every row that has a null in any column, then count what is left.
# The name `df` is rebound on purpose: later cells read the cleaned frame from it.
df = df.dropna()
df.count()

1596041

In [8]:
# Split weights for train / validation / test; the fixed seed makes the
# random split repeatable between runs.
split_weights = [0.98, 0.01, 0.01]
train_set, val_set, test_set = df.randomSplit(split_weights, seed=2000)

In [9]:
# Row counts of the three splits, in (train, validation, test) order.
tuple(part.count() for part in (train_set, val_set, test_set))

(1564255, 15795, 15991)

In [11]:
# Stage 1: turn the 'text' column into a 'words' column of tokens.
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='words',
)

In [13]:
# Stage 2: count terms in 'words' into 'termvector', with the vocabulary
# capped at 2**16 (65536) entries.
countvectorizer = CountVectorizer(
    inputCol='words',
    outputCol='termvector',
    vocabSize=2 ** 16,
)

In [14]:
# Stage 3: apply IDF weighting to 'termvector', writing 'features';
# minDocFreq is set to 5.
idf = IDF(
    inputCol='termvector',
    outputCol='features',
    minDocFreq=5,
)

In [18]:
# Stage 4: logistic-regression classifier trained against the 'target' column,
# limited to 100 iterations. The features column is not set here, so the
# estimator's default is used (presumably 'features', matching the IDF output).
lr = LogisticRegression(
    labelCol='target',
    maxIter=100,
)

In [19]:
# Chain the stages in execution order: tokenise -> count terms -> IDF -> classify.
pipeline_stages = [tokenizer, countvectorizer, idf, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [20]:
# Fit every pipeline stage on the training split.
model = pipeline.fit(dataset=train_set)

In [21]:
# Run the fitted pipeline over the validation split to get predictions.
predictions = model.transform(dataset=val_set)

In [23]:
# Validation accuracy: rows whose prediction equals the true target,
# divided by the number of rows in the validation split.
val_hits = predictions.filter(predictions['target'] == predictions['prediction']).count()
val_total = val_set.count()
accuracy = val_hits / float(val_total)

In [25]:
# Report the validation accuracy to four decimal places.
validation_report = "Validation Accuracy=={0:.4f}".format(accuracy)
print(validation_report)

Validation Accuracy==0.7914


In [26]:
# Score the test split with the same fitted pipeline.
final_pred = model.transform(dataset=test_set)

In [27]:
# Test accuracy: rows whose prediction equals the true target,
# divided by the number of rows in the test split.
test_hits = final_pred.filter(final_pred['target'] == final_pred['prediction']).count()
test_total = test_set.count()
final_acc = test_hits / float(test_total)

In [28]:
# Report the test-set accuracy to four decimal places.
test_report = "Final Accuracy on test set=={0:.4f}".format(final_acc)
print(test_report)

Final Accuracy on test set==0.7990
