In [1]:
# Importing Libraries
from pyspark.sql import SQLContext
from pyspark import SparkContext

In [2]:
# Creating a Spark Session
from pyspark.sql import SparkSession

In [3]:
Surya = SparkSession.builder.appName("Text Classification").getOrCreate()

In [4]:
# Loading dataset
TC = spark.read.csv("Corona_NLP_train.csv", header = True, inferSchema = True)

In [5]:
TC.show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|            UserName|          ScreenName|            Location|             TweetAt|       OriginalTweet|Sentiment|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|                3799|               48751|              London|          16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|                3800|               48752|                  UK|          16-03-2020|advice Talk to yo...| Positive|
|                3801|               48753|           Vagabonds|          16-03-2020|Coronavirus Austr...| Positive|
|                3802|               48754|                null|          16-03-2020|My food stock is ...|     null|
|              PLEASE|         don't panic| THERE WILL BE EN...|                null|                null|     null|
|           Stay calm|          stay safe.|                null|

Showing the top 10 rows of the dataset

In [6]:
# To show columns in the Datasets 
TC.columns

['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

In [7]:
TC.count()

68046

In [8]:
TC.toPandas().isnull().sum()

UserName             4
ScreenName       12417
Location         33799
TweetAt          26311
OriginalTweet    26663
Sentiment        39429
dtype: int64

This shows that the data sets contains a lot of null values. We must remove these null values or they can affect the accuracy of our model.

In [10]:
TC = TC.dropna()

In [11]:
TC.toPandas().isnull().sum()

UserName         0
ScreenName       0
Location         0
TweetAt          0
OriginalTweet    0
Sentiment        0
dtype: int64

Here the null values of the data set has been removed. 

In [12]:
TC.count()

22358

In [13]:
TC.show()

+--------+----------+--------------------+----------+--------------------+------------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|         Sentiment|
+--------+----------+--------------------+----------+--------------------+------------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|           Neutral|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...|          Positive|
|    3801|     48753|           Vagabonds|16-03-2020|Coronavirus Austr...|          Positive|
|    3804|     48756|ÜT: 36.319708,-82...|16-03-2020|As news of the re...|          Positive|
|    3805|     48757|35.926541,-78.753267|16-03-2020|"Cashier at groce...|          Positive|
|    3807|     48759|     Atlanta, GA USA|16-03-2020|Due to COVID-19 o...|          Positive|
|    3808|     48760|    BHAVNAGAR,GUJRAT|16-03-2020|For corona preven...|          Negative|
|    3809|     48761|      Makati, Manila|16-03-2020|All mon

Data set after removing the null values 

In [14]:
from pyspark.sql.functions import length

In [15]:
TC = TC.withColumn('Tweet_length', length(TC['OriginalTweet']))

In [17]:
TC.show(5)

+--------+----------+--------------------+----------+--------------------+---------+------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------------------+----------+--------------------+---------+------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|
|    3801|     48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|         131|
|    3804|     48756|ÜT: 36.319708,-82...|16-03-2020|As news of the re...| Positive|         249|
|    3805|     48757|35.926541,-78.753267|16-03-2020|"Cashier at groce...| Positive|         184|
+--------+----------+--------------------+----------+--------------------+---------+------------+
only showing top 5 rows



In [18]:
sentiments = ['Positive','Negative','Neutral','Extremely Positive','Extremely Negative']

In [19]:
AB = TC.filter(TC.Sentiment.isin(sentiments))

In [20]:
AB.select('Sentiment').distinct().show()

+------------------+
|         Sentiment|
+------------------+
|Extremely Negative|
|           Neutral|
|          Positive|
|          Negative|
|Extremely Positive|
+------------------+



In [21]:
AB.select('Sentiment').distinct().count()

5

In [22]:
AB.groupby('Sentiment').count().show()

+------------------+-----+
|         Sentiment|count|
+------------------+-----+
|Extremely Negative| 2889|
|           Neutral| 4128|
|          Positive| 6041|
|          Negative| 5261|
|Extremely Positive| 3491|
+------------------+-----+



Here the data is grouped on the basis of the sentiments of the tweets

In [58]:
AB.groupby('Location').count().show()

+--------------------+-----+
|            Location|count|
+--------------------+-----+
|                 ...|    1|
| Mumbai, Maharashtra|    3|
| Brisbane, Australia|    4|
|West Woofle-Dust ...|    1|
|   St Petersburg, FL|    7|
| All across Michigan|    1|
|     Northumberland |    1|
|     stoke on trent |    1|
|some where around...|    1|
|           Bangalore|   19|
|           Norn Iron|    1|
|Horsham, Pennsylv...|    1|
|       Shimla  India|    1|
|Ferrara, Emilia R...|    1|
|      Luton, England|    1|
|              Heaven|    1|
|       St George, UT|    1|
|Just to the left ...|    1|
|           Worcester|    2|
|      Nellore/Canada|    1|
+--------------------+-----+
only showing top 20 rows



Maximum number of tweets are from Bangalore Location

In [23]:
AB.show(10)

+--------+----------+--------------------+----------+--------------------+------------------+------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|         Sentiment|Tweet_length|
+--------+----------+--------------------+----------+--------------------+------------------+------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|           Neutral|         111|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...|          Positive|         237|
|    3801|     48753|           Vagabonds|16-03-2020|Coronavirus Austr...|          Positive|         131|
|    3804|     48756|ÜT: 36.319708,-82...|16-03-2020|As news of the re...|          Positive|         249|
|    3805|     48757|35.926541,-78.753267|16-03-2020|"Cashier at groce...|          Positive|         184|
|    3807|     48759|     Atlanta, GA USA|16-03-2020|Due to COVID-19 o...|          Positive|         280|
|    3808|     48760|    BHAVNAGAR,GU

In [24]:
print((AB.count(),len(AB.columns)))

(21810, 7)


In [25]:
# Load packages
import pyspark.ml.feature

In [26]:
dir(pyspark.ml.feature)

['Binarizer',
 'BucketedRandomProjectionLSH',
 'BucketedRandomProjectionLSHModel',
 'Bucketizer',
 'ChiSqSelector',
 'ChiSqSelectorModel',
 'CountVectorizer',
 'CountVectorizerModel',
 'DCT',
 'ElementwiseProduct',
 'FeatureHasher',
 'HasAggregationDepth',
 'HasBlockSize',
 'HasCheckpointInterval',
 'HasCollectSubModels',
 'HasDistanceMeasure',
 'HasElasticNetParam',
 'HasFeaturesCol',
 'HasFitIntercept',
 'HasHandleInvalid',
 'HasInputCol',
 'HasInputCols',
 'HasLabelCol',
 'HasLoss',
 'HasMaxIter',
 'HasNumFeatures',
 'HasOutputCol',
 'HasOutputCols',
 'HasParallelism',
 'HasPredictionCol',
 'HasProbabilityCol',
 'HasRawPredictionCol',
 'HasRegParam',
 'HasRelativeError',
 'HasSeed',
 'HasSolver',
 'HasStandardization',
 'HasStepSize',
 'HasThreshold',
 'HasThresholds',
 'HasTol',
 'HasValidationIndicatorCol',
 'HasVarianceCol',
 'HasWeightCol',
 'HashingTF',
 'IDF',
 'IDFModel',
 'Imputer',
 'ImputerModel',
 'IndexToString',
 'Interaction',
 'JavaEstimator',
 'JavaMLReadable',
 'Jav

In [29]:
# Importing Libraries 
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [48]:
# Stages of Pipeline
tokenizer = Tokenizer(inputCol='OriginalTweet', outputCol='token_text')
stopwords_remover = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
vectorizer = CountVectorizer(inputCol='stop_token', outputCol='RawTweets')
idf = IDF(inputCol='RawTweets', outputCol='Tweets')

#Converting Labels to Numeric
labeltonum=StringIndexer(inputCol="Sentiment", outputCol="label")

In [49]:
CD = VectorAssembler(inputCols=["Tweets","Tweet_length"], outputCol="features")

# Model

In [50]:
from pyspark.ml.feature import NGram
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, DecisionTreeClassifier

nb=NaiveBayes()
rf=RandomForestClassifier(numTrees=100)
dtc=DecisionTreeClassifier(maxDepth=15)

from pyspark.ml import Pipeline

In [59]:
pipeline = Pipeline(stages=[labeltonum, tokenizer, stopwords_remover, vectorizer, idf, CD])

In [60]:
pipeline

Pipeline_43a3881c98d7

In [61]:
pipeline.stages

Param(parent='Pipeline_43a3881c98d7', name='stages', doc='a list of pipeline stages')

In [62]:
EF = pipeline.fit(AB)

In [63]:
GH = EF.transform(AB)

In [64]:
GH.show(5)

+--------+----------+--------------------+----------+--------------------+---------+------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|label|          token_text|          stop_token|           RawTweets|              Tweets|            features|
+--------+----------+--------------------+----------+--------------------+---------+------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|  2.0|[@menyrbie, @phil...|[@menyrbie, @phil...|(66313,[14329,347...|(66313,[14329,347...|(66314,[14329,347...|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|  0.0|[advice, talk, to...|[advice, talk, ne...|(66313,[14,15,133..

In [65]:
GH = GH.select(['label', 'features'])

In [66]:
GH.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  2.0|(66314,[14329,347...|
|  0.0|(66314,[14,15,133...|
|  0.0|(66314,[7,15,35,7...|
|  0.0|(66314,[7,8,31,47...|
|  0.0|(66314,[4,6,18,60...|
|  0.0|(66314,[1,6,7,14,...|
|  1.0|(66314,[11,14,15,...|
|  2.0|(66314,[51,73,151...|
|  3.0|(66314,[14,15,24,...|
|  0.0|(66314,[3,7,22,39...|
|  4.0|(66314,[1,3,9,11,...|
|  1.0|(66314,[3,21,44,7...|
|  3.0|(66314,[10,35,50,...|
|  1.0|(66314,[3,7,22,32...|
|  4.0|(66314,[1,8,11,37...|
|  2.0|(66314,[5,47,51,6...|
|  0.0|(66314,[7,12,24,2...|
|  0.0|(66314,[5,19,20,4...|
|  4.0|(66314,[0,3,12,21...|
|  2.0|(66314,[0,8,14,15...|
+-----+--------------------+
only showing top 20 rows



# ML Training 

In [72]:
# Spliting Dataset into Train and Test Data
(traintc, testtc) = GH.randomSplit((0.7, 0.3))

In [73]:
PredictNB = nb.fit(traintc)

In [74]:
PredictRF= rf.fit(traintc)

# Testing the Model

In [75]:
NB_results = PredictNB.transform(testtc)

In [76]:
RF_results = PredictRF.transform(testtc)

In [77]:
NB_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(66314,[0,1,2,8,4...|[-1266.1194558574...|[1.62972945933352...|       4.0|
|  0.0|(66314,[0,1,2,12,...|[-1131.7293835562...|[0.05141633318800...|       3.0|
|  0.0|(66314,[0,1,2,13,...|[-1693.7115663206...|[7.59525903638559...|       3.0|
|  0.0|(66314,[0,1,2,16,...|[-1180.1247174083...|[0.99993722733317...|       0.0|
|  0.0|(66314,[0,1,2,25,...|[-2008.1332073200...|[1.0,6.3180101352...|       0.0|
|  0.0|(66314,[0,1,3,4,1...|[-1389.6572124391...|[4.61765690218920...|       2.0|
|  0.0|(66314,[0,1,3,5,1...|[-818.96975489271...|[0.33814644066027...|       3.0|
|  0.0|(66314,[0,1,3,10,...|[-1909.5191209775...|[1.95112279013779...|       1.0|
|  0.0|(66314,[0,1,3,31,...|[-1517.8262106313...|[5.00956386455401...|       1.0|
|  0.0|(66314,[0

In [78]:
RF_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(66314,[0,1,2,8,4...|[27.7160621412236...|[0.27716062141223...|       0.0|
|  0.0|(66314,[0,1,2,12,...|[27.8997586358119...|[0.27899758635811...|       0.0|
|  0.0|(66314,[0,1,2,13,...|[27.5742436380678...|[0.27574243638067...|       0.0|
|  0.0|(66314,[0,1,2,16,...|[27.6698150036080...|[0.27669815003608...|       0.0|
|  0.0|(66314,[0,1,2,25,...|[27.9772419567301...|[0.27977241956730...|       0.0|
|  0.0|(66314,[0,1,3,4,1...|[27.6694643212146...|[0.27669464321214...|       0.0|
|  0.0|(66314,[0,1,3,5,1...|[28.1452853515514...|[0.28145285351551...|       0.0|
|  0.0|(66314,[0,1,3,10,...|[27.1912615929773...|[0.27191261592977...|       0.0|
|  0.0|(66314,[0,1,3,31,...|[27.7556923791544...|[0.27755692379154...|       0.0|
|  0.0|(66314,[0

In [79]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [80]:
eva = MulticlassClassificationEvaluator()
acc_NB = eva.evaluate(NB_results)

In [81]:
eva = MulticlassClassificationEvaluator()
acc_RF = eva.evaluate(RF_results)

In [82]:
print("accuracy of the NB and RF is ::", acc_NB, acc_RF)

accuracy of the NB and RF is :: 0.3910757327772677 0.12258262788347707


# Thank You