In [19]:
from copy import deepcopy

from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, NGram, HashingTF, IDF, CountVectorizer, OneHotEncoder, StringIndexer, Word2Vec
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, StringType
from pyspark.ml.classification import NaiveBayes
#from pyspark.mllib.classification import NaiveBayes#, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

In [2]:


# Explicit names replace `from pyspark.sql.functions import *`, which rebinds
# Python builtins (sum, min, max, round, ...) to Spark column functions.
# count/isnull/when are only needed by the commented-out null check below.
from pyspark.sql.functions import col, count, isnull, regexp_replace, when

# Reuse the SparkSession the kernel already provides, or create one, so this
# cell no longer depends on a pre-defined `spark` variable.
spark = SparkSession.builder.getOrCreate()

# NOTE(review): the recorded model output at the end of this notebook shows
# categoryIndex values such as 161.0 and 20.0, i.e. far more distinct CATEGORY
# strings than the single-letter label seen in the feature table. Presumably
# some rows are mis-split by the CSV reader (quoted titles) - confirm, then
# adjust the reader's quote/escape options or filter CATEGORY.
news_data = spark.read.csv('file:///home/hadoop/data/Case_News_Articles.csv', header=True, inferSchema=True)

#news_data.show()

# Keep only the headline text and its label.
title_category = news_data.select("TITLE", "CATEGORY")

#title_category.show()

# Per-column null counts; informative when run before the dropna() below.
#title_category.select([count(when(isnull(c), c)).alias(c) for c in title_category.columns]).show()

# Discard rows in which either selected column is null.
title_category = title_category.dropna()

#title_category.count()

#title_category.show(truncate=False)

# Most frequent titles first; the recorded output shows boilerplate headlines
# ("Business Highlights", "Posted by ...") dominating this list.
title_category.groupby('Title').count().orderBy(col("count").desc()).show(truncate = False)

# Strip every run of digits from the title. The pattern is a raw string because
# '\d' inside a plain literal is an invalid escape sequence that newer Python
# versions warn about.
title_category = title_category.withColumn("title_str", regexp_replace(col('TITLE'), r'\d+', ''))

#title_category.select("TITLE", "title_str").show(truncate= False)

# Tokenise the digit-free title using the non-word-character pattern.
regex_tokenizer = RegexTokenizer(inputCol="title_str", outputCol="words", pattern="\\W")
raw_words = regex_tokenizer.transform(title_category)

#raw_words.show()

# Remove stop words from the token lists (StopWordsRemover defaults).
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

words_df = remover.transform(raw_words)

words_df.select("words", "filtered_words").show()









+----------------------------------------------------------------------------------+-----+
|Title                                                                             |count|
+----------------------------------------------------------------------------------+-----+
|The article requested cannot be found! Please refresh your browser or go back  ...|145  |
|Business Highlights                                                               |59   |
|Posted by Parvez Jabri                                                            |59   |
|Posted by Imaduddin                                                               |53   |
|Posted by Shoaib-ur-Rehman Siddiqui                                               |52   |
|(click the phrases to see a list)                                                 |51   |
|Business Wire                                                                     |41   |
|PR Newswire                                                                       |38   |

In [3]:
# Print the raw and stop-word-filtered token lists without truncation so the
# full arrays can be compared side by side.
token_columns = ["words", "filtered_words"]
words_df.select(*token_columns).show(truncate=False)


+-------------------------------------------------------------------------------------+-------------------------------------------------------------------------------+
|words                                                                                |filtered_words                                                                 |
+-------------------------------------------------------------------------------------+-------------------------------------------------------------------------------+
|[fed, official, says, weak, data, caused, by, weather, should, not, slow, taper]     |[fed, official, says, weak, data, caused, weather, slow, taper]                |
|[fed, s, charles, plosser, sees, high, bar, for, change, in, pace, of, tapering]     |[fed, charles, plosser, sees, high, bar, change, pace, tapering]               |
|[us, open, stocks, fall, after, fed, official, hints, at, accelerated, tapering]     |[us, open, stocks, fall, fed, official, hints, accelerated, tapering]    

In [4]:
# Print the column names and types of words_df as a sanity check before
# vectorising; the recorded output lists `words` and `filtered_words` as
# arrays of strings.
words_df.printSchema()

root
 |-- TITLE: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- title_str: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [5]:
# Bag-of-words step: configure a CountVectorizer that reads the stop-word-free
# tokens and writes its vectors to the `features` column.
cv = CountVectorizer().setInputCol("filtered_words").setOutputCol("features")

# Fit on the full words_df, then use the fitted model to add the `features`
# column to every row.
model = cv.fit(words_df)
features_data = model.transform(words_df)

# Preview the first rows, including the new sparse `features` column.
features_data.show()

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+
|               TITLE|CATEGORY|           title_str|               words|      filtered_words|            features|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+
|Fed official says...|       b|Fed official says...|[fed, official, s...|[fed, official, s...|(49043,[5,42,112,...|
|Fed's Charles Plo...|       b|Fed's Charles Plo...|[fed, s, charles,...|[fed, charles, pl...|(49043,[58,84,112...|
|US open: Stocks f...|       b|US open: Stocks f...|[us, open, stocks...|[us, open, stocks...|(49043,[1,27,112,...|
|Fed risks falling...|       b|Fed risks falling...|[fed, risks, fall...|[fed, risks, fall...|(49043,[5,112,578...|
|Fed's Plosser: Na...|       b|Fed's Plosser: Na...|[fed, s, plosser,...|[fed, plosser, na...|(49043,[112,121,5...|
|Plosser: Fed May ...|       b|Plosser: Fed May ...|[plosser, fed, ma...

In [15]:
# Make the cell safe to re-run: a previous execution leaves a `categoryIndex`
# column on features_data, and StringIndexer refuses to write to an output
# column that already exists. DataFrame.drop is a no-op when the column is
# absent, so the first run is unaffected.
category_input = features_data.drop("categoryIndex")

# Learn the CATEGORY -> numeric index mapping used as the classifier label.
indexer = StringIndexer(inputCol="CATEGORY", outputCol="categoryIndex").fit(category_input)

# Append the numeric label; the resulting schema is the same on every run.
features_data = indexer.transform(category_input)

#features_data.select("CATEGORY", "categoryIndex").show()

features_data.printSchema()

root
 |-- TITLE: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- title_str: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- categoryIndex: double (nullable = false)



In [12]:


# Optional peek at the label mapping (requires the categoryIndex column).
#features_data.select("CATEGORY", "categoryIndex").show()

# Print the schema of features_data.
# NOTE(review): the output saved under this cell comes from execution 12 and has
# no categoryIndex column, whereas the indexer cell above ran as execution 15 -
# the saved output looks stale; re-run in order to confirm.
features_data.printSchema()

root
 |-- TITLE: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- title_str: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)



In [None]:

# 80/20 train/test split; the fixed seed keeps the split repeatable.
train_Data, test_Data = features_data.randomSplit([0.8, 0.2], seed = 11)

# Aside (not used below): one-hot encoding, as used with sklearn / neural
# networks, turns class i out of k into a sparse length-k vector that is zero
# everywhere except position i, e.g. for k = 4:
#   a - [1, 0, 0, 0]
#   b - [0, 1, 0, 0]
#   c - [0, 0, 1, 0]
#   d - [0, 0, 0, 1]

# Multinomial Naive Bayes on the term-count vectors, with the indexed category
# as the label.
NB = NaiveBayes(modelType= "multinomial", labelCol="categoryIndex", featuresCol="features" )
nbModel = NB.fit(train_Data)

# Score the held-out rows; transform() adds the `prediction` column.
nb_predictions = nbModel.transform(test_Data)

nb_predictions.select("prediction", "categoryIndex", "features").show(5)

# NOTE(review): the recorded output of the preview above contains labels such
# as 161.0 and 20.0, so the label set is much larger than a handful of news
# categories - check CATEGORY data quality before trusting the accuracy figure.
evaluator = MulticlassClassificationEvaluator(labelCol= "categoryIndex", predictionCol = "prediction", 
                                              metricName="accuracy")

nb_accuracy = evaluator.evaluate(nb_predictions)

print("Accuracy : ", nb_accuracy)

# Distribution of predicted label indices on the test set.
nb_predictions.groupby('prediction').count().orderBy(col("count").desc()).show(truncate = False)

# Distribution of true labels on the test set. categoryIndex is shown next to
# CATEGORY so the rows can be matched against the numeric `prediction` values
# above; grouping by the string label alone made the two tables incomparable.
test_Data.groupby("CATEGORY", "categoryIndex").count().orderBy(col("count").desc()).show(truncate = False)



+----------+-------------+--------------------+
|prediction|categoryIndex|            features|
+----------+-------------+--------------------+
|       0.0|          0.0|(49043,[167,553,9...|
|       0.0|        161.0|(49043,[268,373,5...|
|       0.0|          0.0|(49043,[20,33,51,...|
|       0.0|          0.0|(49043,[59,950,23...|
|       0.0|         20.0|(49043,[153,325,5...|
+----------+-------------+--------------------+
only showing top 5 rows

