In [None]:
# Location of the news-articles CSV, given relative to the working directory.
# Alternative absolute location used previously:
#   "file:///home/hadoop/data/Case_News_Articles.csv"
text_data_path = "data/Case_News_Articles.csv"

In [None]:
# Best-effort shutdown of a SparkContext left over in the kernel under the name `sc`.
try:
    sc.stop()
except NameError:
    # `sc` was never defined in this kernel, so there is nothing to stop.
    print("no sc to stop")
except Exception as exc:
    # `sc` exists but stopping it failed; report the real cause instead of hiding it.
    # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    print("failed to stop sc:", exc)


In [None]:
from pyspark.sql import SparkSession

# Obtain a Spark session via getOrCreate(), using the application name "PysparkNLP".
spark = (
    SparkSession.builder
    .appName("PysparkNLP")
    .getOrCreate()
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer, StringIndexer
from pyspark.sql.functions import col, udf, regexp_replace, isnull
from pyspark.sql.types import IntegerType, StringType

# NaiveBayes model
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [None]:
# Read the CSV, treating its first line as the header and asking Spark to infer column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(text_data_path)
)

In [None]:
# df.show()

In [8]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- PUBLISHER: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- STORY: string (nullable = true)
 |-- HOSTNAME: string (nullable = true)
 |-- TIMESTAMP: string (nullable = true)



In [9]:
# Preview the STORY, PUBLISHER and TITLE columns from the first 25 rows, untruncated.
(
    df.select(col("STORY"), col("PUBLISHER"), col("TITLE"))
    .limit(25)
    .show(truncate=False)
)

+-----------------------------+----------------------------+---------------------------------------------------------------------------+
|STORY                        |PUBLISHER                   |TITLE                                                                      |
+-----------------------------+----------------------------+---------------------------------------------------------------------------+
|ddUyU0VZz0BRneMioxUPQVP6sIxvM|Los Angeles Times           |Fed official says weak data caused by weather, should not slow taper       |
|ddUyU0VZz0BRneMioxUPQVP6sIxvM|Livemint                    |Fed's Charles Plosser sees high bar for change in pace of tapering         |
|ddUyU0VZz0BRneMioxUPQVP6sIxvM|IFA Magazine                |US open: Stocks fall after Fed official hints at accelerated tapering      |
|ddUyU0VZz0BRneMioxUPQVP6sIxvM|IFA Magazine                |Fed risks falling 'behind the curve', Charles Plosser says                 |
|ddUyU0VZz0BRneMioxUPQVP6sIxvM|Moneynews 

In [10]:
# Count the rows in the loaded DataFrame.
df.count()

422937

In [11]:
# List every distinct value found in the CATEGORY column, untruncated.
(
    df.select(col("CATEGORY"))
    .dropDuplicates()
    .show(truncate=False)
)

+------------------------------------------------------------------------------------------------------------------------------------+
|CATEGORY                                                                                                                            |
+------------------------------------------------------------------------------------------------------------------------------------+
|http://www.startribune.com/entertainment/tv/259584711.html                                                                          |
|Akron Beacon Journal \(blog\)                                                                                                       |
|GSMArena.com                                                                                                                        |
|MovieViral                                                                                                                          |
|Pitchfork Media                                       

In [12]:
# Keep only the TITLE and CATEGORY columns and display the first rows.
title_category = df.select(col("TITLE"), col("CATEGORY"))
title_category.show()

+--------------------+--------+
|               TITLE|CATEGORY|
+--------------------+--------+
|Fed official says...|       b|
|Fed's Charles Plo...|       b|
|US open: Stocks f...|       b|
|Fed risks falling...|       b|
|Fed's Plosser: Na...|       b|
|Plosser: Fed May ...|       b|
|Fed's Plosser: Ta...|       b|
|Fed's Plosser exp...|       b|
|US jobs growth la...|       b|
|ECB unlikely to e...|       b|
|ECB unlikely to e...|       b|
|EU's half-baked b...|       b|
|Europe reaches cr...|       b|
|ECB FOCUS-Stronge...|       b|
|EU aims for deal ...|       b|
|Forex - Pound dro...|       b|
|Noyer Says Strong...|       b|
|EU Week Ahead Mar...|       b|
|ECB member Noyer ...|       b|
|Euro Anxieties Wa...|       b|
+--------------------+--------+
only showing top 20 rows



In [13]:
# Import only the functions used in this cell. A wildcard import of
# pyspark.sql.functions would shadow Python builtins such as sum, max, min and round
# for every cell that runs afterwards.
from pyspark.sql.functions import count, isnull, when

# Number of null values per column: `when` produces a non-null value (the column name
# as a literal) only for rows where the column is null, and `count` counts the
# non-null results, i.e. exactly the null rows.
title_category.select(
    [count(when(isnull(c), c)).alias(c) for c in title_category.columns]
).show()



+-----+--------+
|TITLE|CATEGORY|
+-----+--------+
|  389|     516|
+-----+--------+



In [14]:
# Discard every row that has a null in any column.
title_category = title_category.na.drop()

In [15]:
# Number of distinct category values remaining after the null rows were dropped.
title_category.select(col("Category")).dropDuplicates().count()

265

In [16]:
# Rows per category value, most frequent first.
(
    title_category
    .groupBy("Category")
    .count()
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

+--------------------+------+
|Category            |count |
+--------------------+------+
|e                   |152127|
|b                   |115935|
|t                   |108237|
|m                   |45616 |
|Us Magazine         |31    |
|GossipCop           |20    |
|Contactmusic.com    |20    |
|CBS News            |12    |
|Complex.com         |12    |
|HipHopDX            |11    |
|The Hollywood Gossip|11    |
|HeadlinePlanet.com  |10    |
|We Got This Covered |10    |
|Gamepur             |8     |
|WorstPreviews.com   |7     |
|Consequence of Sound|7     |
|Wetpaint            |7     |
|TooFab.com          |7     |
|The Escapist        |6     |
|Reality TV World    |5     |
+--------------------+------+
only showing top 20 rows



In [34]:
# Single-letter category codes treated as valid labels. The other values that show up
# in the column (publisher names, URLs) are presumably rows whose fields were shifted
# while the CSV was parsed -- TODO confirm against the raw file.
valid_categories = ["e", "b", "t", "m"]

# Count the rows whose category is one of the valid codes. `isin` replaces the
# hand-written chain of `==` comparisons joined with `|`.
# NOTE(review): this only counts the matching rows; title_category itself is not
# reassigned, so later cells still see every category value. Confirm whether the
# filter is meant to be applied to the data used for training.
title_category.filter(title_category["Category"].isin(valid_categories)).count()

0

In [20]:
# Rows per category value, most frequent first.
(
    title_category
    .groupBy("Category")
    .count()
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

+--------+-----+
|Category|count|
+--------+-----+
+--------+-----+



In [17]:
# Rows per distinct title, most frequent first.
(
    title_category
    .groupBy("Title")
    .count()
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

+----------------------------------------------------------------------------------+-----+
|Title                                                                             |count|
+----------------------------------------------------------------------------------+-----+
|The article requested cannot be found! Please refresh your browser or go back  ...|145  |
|Business Highlights                                                               |59   |
|Posted by Parvez Jabri                                                            |59   |
|Posted by Imaduddin                                                               |53   |
|Posted by Shoaib-ur-Rehman Siddiqui                                               |52   |
|(click the phrases to see a list)                                                 |51   |
|Business Wire                                                                     |41   |
|PR Newswire                                                                       |38   |

In [18]:
# Add a "title_str" column holding the title with every run of digits removed.
# The pattern is a raw string so that `\d` reaches the regex engine as written instead
# of relying on an invalid Python escape sequence in a normal string literal.
title_category = title_category.withColumn(
    "title_str", regexp_replace(col("TITLE"), r"\d+", "")
)

# Show the original and the cleaned title side by side, untruncated.
title_category.select("TITLE", "title_str").show(truncate=False)

+---------------------------------------------------------------------------+---------------------------------------------------------------------------+
|TITLE                                                                      |title_str                                                                  |
+---------------------------------------------------------------------------+---------------------------------------------------------------------------+
|Fed official says weak data caused by weather, should not slow taper       |Fed official says weak data caused by weather, should not slow taper       |
|Fed's Charles Plosser sees high bar for change in pace of tapering         |Fed's Charles Plosser sees high bar for change in pace of tapering         |
|US open: Stocks fall after Fed official hints at accelerated tapering      |US open: Stocks fall after Fed official hints at accelerated tapering      |
|Fed risks falling 'behind the curve', Charles Plosser says                 

In [19]:
# Tokenize the cleaned titles into a "words" column, using non-word characters (\W)
# as the tokenizer pattern.
regex_tokenizer = RegexTokenizer(
    pattern=r"\W",
    inputCol="title_str",
    outputCol="words",
)
raw_words = regex_tokenizer.transform(title_category)

raw_words.show()

+--------------------+--------+--------------------+--------------------+
|               TITLE|CATEGORY|           title_str|               words|
+--------------------+--------+--------------------+--------------------+
|Fed official says...|       b|Fed official says...|[fed, official, s...|
|Fed's Charles Plo...|       b|Fed's Charles Plo...|[fed, s, charles,...|
|US open: Stocks f...|       b|US open: Stocks f...|[us, open, stocks...|
|Fed risks falling...|       b|Fed risks falling...|[fed, risks, fall...|
|Fed's Plosser: Na...|       b|Fed's Plosser: Na...|[fed, s, plosser,...|
|Plosser: Fed May ...|       b|Plosser: Fed May ...|[plosser, fed, ma...|
|Fed's Plosser: Ta...|       b|Fed's Plosser: Ta...|[fed, s, plosser,...|
|Fed's Plosser exp...|       b|Fed's Plosser exp...|[fed, s, plosser,...|
|US jobs growth la...|       b|US jobs growth la...|[us, jobs, growth...|
|ECB unlikely to e...|       b|ECB unlikely to e...|[ecb, unlikely, t...|
|ECB unlikely to e...|       b|ECB unl

In [20]:
# Remove stop words from the token lists, writing the result to "filtered_words".
remover = StopWordsRemover(
    outputCol="filtered_words",
    inputCol="words",
)
words_df = remover.transform(raw_words)

# Compare the token lists before and after stop-word removal.
words_df.select(col("words"), col("filtered_words")).show()

+--------------------+--------------------+
|               words|      filtered_words|
+--------------------+--------------------+
|[fed, official, s...|[fed, official, s...|
|[fed, s, charles,...|[fed, charles, pl...|
|[us, open, stocks...|[us, open, stocks...|
|[fed, risks, fall...|[fed, risks, fall...|
|[fed, s, plosser,...|[fed, plosser, na...|
|[plosser, fed, ma...|[plosser, fed, ma...|
|[fed, s, plosser,...|[fed, plosser, ta...|
|[fed, s, plosser,...|[fed, plosser, ex...|
|[us, jobs, growth...|[us, jobs, growth...|
|[ecb, unlikely, t...|[ecb, unlikely, e...|
|[ecb, unlikely, t...|[ecb, unlikely, e...|
|[eu, s, half, bak...|[eu, half, baked,...|
|[europe, reaches,...|[europe, reaches,...|
|[ecb, focus, stro...|[ecb, focus, stro...|
|[eu, aims, for, d...|[eu, aims, deal, ...|
|[forex, pound, dr...|[forex, pound, dr...|
|[noyer, says, str...|[noyer, says, str...|
|[eu, week, ahead,...|[eu, week, ahead,...|
|[ecb, member, noy...|[ecb, member, noy...|
|[euro, anxieties,...|[euro, anx

In [21]:
# Turn each filtered token list into a term-count vector stored in "features".
cv = CountVectorizer(
    outputCol="features",
    inputCol="filtered_words",
)

# Fit the vectorizer on the full token DataFrame, then apply it to the same data.
model = cv.fit(words_df)
features_data = model.transform(words_df)

features_data.show()


+--------------------+--------+--------------------+--------------------+--------------------+--------------------+
|               TITLE|CATEGORY|           title_str|               words|      filtered_words|            features|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+
|Fed official says...|       b|Fed official says...|[fed, official, s...|[fed, official, s...|(49043,[5,42,112,...|
|Fed's Charles Plo...|       b|Fed's Charles Plo...|[fed, s, charles,...|[fed, charles, pl...|(49043,[58,84,112...|
|US open: Stocks f...|       b|US open: Stocks f...|[us, open, stocks...|[us, open, stocks...|(49043,[1,27,112,...|
|Fed risks falling...|       b|Fed risks falling...|[fed, risks, fall...|[fed, risks, fall...|(49043,[5,112,579...|
|Fed's Plosser: Na...|       b|Fed's Plosser: Na...|[fed, s, plosser,...|[fed, plosser, na...|(49043,[112,121,5...|
|Plosser: Fed May ...|       b|Plosser: Fed May ...|[plosser, fed, ma...

In [22]:
# Print the schema of the vectorized DataFrame.
features_data.printSchema()

root
 |-- TITLE: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- title_str: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)



In [23]:
#### Multi-label / multi-class classification

# Label encoding with the help of StringIndexer: each distinct string label is mapped to a numeric index,
# e.g. {apple: 0, bear: 1, crow: 2, dog: 3, ...}

In [24]:

# Encode the CATEGORY strings as numeric labels in a new "categoryIndex" column.
indexer = StringIndexer(inputCol="CATEGORY", outputCol="categoryIndex")

# Drop any "categoryIndex" left over from a previous run of this cell (drop is a no-op
# when the column is absent). Without this, re-running the cell fails because the
# indexer's output column already exists on features_data.
unindexed_data = features_data.drop("categoryIndex")

features_data = indexer.fit(unindexed_data).transform(unindexed_data)



In [25]:
# Show the distinct (category, index) pairs produced by the indexer.
(
    features_data
    .select(col("CATEGORY"), col("CategoryIndex"))
    .dropDuplicates()
    .show()
)

+--------------------+-------------+
|            CATEGORY|CategoryIndex|
+--------------------+-------------+
|        The Next Web|        196.0|
|dZzicVR_bvCJYFMk1...|        157.0|
|                  MO|        124.0|
| Boca Raton \(blog\)|        234.0|
|http://www.movies...|        131.0|
|        GSMArena.com|         73.0|
|          MovieViral|        145.0|
| Download & Mixta...|        247.0|
|http://www.newnow...|        199.0|
|          NewNowNext|         33.0|
|http://www.startr...|        259.0|
|         Boing Boing|         96.0|
|Akron Beacon Jour...|        133.0|
|           pen  ..."|        237.0|
|              Cambio|        225.0|
|http://www.hitfix...|         99.0|
|WindowsItPro \(su...|        109.0|
|        The Escapist|         18.0|
|            TechOne3|        257.0|
|Yahoo Singapore N...|        193.0|
+--------------------+-------------+
only showing top 20 rows



In [26]:
# Print the schema again, now that the categoryIndex column has been added.
features_data.printSchema()


root
 |-- TITLE: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- title_str: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- categoryIndex: double (nullable = false)



In [27]:
#### Partitioning train & test data

In [28]:
# Randomly split the rows with weights 0.8 (train) and 0.2 (test), using a fixed seed.
train, test = features_data.randomSplit([0.8, 0.2], seed=999)

In [29]:
#### Model training & evaluation

In [30]:
# Naive Bayes
# Print the schema of the training split before fitting the model.
train.printSchema()

root
 |-- TITLE: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- title_str: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- categoryIndex: double (nullable = false)



In [35]:
# Multinomial Naive Bayes, using "features" as the input vectors and "categoryIndex" as the label.
NB = NaiveBayes(
    featuresCol="features",
    labelCol="categoryIndex",
    modelType="multinomial",
)

# Fit on the training split, then apply the fitted model to the test split.
nb_model = NB.fit(train)
nb_predictions = nb_model.transform(test)

In [32]:
# Select only the prediction column. No action such as show() is called, so the cell
# displays just the DataFrame's column summary rather than any rows.
nb_predictions.select("prediction")

DataFrame[prediction: double]

In [33]:

#train.select([count(when(isnull(c), c)).alias(c) for c in title_category.columns]).show()



In [None]:

# Evaluate the predictions with the "accuracy" metric, comparing "prediction" against "categoryIndex".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="categoryIndex",
)

nb_accuracy = evaluator.evaluate(nb_predictions)

print("Accuracy : ", nb_accuracy)