#### Problem Statement: Predict the category (business, entertainment, etc.) of a news article given only its headline

In [None]:
import findspark
# Run findspark's initialisation before the pyspark imports in the following cell.
findspark.init()

In [None]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) a Spark session on the 'local[5]' master with 16g of driver memory.
spark = (
    SparkSession.builder
    .master('local[5]')
    .config('spark.driver.memory', '16g')
    .getOrCreate()
)

Starting the Spark Session

Import the required modules

In [None]:
from pyspark.ml import Pipeline 
from pyspark.ml.feature import CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from pyspark.sql.functions import col, udf,regexp_replace,isnull
from pyspark.sql.types import StringType,IntegerType
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Now we are loading the dataset uci-news-aggregator.csv.<br>
This dataset contains headlines, URLs, and categories for 422,937 news stories collected by a web aggregator between March 10th, 2014 and August 10th, 2014.

News categories included in this dataset include business; science and technology; entertainment; and health. Different news articles that refer to the same news item (e.g., several articles about recently released employment statistics) are also categorized together.

In [None]:
# Load the news CSV from the working directory, taking column names from its header line.
# NOTE(review): Spark's CSV reader escapes quotes with a backslash by default; if this
# file escapes quotes by doubling them, some rows may be mis-split — confirm, and pass
# escape='"' if needed.
news_data = spark.read.csv(
    path='uci-news-aggregator.csv',
    header=True,
)
# Inspect the resulting schema and a first sample of rows.
news_data.printSchema()
news_data.show()

We can check the total number of items in the dataset before analysis

In [None]:
# Count the rows present in the loaded dataset.
news_data.count()

We select the title (headline) of each news article and its corresponding category

In [None]:
# Keep only the headline text (TITLE) and its label (CATEGORY), then preview the result.
title_category = news_data.select("TITLE","CATEGORY")
title_category.show()

This is the custom function definition to count the null values

In [None]:
#definition to count the null values
def null_value_count(df):
  """Return the null count of every column of ``df`` that contains nulls.

  All columns are counted in a single aggregation, so the DataFrame is
  scanned once rather than once per column.

  Args:
    df: the Spark DataFrame to inspect.

  Returns:
    A list of ``(column_name, null_count)`` tuples, in column order,
    containing only the columns whose null count is greater than zero.
    The list is empty when no column has nulls (or ``df`` has no columns).
  """
  # Local import keeps this cell self-contained: the notebook's import cell
  # only brings `col` into scope from pyspark.sql.functions.
  from pyspark.sql.functions import count, when

  columns = df.columns
  if not columns:
    return []

  # count() ignores nulls and when() without otherwise() yields null for
  # non-matching rows, so each aggregate counts exactly the rows in which
  # that column is null.
  per_column_nulls = df.select(
    [count(when(col(name).isNull(), 1)).alias(name) for name in columns]
  ).first()

  # Pair by position so the result does not depend on looking fields up by name.
  return [
    (name, nulls)
    for name, nulls in zip(columns, per_column_nulls)
    if nulls > 0
  ]

We are applying the custom function to the data frame title_category

In [None]:
# Collect the (column, null count) pairs for the TITLE/CATEGORY frame.
null_columns_count_list = null_value_count(title_category)

In [None]:
# Display the result of the null-count check as the cell output.
null_columns_count_list

# Cleaning the dataset

Now we can drop the null values

In [None]:
# Discard every row in which TITLE or CATEGORY is null, then report how many rows remain.
title_category = title_category.dropna(how="any")
title_category.count()

In [None]:
# Preview the cleaned rows without truncating long headlines.
title_category.show(truncate=False)

In [None]:
# Number of distinct category values left after dropping nulls.
# NOTE(review): "Category" differs in case from the "CATEGORY" spelling used elsewhere in
# this notebook; presumably it resolves through case-insensitive column matching — confirm.
title_category.select("Category").distinct().count()

In [None]:
# Headlines ranked by how often they occur, most repeated first.
title_category.groupBy("TITLE").count().orderBy(col("count").desc()).show(truncate=False)

In [None]:
# Row count per category, largest first.
# NOTE(review): "Category" differs in case from the "CATEGORY" spelling used elsewhere in
# this notebook; presumably it resolves through case-insensitive column matching — confirm.
title_category.groupBy("Category").count().orderBy(col("count").desc()).show(truncate=False)

Now let us remove the numbers present in the TITLE column

In [None]:
# Strip every run of digits from the headlines; the result goes into a new "only_str" column.
# The pattern is a raw string so that \d reaches the regex engine as-is instead of being
# parsed by Python as an (invalid) string escape sequence.
title_category = title_category.withColumn("only_str", regexp_replace(col('TITLE'), r'\d+', ''))
# Show the original and the cleaned headline side by side.
title_category.select("TITLE","only_str").show(truncate=False)

Split the text into constituent words

In [None]:
# Tokenise the digit-free headline ("only_str") into a "words" column using the regex "\\W".
# Reference: https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.ml.feature.RegexTokenizer.html
regex_tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="only_str",
    outputCol="words",
)
raw_words = regex_tokenizer.transform(title_category)
raw_words.show()

Remove the stop words from the tokenized list of words

In [None]:
# Filter stop words (such as "for", "by", "in") out of the token lists into a "filtered" column.
remover = StopWordsRemover(
    outputCol="filtered",
    inputCol="words",
)
words_df = remover.transform(raw_words)
# Compare the token lists before and after filtering.
words_df.select("words", "filtered").show(truncate=False)

The category column in the dataframe can now be mapped to categoryIndex

In [None]:
# Encode each CATEGORY string as a numeric label stored in "categoryIndex".
indexer = StringIndexer(
    outputCol="categoryIndex",
    inputCol="CATEGORY",
)
indexer_model = indexer.fit(words_df)
feature_data = indexer_model.transform(words_df)
# Show the string label next to its numeric index.
feature_data.select("CATEGORY", "categoryIndex").show()

Convert text into vectors of token counts

In [None]:
# Fit a CountVectorizer on the filtered tokens and turn every headline into a
# token-count vector stored in "features".
cv = CountVectorizer(
    outputCol="features",
    inputCol="filtered",
)
model = cv.fit(feature_data)
countVectorizer_feateures = model.transform(feature_data)

# Partition the dataset into training and test datasets


In [None]:
# Split the vectorised data 80/20 into training and test sets using a fixed seed (11).
splits = countVectorizer_feateures.randomSplit([0.8, 0.2], seed=11)
trainingData, testData = splits
trainingData.show()
testData.show()

# Model Training and Prediction

## Naive Bayes Model

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split, then apply it to the test split.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="categoryIndex",
    modelType="multinomial",
)
nbModel = nb.fit(trainingData)
nb_predictions = nbModel.transform(testData)

In [None]:
# Preview the test rows after the fitted model has been applied to them.
nb_predictions.show()

In [None]:
# Keep only the predicted label, the true label index and the feature vector.
nb_predictions1 = nb_predictions.select("prediction", "categoryIndex", "features")

In [None]:
# Preview the reduced prediction frame.
nb_predictions1.show()

In [None]:
## Evaluating the model
# Score the "prediction" column against the true "categoryIndex" using the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="categoryIndex",
)
nb_accuracy = evaluator.evaluate(nb_predictions1)
print("Accuracy of NB is = %g" % nb_accuracy)

In [None]:
# Stop the Spark session now that the analysis is finished.
spark.stop()