# TEXT CLASSIFICATION USING PYSPARK AND MLLIB

### Create DATA FRAME in PYSPARK

In [2]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [3]:
import os
#path = os.chdir('./')

from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [4]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, StopWordsRemover, IDF, Tokenizer
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
from pyspark.mllib.linalg import Vector

### Read Downloaded Data Files ; Create Learning DataSet

In [5]:
dirpath = 'data/NYT-articles/*'
NYTRawData = sc.wholeTextFiles(dirpath)

In [6]:
#print("The number of documents read in is " + NYTRawData.count() + ".")
NYTRawData.count()

452

### Create UNKNOWN Dataset

In [7]:
dirpathUN = 'data/Validation set/*'
NYTRawDataUNKNOWN = sc.wholeTextFiles(dirpathUN)

In [8]:
#print("The number of documents read in is " + NYTRawData.count() + ".")
NYTRawDataUNKNOWN.count()

137

### Display Sample Data from Learning Dataset 

In [9]:
NYTRawData.takeSample(False,1, seed = 231279)

[('file:/home/h4x3d/Documents/Document-classification-pyspark/data/NYT-articles/sports/5ae48c77068401528a2ab3c3',
  'The Mets’ Posture Improved Along With This Pitcher’s\nSAN DIEGO — Like many people, pitcher Robert Gsellman, a surprising and emerging force in the Mets’ bullpen, loves playing video games. His console of choice: Xbox One. The games in heavy rotation: FIFA, Call of Duty: WW II and, until recently, NBA 2K, which he ditched because he grew tired of losing to his friends.\nBut a problem with the way Gsellman, 24, was playing those video games undermined his pitching. In front of a screen, he hunched over, elbows on his knees. “I was sitting like that all the time,” he said.\nThis poor posture, Gsellman believed, developed after surgery to repair torn cartilage in his left (nonthrowing) shoulder before the 2017 season. Although his arm was in a cast for only three weeks, it created a bad habit of leaning his shoulders forward and curving his back.\nSo how could that affect h

### Display Sample Data from UNKNOWN Dataset 

In [10]:
NYTRawDataUNKNOWN.takeSample(False,1, seed = 231279)

[('file:/home/h4x3d/Documents/Document-classification-pyspark/data/Validation set/sports/5aa2fc34068401528a29237c',
  "With Checkups and Chickpea Pasta, the Mets Get Healthier. But Better?\nPORT ST. LUCIE, Fla. — After a season in which several high-profile injuries exposed deficiencies in the medical treatment and training of players, the Mets promised an overhaul. Spring training has provided a glimpse of what they meant.\nEvery day, players tap an iPad to record their sleeping habits and the color of their urine, to help assess if they are drinking enough water. At the clubhouse buffet, chickpea pasta is in, but chicken fingers are out. And a new job was created to direct the staff members how to coax the best performance out of the players.\n“Everything is a work in progress,” said Jim Cavallini, 39, the Mets’ director of performance and sports science, a position new for the team and relatively new across the sport. “We’re going to look at anything and everything that can make our

In [11]:
filepath = NYTRawData.map(lambda x:x[0]).collect()

#### Filter RDD to Capture Text

In [12]:
text = NYTRawData.map(lambda x:x[1]).collect()

In [13]:
textUN = NYTRawDataUNKNOWN.map(lambda x:x[1]).collect()

#### Convert to DataFrame
##### Learning Dataframe = "df"
##### Unknown Dataframe =" dfUN"

In [14]:
from pyspark.sql.types import Row

#here you are going to create a function
def f(x):
    d = {}
    for i in range(len(x)):
        d[str(i)] = x[i]
    return d

#Now populate that
df = NYTRawData.map(lambda x: Row(**f(x))).toDF()#.withColumn("Label",lit("Politics"))
dfUN = NYTRawDataUNKNOWN.map(lambda x: Row(**f(x))).toDF()

In [15]:
df.columns

['0', '1']

In [16]:
dfUN.columns

['0', '1']

In [17]:
df.show(4)

+--------------------+--------------------+
|                   0|                   1|
+--------------------+--------------------+
|file:/home/h4x3d/...|This Week: Al Pac...|
|file:/home/h4x3d/...|Remembering the C...|
|file:/home/h4x3d/...|Was Ashlee Simpso...|
|file:/home/h4x3d/...|8 Classical Music...|
+--------------------+--------------------+
only showing top 4 rows



#### Prepare Learning Dataset for Modeling using Classification Models
###### Split Columns to get Category of each Article

In [18]:
from pyspark.sql.functions import split
split_col = split(df['0'], '/')

In [19]:
df = df.withColumn('NAME6', split_col.getItem(6))
df = df.withColumn('NAME7', split_col.getItem(7))

In [20]:
df.show()

+--------------------+--------------------+------------+-----+
|                   0|                   1|       NAME6|NAME7|
+--------------------+--------------------+------------+-----+
|file:/home/h4x3d/...|This Week: Al Pac...|NYT-articles| arts|
|file:/home/h4x3d/...|Remembering the C...|NYT-articles| arts|
|file:/home/h4x3d/...|Was Ashlee Simpso...|NYT-articles| arts|
|file:/home/h4x3d/...|8 Classical Music...|NYT-articles| arts|
|file:/home/h4x3d/...|Brothers Osborne ...|NYT-articles| arts|
|file:/home/h4x3d/...|Old Crow Medicine...|NYT-articles| arts|
|file:/home/h4x3d/...|Review: A Pulitze...|NYT-articles| arts|
|file:/home/h4x3d/...|Review: Jonas Kau...|NYT-articles| arts|
|file:/home/h4x3d/...|He Made Kids’ Mus...|NYT-articles| arts|
|file:/home/h4x3d/...|10 Treasures, Une...|NYT-articles| arts|
|file:/home/h4x3d/...|Music Is a Sanctu...|NYT-articles| arts|
|file:/home/h4x3d/...|Heavenly Hymn: Th...|NYT-articles| arts|
|file:/home/h4x3d/...|Rafiq Bhatia Is W...|NYT-articles

In [21]:
df.printSchema()

root
 |-- 0: string (nullable = true)
 |-- 1: string (nullable = true)
 |-- NAME6: string (nullable = true)
 |-- NAME7: string (nullable = true)



In [22]:
drop_list = ['0', 'NAME6']#, 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
df = df.select([column for column in df.columns if column not in drop_list])
df.show(5)
df.printSchema()

+--------------------+-----+
|                   1|NAME7|
+--------------------+-----+
|This Week: Al Pac...| arts|
|Remembering the C...| arts|
|Was Ashlee Simpso...| arts|
|8 Classical Music...| arts|
|Brothers Osborne ...| arts|
+--------------------+-----+
only showing top 5 rows

root
 |-- 1: string (nullable = true)
 |-- NAME7: string (nullable = true)



In [23]:

from pyspark.sql.functions import col
df = df.select(col("1").alias("Article"), col("NAME7").alias("Category"))
df.show()

+--------------------+--------+
|             Article|Category|
+--------------------+--------+
|This Week: Al Pac...|    arts|
|Remembering the C...|    arts|
|Was Ashlee Simpso...|    arts|
|8 Classical Music...|    arts|
|Brothers Osborne ...|    arts|
|Old Crow Medicine...|    arts|
|Review: A Pulitze...|    arts|
|Review: Jonas Kau...|    arts|
|He Made Kids’ Mus...|    arts|
|10 Treasures, Une...|    arts|
|Music Is a Sanctu...|    arts|
|Heavenly Hymn: Th...|    arts|
|Rafiq Bhatia Is W...|    arts|
|A Pianist’s Big P...|    arts|
|14 Pop, Rock and ...|    arts|
|SummerStage Will ...|    arts|
|‘Parsifal’ Return...|    arts|
|Abreu’s Legacy: T...|    arts|
|The Pianist of th...|    arts|
|SZA Almost Quit M...|    arts|
+--------------------+--------+
only showing top 20 rows



In [24]:
df.show(500)

+--------------------+--------+
|             Article|Category|
+--------------------+--------+
|This Week: Al Pac...|    arts|
|Remembering the C...|    arts|
|Was Ashlee Simpso...|    arts|
|8 Classical Music...|    arts|
|Brothers Osborne ...|    arts|
|Old Crow Medicine...|    arts|
|Review: A Pulitze...|    arts|
|Review: Jonas Kau...|    arts|
|He Made Kids’ Mus...|    arts|
|10 Treasures, Une...|    arts|
|Music Is a Sanctu...|    arts|
|Heavenly Hymn: Th...|    arts|
|Rafiq Bhatia Is W...|    arts|
|A Pianist’s Big P...|    arts|
|14 Pop, Rock and ...|    arts|
|SummerStage Will ...|    arts|
|‘Parsifal’ Return...|    arts|
|Abreu’s Legacy: T...|    arts|
|The Pianist of th...|    arts|
|SZA Almost Quit M...|    arts|
|Review: American ...|    arts|
|Logic, XXXTentaci...|    arts|
|Meek Mill Is Rele...|    arts|
|Pop, Rock and Jaz...|    arts|
|Kendrick Lamar Wi...|    arts|
|Cardi B Arrives a...|    arts|
|Can Classical Mus...|    arts|
|How a Philly Chee...|    arts|
|Review:

##### Learning Data Set 

##### (Collection of approx 589 articles in 4 categories from NYT)

In [25]:
from pyspark.sql.functions import col
df.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------+-----+
|Category|count|
+--------+-----+
|politics|  147|
|    arts|  126|
|  sports|  107|
|business|   72|
+--------+-----+



#### Prepare UNKNOWN DataSet for Testing using CLASSIFICATION MODELS

In [26]:

split_col = split(dfUN['0'], '/')
dfUN = dfUN.withColumn('NAME6', split_col.getItem(6))
dfUN = dfUN.withColumn('NAME7', split_col.getItem(7))
dfUN.printSchema()
drop_list = ['0', 'NAME6']#, 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
dfUN = dfUN.select([column for column in dfUN.columns if column not in drop_list])
#dfUN.show(5)
dfUN = dfUN.select(col("1").alias("Article"), col("NAME7").alias("Category"))
#dfUN.show()
from pyspark.sql.functions import col
dfUN.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

root
 |-- 0: string (nullable = true)
 |-- 1: string (nullable = true)
 |-- NAME6: string (nullable = true)
 |-- NAME7: string (nullable = true)

+--------+-----+
|Category|count|
+--------+-----+
|  sports|   48|
|politics|   41|
|    arts|   25|
|business|   23|
+--------+-----+



### DIvide  Learning Data Set into Training and Test

In [27]:
# set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.8, 0.2], seed = 231279)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 369
Test Dataset Count: 83


In [28]:
trainingData.show()

+--------------------+--------+
|             Article|Category|
+--------------------+--------+
|10 Treasures, Une...|    arts|
|13 Pop, Rock and ...|    arts|
|13 Pop, Rock and ...|    arts|
|13 Pop, Rock and ...|    arts|
|14 Pop, Rock and ...|    arts|
|14 Pop, Rock and ...|    arts|
|14 Pop, Rock and ...|    arts|
|2018 N.F.L. Draft...|  sports|
|28 Years After Hi...|    arts|
|6 Classical Music...|    arts|
|6 Classical Music...|    arts|
|6 Classical Music...|    arts|
|6 Classical Music...|    arts|
|7 Classical Music...|    arts|
|7 Classical Music...|    arts|
|8 Classical Music...|    arts|
|8 Classical Music...|    arts|
|8 Classical Music...|    arts|
|A K-Pop Primer fo...|    arts|
|A Long-Lost Compo...|    arts|
+--------------------+--------+
only showing top 20 rows



In [29]:
testData.show()

+--------------------+--------+
|             Article|Category|
+--------------------+--------+
|100 Years After D...|    arts|
|15 Pop, Rock and ...|    arts|
|6 Classical Music...|    arts|
|Arsenal's Iwobi C...|  sports|
|Brazilian Footbal...|  sports|
|Brian Eno Wants t...|    arts|
|Classical Music i...|    arts|
|Exquisite Antonac...|    arts|
|FIFA Bans Brazili...|  sports|
|Formula One to La...|  sports|
|Heavenly Hymn: Th...|    arts|
|Highlights: Goldm...|business|
|Home Is Where the...|  sports|
|How Big Can the N...|  sports|
|I'm Judged Differ...|  sports|
|IAAF Legal Expert...|  sports|
|James Scores 43 a...|  sports|
|Lincoln Center’s ...|    arts|
|Meek Mill Is Rele...|    arts|
|NBA Asks Raptors ...|  sports|
+--------------------+--------+
only showing top 20 rows



#### DISPLAY UNknown DF

In [30]:
dfUN.show()

+--------------------+--------+
|             Article|Category|
+--------------------+--------+
|Review: Evgeny Ki...|    arts|
|Spinning Melody: ...|    arts|
|An Orchestra’s Ec...|    arts|
|J. Cole to Launch...|    arts|
|Nora Fischer Burs...|    arts|
|5 ‘Schoolhouse Ro...|    arts|
|14 Pop, Rock and ...|    arts|
|R. Kelly Again De...|    arts|
|A Tone Parallel t...|    arts|
|Pirate Radio Stat...|    arts|
|Review: Dudamel a...|    arts|
|Tumult at an Ital...|    arts|
|7 Classical Music...|    arts|
|R. Kelly Faces a ...|    arts|
|San Francisco Con...|    arts|
|The Playlist: Chr...|    arts|
|Review: Odd, Deli...|    arts|
|Liz Phair Is Not ...|    arts|
|The Playlist: Wil...|    arts|
|14 Pop, Rock and ...|    arts|
+--------------------+--------+
only showing top 20 rows



## Classification  Using Logistic Regression
**LogisticRegression** is a method used to predict a binary response. The current implementation of logistic regression in spark.ml only supports binary classes. Support for multiclass regression will be added in the future.
### Train the Learning Dataset (TRAIN And TEst)
### Build Pipeline using TF IDF 

In machine learning, it is common to run a sequence of algorithms to process and learn from data. Spark ML represents such a workflow as a Pipeline, which consists of a sequence of PipelineStages (Transformers and Estimators) to be run in a specific order. The pipeline we are using in this example consists of four stages: Tokenizer, StopWordsRemover, HashingTF, Inverse Document Frequency (IDF) and LogisticRegression.

**Tokenizer** splits the raw text documents into words, adding a new column with words into the dataset.

**StopWordsRemover** takes as input a sequence of strings and drops all the stop words from the input sequences. Stop words are words which should be excluded from the input, typically because the words appear frequently and don’t carry as much meaning. A list of stop words by default. Optionally you can provide a list of stopwords. We will just use the defualt list of stopwords.

**HashingTF** takes sets of terms and converts those sets into fixed-length feature vectors. 

**Inverse Document Frequency (IDF)** is a numerical measure of how much information a term provides. If a term appears very often across the corpus, it means it doesn’t carry special information about a particular document. IDF down-weights terms which appear frequently in a corpus.



###### Our model will make predictions and score on the test set; we then look at the top 10 predictions from the highest probability.

In [31]:
from pyspark.ml.feature import RegexTokenizer

In [32]:
regexTokenizer = RegexTokenizer(inputCol="Article", outputCol="words", pattern="\\W")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

In [33]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")

### Define Learning Model

In [34]:
pipeline = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, label_stringIdx, lr])

In [35]:
model = pipeline.fit(trainingData)


### Train the Model using Logistic Regression

In [36]:
lrdata = model.transform(trainingData).select("words","features","label","probability","prediction").show()

+--------------------+--------------------+-----+--------------------+----------+
|               words|            features|label|         probability|prediction|
+--------------------+--------------------+-----+--------------------+----------+
|[10, treasures, u...|(10000,[8,30,44,6...|  1.0|[0.01056133801902...|       1.0|
|[13, pop, rock, a...|(10000,[43,47,56,...|  1.0|[8.30472345846618...|       1.0|
|[13, pop, rock, a...|(10000,[30,53,73,...|  1.0|[0.00174489567721...|       1.0|
|[13, pop, rock, a...|(10000,[7,42,43,7...|  1.0|[0.00227054880856...|       1.0|
|[14, pop, rock, a...|(10000,[7,12,38,4...|  1.0|[6.75667054652320...|       1.0|
|[14, pop, rock, a...|(10000,[7,30,34,4...|  1.0|[0.00146082138661...|       1.0|
|[14, pop, rock, a...|(10000,[7,15,16,3...|  1.0|[0.00102699729954...|       1.0|
|[2018, n, f, l, d...|(10000,[7,26,50,5...|  2.0|[0.00484466911722...|       2.0|
|[28, years, after...|(10000,[47,55,90,...|  1.0|[0.03321213909407...|       1.0|
|[6, classical, 

### Perform Prediction on TEST Data

In [37]:
predictions = model.transform(testData)

In [38]:
predictions.filter(predictions['prediction'] == 1) \
    .select("Category","features","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+--------+------------------------------+------------------------------+-----+----------+
|Category|                      features|                   probability|label|prediction|
+--------+------------------------------+------------------------------+-----+----------+
|  sports|(10000,[47,156,453,507,547,...|[0.2903560737770349,0.35099...|  2.0|       1.0|
|    arts|(10000,[7,23,157,176,360,37...|[0.28155074196766117,0.5410...|  1.0|       1.0|
|    arts|(10000,[47,157,171,222,246,...|[0.27310567965008115,0.3728...|  1.0|       1.0|
|    arts|(10000,[7,21,23,49,86,101,1...|[0.25400775422788713,0.6418...|  1.0|       1.0|
|politics|(10000,[42,44,47,65,77,78,8...|[0.2529387163677332,0.72040...|  0.0|       1.0|
|    arts|(10000,[86,120,181,223,263,...|[0.2148214822506446,0.52279...|  1.0|       1.0|
|    arts|(10000,[3,7,9,27,51,53,84,1...|[0.17019379982059185,0.7526...|  1.0|       1.0|
|    arts|(10000,[24,30,33,157,165,16...|[0.16436852041379732,0.6711...|  1.0|       1.0|
|    arts|

## Accuracy on Test data using Logistic Regression
#### Keep in mind that the model has not seen the documents in the test data set.

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9273862430286162

## Accuracy of UNKNOWN Data using Logistic Regression
#### Keep in mind that the model has not seen the documents in the Unknown data set.

In [40]:
predictions = model.transform(dfUN)
predictions.filter(predictions['prediction'] == 1) \
    .select("Category","features","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+--------+------------------------------+------------------------------+-----+----------+
|Category|                      features|                   probability|label|prediction|
+--------+------------------------------+------------------------------+-----+----------+
|politics|(10000,[7,23,30,73,79,141,1...|[0.4035061967225257,0.49144...|  0.0|       1.0|
|    arts|(10000,[58,120,228,301,360,...|[0.1966187214074345,0.43803...|  1.0|       1.0|
|    arts|(10000,[47,102,135,157,196,...|[0.18654258052313855,0.5895...|  1.0|       1.0|
|    arts|(10000,[30,88,120,128,144,1...|[0.1757850896509791,0.65941...|  1.0|       1.0|
|    arts|(10000,[7,15,24,55,58,79,91...|[0.12225574536393546,0.7459...|  1.0|       1.0|
|    arts|(10000,[30,134,144,157,266,...|[0.11433150440957886,0.8018...|  1.0|       1.0|
|    arts|(10000,[15,30,53,70,132,157...|[0.09384819774133632,0.5764...|  1.0|       1.0|
|    arts|(10000,[1,30,68,70,88,102,1...|[0.08182519273160171,0.8120...|  1.0|       1.0|
|    arts|

0.8999603112974754

## CLASSIFICATION USING "NAIVE BAYES"
### RANDOM SPLIT DATA AGAIN
### DEFINE PIPELINE
### TRAIN MODEL

In [41]:
from pyspark.ml.classification import NaiveBayes
(trainingData2, testData2) = df.randomSplit([0.8, 0.2], seed = 231279)
nb = NaiveBayes(smoothing=1)
pipelinenb = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, label_stringIdx, nb])

model = pipelinenb.fit(trainingData2)


In [42]:
nbdata = model.transform(trainingData).select("words","features","label","probability","prediction").show()

+--------------------+--------------------+-----+--------------------+----------+
|               words|            features|label|         probability|prediction|
+--------------------+--------------------+-----+--------------------+----------+
|[10, treasures, u...|(10000,[8,30,44,6...|  1.0|   [0.0,1.0,0.0,0.0]|       1.0|
|[13, pop, rock, a...|(10000,[43,47,56,...|  1.0|   [0.0,1.0,0.0,0.0]|       1.0|
|[13, pop, rock, a...|(10000,[30,53,73,...|  1.0|   [0.0,1.0,0.0,0.0]|       1.0|
|[13, pop, rock, a...|(10000,[7,42,43,7...|  1.0|   [0.0,1.0,0.0,0.0]|       1.0|
|[14, pop, rock, a...|(10000,[7,12,38,4...|  1.0|   [0.0,1.0,0.0,0.0]|       1.0|
|[14, pop, rock, a...|(10000,[7,30,34,4...|  1.0|   [0.0,1.0,0.0,0.0]|       1.0|
|[14, pop, rock, a...|(10000,[7,15,16,3...|  1.0|   [0.0,1.0,0.0,0.0]|       1.0|
|[2018, n, f, l, d...|(10000,[7,26,50,5...|  2.0|   [0.0,0.0,1.0,0.0]|       2.0|
|[28, years, after...|(10000,[47,55,90,...|  1.0|[8.33052300449372...|       1.0|
|[6, classical, 

### Perform Prediction of TEst Data using Model Trained on "Naive Bayes"

In [43]:
predictions = model.transform(testData2)
predictions.filter(predictions['prediction'] == 0) \
    .select("Category","features","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+--------+------------------------------+------------------------------+-----+----------+
|Category|                      features|                   probability|label|prediction|
+--------+------------------------------+------------------------------+-----+----------+
|    arts|(10000,[15,73,152,193,228,2...|[1.0,2.3380926537918186E-47...|  1.0|       0.0|
|politics|(10000,[7,23,27,52,56,63,78...|[1.0,5.138151438246802E-93,...|  0.0|       0.0|
|politics|(10000,[52,70,120,132,159,2...|[1.0,2.0101768811056998E-10...|  0.0|       0.0|
|politics|(10000,[1,24,32,50,78,88,10...|[1.0,1.0021789169628446E-11...|  0.0|       0.0|
|politics|(10000,[20,21,72,86,91,132,...|[1.0,5.547498765932367E-137...|  0.0|       0.0|
|  sports|(10000,[62,70,132,152,201,2...|[1.0,1.6886373284652558E-15...|  2.0|       0.0|
|business|(10000,[52,55,63,65,70,141,...|[1.0,6.852633350625244E-166...|  3.0|       0.0|
|politics|(10000,[55,141,157,264,281,...|[1.0,3.319401254702701E-169...|  0.0|       0.0|
|politics|

## Accuracy of Test data Using "Naive Bayes"
#### Keep in mind that the model has not seen the documents in the test data set.

In [44]:
evaluatornb = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluatornb.evaluate(predictions)

0.9278184165232357

## Accuracy of UNKNOWN data Using "Naive Bayes"
#### Keep in mind that the model has not seen the documents in the Unknown data set.

In [45]:
predictions = model.transform(dfUN)
predictions.filter(predictions['prediction'] == 0) \
    .select("Category","features","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
evaluatornb = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluatornb.evaluate(predictions)

+--------+------------------------------+------------------------------+-----+----------+
|Category|                      features|                   probability|label|prediction|
+--------+------------------------------+------------------------------+-----+----------+
|    arts|(10000,[7,11,54,106,122,130...|[1.0,7.258982972147812E-35,...|  1.0|       0.0|
|politics|(10000,[11,24,32,42,80,141,...|[1.0,1.4634083509444678E-73...|  0.0|       0.0|
|  sports|(10000,[0,7,46,132,161,266,...|[1.0,9.616706063445867E-81,...|  2.0|       0.0|
|    arts|(10000,[36,70,125,196,221,2...|[1.0,8.005113533032365E-82,...|  1.0|       0.0|
|    arts|(10000,[23,42,157,160,209,2...|[1.0,3.2902687791451117E-90...|  1.0|       0.0|
|  sports|(10000,[120,128,132,196,229...|[1.0,3.1129315376896888E-11...|  2.0|       0.0|
|politics|(10000,[23,51,78,103,120,13...|[1.0,5.5326675419587386E-11...|  0.0|       0.0|
|  sports|(10000,[20,45,52,55,92,123,...|[1.0,3.400038547281304E-124...|  2.0|       0.0|
|politics|

0.8849339785122611

## Calculate ACCURACY using Cross Validation on Logistic Regression
Spark MLlib provides for cross-validation for hyperparameter tuning. Cross-validation attempts to fit the underlying estimator with user-specified combinations of parameters, cross-evaluate the fitted models, and output the best one.

In [46]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
#from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
pipelineCVLR = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, label_stringIdx,cv])
cvModel = pipelineCVLR.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)


0.9032372171375949

## Accuracy of UNKNOWN Data using CV with Logistic Regression


In [47]:
predictions = cvModel.transform(dfUN)
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9419612090569194

## CLASSIFICATION USING "RANDOM FOREST"

In [48]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
pipelineRF = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, label_stringIdx, rf])

# Train model with Training Data
rfModel = pipelineRF.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Category","features","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+--------+------------------------------+------------------------------+-----+----------+
|Category|                      features|                   probability|label|prediction|
+--------+------------------------------+------------------------------+-----+----------+
|politics|(10000,[20,21,45,63,132,141...|[0.6187553063619479,0.11438...|  0.0|       0.0|
|politics|(10000,[1,7,42,49,51,52,94,...|[0.6108743839701696,0.10833...|  0.0|       0.0|
|politics|(10000,[42,45,55,58,71,77,8...|[0.5983538448520096,0.11701...|  0.0|       0.0|
|politics|(10000,[12,47,52,126,132,13...|[0.5828053952322988,0.14426...|  0.0|       0.0|
|politics|(10000,[42,44,47,49,52,55,6...|[0.5478062405177812,0.14397...|  0.0|       0.0|
|politics|(10000,[7,20,21,23,44,56,63...|[0.541161776471956,0.131416...|  0.0|       0.0|
|politics|(10000,[1,39,42,47,51,52,73...|[0.53255121178801,0.1584731...|  0.0|       0.0|
|politics|(10000,[0,21,30,45,107,161,...|[0.5227977989472073,0.14550...|  0.0|       0.0|
|politics|

## Accuracy  of Test Data using "Random Forest"


In [49]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8015466517231717

## Accuracy  of UNKNOWN Data using "Random Forest"


In [50]:
predictions = rfModel.transform(dfUN)
predictions.filter(predictions['prediction'] == 0) \
    .select("Category","features","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+--------+------------------------------+------------------------------+-----+----------+
|Category|                      features|                   probability|label|prediction|
+--------+------------------------------+------------------------------+-----+----------+
|politics|(10000,[0,15,20,24,56,70,73...|[0.6117351120799032,0.09268...|  0.0|       0.0|
|politics|(10000,[7,24,55,63,71,78,94...|[0.6008149929993282,0.12772...|  0.0|       0.0|
|politics|(10000,[47,63,87,90,104,132...|[0.5928923234387008,0.11564...|  0.0|       0.0|
|politics|(10000,[15,17,55,102,118,13...|[0.5859199394139557,0.12883...|  0.0|       0.0|
|politics|(10000,[11,42,47,52,63,132,...|[0.5819386388831426,0.15671...|  0.0|       0.0|
|politics|(10000,[1,12,20,22,63,68,73...|[0.5671795629428075,0.11578...|  0.0|       0.0|
|politics|(10000,[0,7,21,47,49,51,70,...|[0.5652136471056777,0.13302...|  0.0|       0.0|
|politics|(10000,[20,71,77,82,90,120,...|[0.5595822541513068,0.14237...|  0.0|       0.0|
|politics|

0.674496706476051

## ASSESSMENT:


#### Accuracy of Unknown Dataset on Various Models:
##### Logistic Regression: $0.918297808424174$
##### Naive Bayes : $0.9387861118473364$
##### Cross Validation Using Logistic Regression : $0.9796340896607091$
##### Random Forest : $0.8011736273456751$

#### Clearly the Cross Validation yeilds highest accuracy.
#### Random forest is not a good choice for high-dimensional sparse data.

### Conclusion: Logistic Regression Using Cross Validation is the best model in our analysis

