# Prediction using PySpark

Reference URLs:
* http://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html
* https://medium.com/@GalarnykMichael/install-spark-on-windows-pyspark-4498a5d8d66c [I didn't go this route]

In [12]:
# A SparkContext is the connection to a Spark cluster; RDDs and broadcast
# variables on that cluster are created through it.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

# Stop a context left over from an earlier run of this cell, if there is one.
try:
    sc.stop()
except NameError:
    print("sc not defined")

# Local master, application name "ClassifyUrl".
config = SparkConf()
config.setMaster("local")
config.setAppName("ClassifyUrl")

sc = SparkContext(conf=config)
# Call sc.stop() whenever the context has to be shut down.

In [13]:
from pyspark.sql import SparkSession

# Build (or reuse) a session with a local master.
# Extra settings can be chained before getOrCreate(), e.g.
#     .config("spark.some.config.option", "some-value")
spark = (
    SparkSession.builder
    .master("local")
    .appName("Classify Urls")
    .getOrCreate()
)


In [14]:
# Load the training set; header=True takes the column names from the first line.
# Alternative reader form:
#     spark.read.format("csv").option("header", "true").load("bigdata/train.csv")
train = spark.read.csv(path='bigdata/train.csv', header=True)

# Show the column names and data types of the loaded dataset.
train.printSchema()

root
 |-- Webpage_id: string (nullable = true)
 |-- Domain: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Tag: string (nullable = true)



In [15]:
# Preview the first five rows of the training DataFrame.
train.head(5)

[Row(Webpage_id='1', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/marketing/tecfidera-gilenya-and-aubagio-s-3-way-battle-for-ms-share-about-to-get-more-interesting', Tag='news'),
 Row(Webpage_id='2', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/novo-equipped-to-weather-storm-u-s-diabetes-market-ceo-says', Tag='news'),
 Row(Webpage_id='3', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/another-exec-departs-troubled-endo-and-time-it-s-for-another-drugmaker', Tag='news'),
 Row(Webpage_id='4', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/teva-buy-biosim-specialist-celltrion-it-wouldn-t-say-no', Tag='news'),
 Row(Webpage_id='5', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/marketing/actress-marissa-tomei-partners-allergan-restasis-to-drive-dry-eye-awareness', Tag='news')]

In [16]:
# Count the rows in the training DataFrame.
train.count()

53447

In [17]:
# Preview the first five rows of the training DataFrame again.
train.head(5)

[Row(Webpage_id='1', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/marketing/tecfidera-gilenya-and-aubagio-s-3-way-battle-for-ms-share-about-to-get-more-interesting', Tag='news'),
 Row(Webpage_id='2', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/novo-equipped-to-weather-storm-u-s-diabetes-market-ceo-says', Tag='news'),
 Row(Webpage_id='3', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/another-exec-departs-troubled-endo-and-time-it-s-for-another-drugmaker', Tag='news'),
 Row(Webpage_id='4', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/teva-buy-biosim-specialist-celltrion-it-wouldn-t-say-no', Tag='news'),
 Row(Webpage_id='5', Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/marketing/actress-marissa-tomei-partners-allergan-restasis-to-drive-dry-eye-awareness', Tag='news')]

In [2]:
# OBJECTIVE : Create a DataFrame from RDD

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import SQLContext

# Source records: each student is a list of (field name, value) pairs.
data = sc.parallelize([
    [('Id',1),('Name','Ganesh'),('Math',100),('Science',100),('Language',100)],
    [('Id',2),('Name','Karthik'),('Math',99),('Science',99),('Language',99)],
    [('Id',3),('Name','Kumar'),('Math',80),('Science',75),('Language',85)],
    [('Id',4),('Name','Anil'),('Math',80),('Science',55),('Language',65)],
    [('Id',5),('Name','Mamooty'),('Math',45),('Science',35),('Language',55)],
    [('Id',6),('Name','John'),('Math',65),('Science',75),('Language',95)],
    [('Id',7),('Name','Khan'),('Math',33),('Science',45),('Language',35)],
])

# Drop the field names and keep the values, in order, as one tuple per student.
data_converted = data.map(lambda pairs: tuple(value for _, value in pairs))
print(type(data_converted))

# Schema: one nullable field per column, in the same order as the tuple values.
column_types = [
    ("Id", IntegerType()),
    ("Name", StringType()),
    ("Math", IntegerType()),
    ("Science", IntegerType()),
    ("Language", IntegerType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in column_types])

# Create dataframe
# A SQLContext can be used to create DataFrames, register DataFrames as tables,
# execute SQL over tables, cache tables, and read parquet files.
sqlContext = SQLContext(sc, sparkSession=spark)
df = sqlContext.createDataFrame(data_converted, schema)

# Output
df.show()

NameError: name 'sc' is not defined

In [None]:
# OBJECTIVE : Create a DataFrame from Tuples :

# Column labels, in the same order as the values inside each tuple.
column_names = ['Id', 'Name', 'Math', 'Science', 'Language']

# One tuple per student.
rows = [
    (1, 'Ganesh', 100, 100, 100),
    (2, 'Karthik', 99, 99, 99),
    (3, 'Kumar', 80, 75, 85),
    (4, 'Anil', 80, 55, 65),
    (5, 'Mamooty', 35, 100, 55),
    (6, 'John', 65, 75, 95),
    (7, 'Khan', 33, 45, 35),
]

# Build the DataFrame, then print its schema and contents.
df = spark.createDataFrame(data=rows, schema=column_names)
df.printSchema()
df.show()

In [1]:
# Order by Name Ascending
df.sort(df.Name.asc()).show()
# Order by Name Descending
df.sort(df.Name.desc()).show()

''

In [None]:
import pyspark.sql.functions as func

# NOTE: withColumn needs a Column expression, so a plain Python list such as
# [70, 75, 80, 85, 90, 95, 65] cannot be passed to add an 'Arts' column.

# Total marks across the three subjects.
marks_total = df.Math + df.Science + df.Language
df = df.withColumn('Total', marks_total)
df.show()

# Total divided by the three subjects, rounded to two decimals.
df = df.withColumn('Overall Percent', func.round(df.Total / 3, 2))
df.show()

# Remove that column again.
df = df.drop('Overall Percent')
df.show()

# Re-add the value under a temporary name, then rename it to the final name.
overall = func.round(df.Total / 3, 2)
df = (
    df.withColumn('OverallPercentttt', overall)
    .withColumnRenamed('OverallPercentttt', 'OverallPercent')
)
df.show()

In [None]:
# SQL Queries on DataFrame
# Expose the DataFrame to Spark SQL as a temporary view named `df_table`.
# `registerTempTable` is deprecated since Spark 2.0; `createOrReplaceTempView`
# is its documented replacement.
# The lifetime of this temporary view is tied to the `SparkSession` that was used to create this `DataFrame`.
df.createOrReplaceTempView('df_table')

# Select the rows whose OverallPercent is above 75 and show up to five of them.
df2 = spark.sql('select * from df_table where OverallPercent > 75.0')
df2.show(5)

In [18]:
# Last Cell : Clean-up
# Stop the SparkContext `sc` once the notebook is finished with it.
sc.stop()

## Pandas Versus PySpark
Some differences between pandas and PySpark DataFrames are:
    - Operations on a PySpark DataFrame run in parallel on different nodes of the cluster, which is not possible with pandas.
    - Operations on a PySpark DataFrame are lazy, whereas in pandas we get the result as soon as we apply an operation.
    - A PySpark DataFrame is immutable, so we cannot change it in place; we have to transform it into a new DataFrame. This is not the case in pandas.
    - The pandas API supports more operations than the PySpark DataFrame API, so in that respect pandas is still the more powerful of the two.
    - Complex operations are easier to perform in pandas than on a PySpark DataFrame.
