In [None]:
# cd /opt/spark-3.0.0-bin-hadoop3.2/python
# python setup.py install
# Start jupyter notebook
# Ref: https://github.com/pinarersoy/PySpark_SparkSQL_MLib/blob/master/PySpark%20and%20SparkSQL.ipynb
# Data Source: Kaggle: https://www.kaggle.com/cmenca/new-york-times-hardcover-fiction-best-sellers

In [None]:
# 1.0 Run following command to transfer files
#     from localfile system to hadoop
hdfs dfs -put /home/ashok/Documents/spark/1.basics/nyt2.json  /user/ashok/

In [None]:
# 1.1 Start SparkSession and create SparkContext
#     (no need to execute if you started pyspark)
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

In [None]:
# 1.2 Import pyspark related modules
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# 1.3 Other libraries
from datetime import date, timedelta, datetime
import pandas as pd
import time, os

In [None]:
# 1.4 Make the folder holding the data files the working directory
#     and list its contents
os.chdir("/home/ashok/Documents/spark/1.basics")
os.listdir()

In [None]:
# 1.5 Print the built-in documentation of SparkSession
help(SparkSession)

In [None]:
# 2.0 Create the session, or reuse one that is already running.
#     While it is alive it can be inspected at http://localhost:4040,
#     where it is listed under the application name "myExpts".
#     (Skip this cell if the notebook was launched through pyspark.)
spark = (
    SparkSession.builder
    .master("local")
    .appName("myExpts")
    .getOrCreate()
)
                    

In [None]:
# 2.1 Display the output of every command in a cell,
#     not only that of the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 3.0 Read the JSON file into a DataFrame
nyt = spark.read.json("/user/ashok/nyt2.json")
# Other formats are read through the same session object (`spark`):
# Read TXT FILES
# dataframe_txt = spark.read.text('text_data.txt')

# Read CSV FILES
# dataframe_csv = spark.read.csv('csv_data.csv')

# Read PARQUET FILES
# dataframe_parquet = spark.read.load('parquet_data.parquet')

In [None]:
# 4.0 Wide tables wrap and overlap in jupyter. To stop that:
# 4.1 Go to the notebook style folder:
#           cd  /home/ashok/anaconda3/lib/python3.7/site-packages/notebook/static/style

# 4.2 Keep a backup of style.min.css:
#    cp style.min.css style.min.css.old

# 4.3 Check that both files exist:
#    ls style.min*

# 4.4 Open style.min.css in an editor:
#    leafpad style.min.css
#    Search for   white-space: pre-wrap;
#    (possibly at line numbers 1568 and 11443)
#    It occurs in two places. Comment out both lines, like this:
#        /* white-space: pre-wrap;   */
#    Save the file under the same name, style.min.css.
# 4.5 Restart jupyter
#    See StackOverflow: https://stackoverflow.com/a/60295993
# 4.6 Alternatively, print each record vertically:
#      nyt.show(n=5, truncate=False, vertical=True)

# First three rows, long values truncated
nyt.show(n=3, truncate=True)

In [None]:
# 5.0 Some DataFrame commands for a first look at the data
nyt.count()             # number of rows
nyt.describe().show()   # summary statistics of the columns
nyt.summary().show()    # summary statistics, a second variant
nyt.printSchema()       # column names and types printed as a tree
nyt.columns             # column names
nyt.dtypes              # (column name, type) pairs

In [None]:
# 5.1 Documentation of sort(); there is also an orderBy() function
help(nyt.sort)

In [None]:
# 5.2 Sort on several columns: author first, then publisher (both ascending)
nyt.sort("author", "publisher").show(n=5)
# Same sort keys, but with publisher in descending order
nyt.sort("author", desc("publisher")).show(n=5)

In [None]:
# 5.3 Is any data duplicated? Eyeball the first rows ordered by author,
#     without truncating long values
(
    nyt.sort("author")
       .select("_id", "author", "title")
       .show(n=10, truncate=False)
)

In [None]:
# 5.4 Are there duplicate rows?
#     Count rows per (title, author) pair and list the most frequent pairs first
grouped = nyt.groupBy("title", "author")
pair_counts = grouped.count()
pair_counts.sort(desc("count")).show()

In [None]:
# 5.5 Remove duplicate rows and count what is left.
#     No column subset is passed, so rows are compared on all columns.
nyt = nyt.dropDuplicates()
nyt.count()

In [None]:
# Keep only rows whose author is one of the two names; show the first one
nyt.filter(nyt.author.isin("Dan Brown", "Harper Lee")).show(n=1)

In [None]:
# 6.0 Filter commands: titles written by either of the two authors
(
    nyt.where(col("author").isin("Dan Brown", "Harper Lee"))
       .select("title")
       .show()
)

In [None]:
# 6.1 In a SQL-style condition string, combine alternatives with OR (not ||)
(
    nyt.filter("author == 'Dan Brown'  OR author == 'Harper Lee' ")
       .select("author")
       .show()
)

In [None]:
# 6.2 Use of LIKE with % wildcards in SQL-style condition strings
# https://stackoverflow.com/questions/41889974/filter-df-when-values-matches-part-of-a-string-in-pyspark
nyt.select("author").filter("author like   '%Har%'").show(3)
nyt.filter("author like '%Har%'").select("title").show(3)
# A "starts with" test in a condition string is LIKE with a trailing
# wildcard ("title startswith " is not a valid SQL expression and fails
# to parse). For the method form use Column.startswith(), as in 6.3.
nyt.filter("title like 'The%'").select("title").show(3)


In [None]:
# 6.3 A uniform way to write the conditions: methods of the column object
nyt.filter(nyt["author"].startswith("Har")).show(n=3)
nyt.filter(nyt["author"].endswith("en")).show(n=3, truncate=False)
nyt.filter(nyt["author"].like("%Har%")).show(n=3)
nyt.filter(nyt["author"].isin("Harlan Coben", "Dan Brown")).show(n=3)


In [None]:
# 6.4 The same conditions inside select(): no rows are removed; each
#     condition is shown as an extra column next to "author"
nyt.select("author", nyt["author"].startswith("Har")).show(n=3)
nyt.select("author", nyt["author"].endswith("en")).show(n=3, truncate=False)
nyt.select("author", nyt["author"].like("%Har%")).show(n=3)
nyt.select("author", nyt["author"].isin("Harlan Coben", "Dan Brown")).show(n=3)


In [None]:
# 7.0 Documentation of distinct()
help(nyt.distinct)

In [None]:
# 7.1 Distinct author values (first three)
authors_only = nyt.select("author")
authors_only.distinct().show(n=3)

In [None]:
# 7.2 Equivalent of pandas value_counts(): number of rows per author
grouped = nyt.groupBy("author")
grouped.count().show()

In [None]:
# 8.0 Missing values
#     DataFrame.na offers three methods:
#     na.drop, na.fill and na.replace
#               .drop() :
#                        .drop('any'): Drop a row if 'any' field value is NULL
#                        .drop('all'): Drop a row if 'all' field values are NULL
#                        .drop(thresh = 5) : Keep a row only if at least 5 field values are
#                                       non-NULL, i.e. drop rows that have fewer than 5
#                                       non-NULL values (thresh takes precedence over 'any'/'all')
#                        .drop('any'/'all'/thresh, subset = ['col1', 'col2']): Consider only this
#                                       column subset for 'any'/'all'/thresh behaviour
#               .fill(value): Replace NULLs with this value everywhere
#               .fill(value, subset = ['col1', 'col2']) : Replace NULLs in just these columns
#               .fill({'col1': val1, 'col2' : val2}): Fill NULLs per column, as given here
#               .replace(what-to-replace,replace_with,subset)
#                   what-to-replace: Value to be replaced. Can be dict {'value': its-replacement}
#                                 or a list [23,43] ie replace 23 and 43
#                   replace-with: Replace value(s) with what. Can be a list [21,41]
#                   subset: ['col1', 'col2']: Consider these columns
help(nyt.na)

# NaN vs Null in Spark
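In Spark, a null is a missing value, i.e. something empty, and it can occur in a column of any type. NaN ("not a number"), however, is a special floating-point value that stands for an undefined numeric result, such as 0.0/0.0 in IEEE floating-point arithmetic; NaN can therefore occur only in a column of float or double type. Note that an empty string such as "" is neither null nor NaN: it is not a missing value.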

In [None]:
# 8.1 Handling missing values
#     First create a dataframe that mixes None (null), NaN and an
#     empty string, so that the behaviour of each can be compared

import numpy as np
from pyspark.sql import *

# Each row has its own variable name. (Reusing a name, as "row8" was
# reused before, silently discards the earlier row.)
row1 =  Row(age = 10,    height = 10.1,           income = 45.9,     name =  "aAlice")
row2 =  Row(age = 83,    height = None,           income = 45.9,     name =  "bAlice")
row3 =  Row(age = 30,    height = None,           income = np.nan,   name =  "cAlice")
row4 =  Row(age = 83,    height = float(83.1),    income = 45.9,     name =  "dAlice")
row5 =  Row(age = None,  height = None,           income = 45.9,     name =   None   )
row6 =  Row(age = None,  height = float(84.2),    income = 45.9,     name =   None   )
row7 =  Row(age = 23,    height = None,           income = np.nan,   name =   None   )
row8 =  Row(age = 11,    height = float(45.3),    income = np.nan,   name =  "eAlice")
row9 =  Row(age = 12,    height = None,           income = 45.9,     name =  "fAlice")
row10 = Row(age = 33,    height = float(np.nan),  income = 45.9,     name =  None    )
row11 = Row(age = 30,    height = float(np.nan),  income = 45.9,     name =  None    )
row12 = Row(age = 33,    height = float(33.0),    income = 45.9,     name =  ""      )  # String ""

# 8.2 Collect all twelve rows to create a DataFrame
df = spark.createDataFrame(
    [row1, row2, row3, row4, row5, row6, row7, row8, row9, row10, row11, row12]
)
df.show()

In [None]:
# 8.3 Using fill
#     A numeric fill value is applied to the numeric columns only.
#     NOTE(review): Spark's DataFrameNaFunctions.fill documentation says
#     NaN is treated like null in float/double columns, so 'income' may
#     be filled as well as 'age' and 'height' - confirm on the output.
df.na.fill(value=18).show()

# 8.4 A string fill value is applied to the string column ('name') only.
#     Note, however, that the existing value "" is not a null
#     and therefore is not filled in
df.na.fill(value="sunder").show()

In [None]:
# 8.5 Restrict the fill to the 'age' column
df.na.fill(18, subset=["age"]).show()

# 8.6 Give 'age' and 'height' a fill value each
per_column_fill = {"age": 18, "height": 87.5}
df.na.fill(per_column_fill).show()


In [None]:
# 9.0 Using replace: start from the unchanged data for comparison
df.show()

# 9.1 Replace the value 10 by 19 in every column
df.na.replace(to_replace=10, value=19).show()

# 9.2 Replace the value 10 by 19 in the 'age' column only
df.na.replace(to_replace=10, value=19, subset="age").show()

In [None]:
# 9.3 Replace 10 by 19 and 83 by 89 everywhere
df.na.replace([10,83], [19,89]).show()

# 9.4 Replace 10 by 19 and 83 by 89, but only in the 'age' column
df.na.replace([10,83], [19,89], 'age').show()

# 9.5 Same as 9.4, written with a dict of {old: new} pairs.
#     The column must be passed as subset=...: the second positional
#     parameter of replace() is `value`, which is ignored when a dict is
#     given, so a positional 'age' would leave the replacement applied
#     to all columns.
df.na.replace({10 : 19, 83 : 89 }, subset='age').show()

In [None]:
# Count the null entries in every column of df.
# when() yields a non-null value only where the column is null, and
# count() counts just those non-null values.
from pyspark.sql.functions import when, count, col
df.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    )
).show()

In [None]:
####### I am done  #######