In [1]:
# Initialization: point this kernel at the local Spark installation.
import os
import sys

# Spark installation root and the directory holding its bundled Python archives.
os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Python interpreters used by PySpark.
# In the two lines below, use /usr/bin/python2.7 if you want to use Python 2.
# NOTE(review): the two paths differ; PySpark refuses to run when driver and
# workers use different minor versions -- confirm /usr/bin/python3 is 3.6.
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"

# Put the bundled py4j and pyspark archives at the front of sys.path.
# Any existing occurrence is removed first, so re-running this cell does not
# pile up duplicate entries. The final order (pyspark.zip first, then py4j)
# matches what two plain insert(0, ...) calls produce on a fresh kernel.
for _archive in ("/py4j-0.10.7-src.zip", "/pyspark.zip"):
    _entry = os.environ["PYLIB"] + _archive
    while _entry in sys.path:
        sys.path.remove(_entry)
    sys.path.insert(0, _entry)

# NOTE: list every extra package you need here (comma-separated coordinates).
# Alternatives kept for reference:
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [2]:
# Entry point for Spark 2.x: one SparkSession with Hive support enabled.
from pyspark.sql import SparkSession

# To run on YARN, add .master("yarn") to the chain, i.e.:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
spark = (
    SparkSession.builder
    .appName("Spark SQL basic example")
    .enableHiveSupport()
    .getOrCreate()
)

# Underlying SparkContext of the session.
sc = spark.sparkContext

In [3]:
# spark.read.text loads text files and returns a DataFrame whose schema starts
# with a string column named "value", followed by partitioned columns if there
# are any.
constitution_uri = 'file:///home/talentum/test-jupyter/P2/M2/SM4/constitution.txt'
lines = spark.read.text(constitution_uri)

In [4]:
# Check the schema of the freshly loaded frame: a single string column, `value`.
lines.printSchema()

root
 |-- value: string (nullable = true)



In [5]:
# Preview the loaded text; truncate=False prints each line in full rather than
# shortening long strings.
lines.show(truncate=False)

+-------------------------------------------------------------------------+
|value                                                                    |
+-------------------------------------------------------------------------+
|We the People of the United States, in Order to form a more perfect      |
|Union, establish Justice, insure domestic Tranquility, provide for the   |
|common defence, promote the general Welfare, and secure the Blessings of |
|Liberty to ourselves and our Posterity, do ordain and establish this     |
|Constitution for the United States of America.                           |
|Article 1.  Section 1 All legislative Powers herein granted shall be     |
|vested in a Congress of the United States, which shall consist of a      |
|Senate and House of Representatives.  Section 2 The House of             |
|Representatives shall be composed of Members chosen every second Year by |
|the People of the several States, and the Electors in each State shall   |
|have the Qu

In [6]:
import pyspark.sql.functions as F

# Tokenise every line on a single space and keep the result in an array column
# named `splits`. Trailing or consecutive spaces in the text produce
# empty-string elements in that array.
space_separated = F.split(lines["value"], " ")
lines = lines.withColumn("splits", space_separated)

In [7]:
# Preview again now that `splits` exists: each line of text is shown next to
# its array of tokens, untruncated.
lines.show(truncate=False)

+-------------------------------------------------------------------------+--------------------------------------------------------------------------------------+
|value                                                                    |splits                                                                                |
+-------------------------------------------------------------------------+--------------------------------------------------------------------------------------+
|We the People of the United States, in Order to form a more perfect      |[We, the, People, of, the, United, States,, in, Order, to, form, a, more, perfect, ]  |
|Union, establish Justice, insure domestic Tranquility, provide for the   |[Union,, establish, Justice,, insure, domestic, Tranquility,, provide, for, the, ]    |
|common defence, promote the general Welfare, and secure the Blessings of |[common, defence,, promote, the, general, Welfare,, and, secure, the, Blessings, of, ]|
|Liberty to ourselves 

In [8]:
# Confirm that `splits` was added as an array-of-strings column next to `value`.
lines.printSchema()

root
 |-- value: string (nullable = true)
 |-- splits: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [9]:
# Look at the token arrays on their own. Note the empty-string elements
# (e.g. the trailing ", ]") left behind by splitting on a single space.
lines.select('splits').show(truncate=False)

+--------------------------------------------------------------------------------------+
|splits                                                                                |
+--------------------------------------------------------------------------------------+
|[We, the, People, of, the, United, States,, in, Order, to, form, a, more, perfect, ]  |
|[Union,, establish, Justice,, insure, domestic, Tranquility,, provide, for, the, ]    |
|[common, defence,, promote, the, general, Welfare,, and, secure, the, Blessings, of, ]|
|[Liberty, to, ourselves, and, our, Posterity,, do, ordain, and, establish, this, ]    |
|[Constitution, for, the, United, States, of, America., , ]                            |
|[Article, 1., , Section, 1, All, legislative, Powers, herein, granted, shall, be, ]   |
|[vested, in, a, Congress, of, the, United, States,, which, shall, consist, of, a, ]   |
|[Senate, and, House, of, Representatives., , Section, 2, The, House, of, ]            |
|[Representatives, sh

In [10]:
# Word-tokenise in DataFrame form: flatten the per-line token arrays into one
# row per token, in a single column named `word`.
# Splitting on a single space leaves empty-string tokens for trailing or
# consecutive spaces; drop them so they are not counted as a word later on.
# NOTE: this rebinds `lines` to a frame that no longer has `splits`, so the
# cell cannot be re-run on its own -- re-run from the cell that loads the text.
lines = (
    lines
    .select(F.explode(lines.splits).alias("word"))
    .where(F.col("word") != "")
)

In [11]:
# After the explode, `lines` holds a single string column, `word`.
lines.printSchema()

root
 |-- word: string (nullable = true)



In [12]:
# Count how many times each distinct token occurs; the result has the columns
# `word` and `count`. Tokens are compared as-is, so case and attached
# punctuation make separate entries (e.g. "Office." and "Union;").
wordcount = lines.groupBy('word').count()

In [13]:
# Display the first 20 rows of the counts. The rows are not sorted by count;
# add an orderBy on `count` if the most frequent words are wanted.
wordcount.show()

+------------+-----+
|        word|count|
+------------+-----+
|  Inhabitant|    3|
|       those|    6|
|Affirmation.|    1|
|        some|    1|
|     Office.|    2|
| Legislation|    1|
|    thereof.|    3|
|     highest|    3|
|      Union;|    1|
|       parts|    1|
|   construed|    4|
|  indictment|    1|
|      speedy|    1|
|jurisdiction|    3|
|    beverage|    1|
|        Day.|    1|
|        coin|    2|
|         be,|    1|
|       equal|    6|
|    Services|    1|
+------------+-----+
only showing top 20 rows

