In [1]:
import os

import findspark

# Locate the Spark installation before any pyspark import.
# Honour an existing SPARK_HOME so the notebook also runs on machines where
# Spark is not installed under /usr/local/spark/; fall back to the original
# hard-coded location when the variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/usr/local/spark/')

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col

In [4]:
# Build (or reuse) the notebook's SparkSession: local master, Hive support
# enabled so the later cells can list databases and save tables.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Read Hdp Write Hive')
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# Explicit schema for the transactions CSV, declared as (column name, Spark type)
# pairs in the order the fields are applied. Each StructField keeps its default
# nullability.
_txn_columns = [
    ('txnno', IntegerType),
    ('txndate', StringType),
    ('custno', IntegerType),
    ('amount', DoubleType),
    ('category', StringType),
    ('product', StringType),
    ('city', StringType),
    ('state', StringType),
    ('spendby', StringType),
]
txnSchema = StructType(
    [StructField(name, dtype()) for name, dtype in _txn_columns]
)

In [7]:
# Read the US transactions CSV from HDFS, treating the first line as a header
# and applying txnSchema rather than letting Spark choose the column types.
us_txns_path = 'hdfs://localhost:54310/user/hduser/HFS/Input/us_txns'
csv_reader = (
    spark.read
    .format('csv')
    .schema(txnSchema)
    .option('header', 'true')
)
ustxnDf = csv_reader.load(us_txns_path)

# Preview the first five rows without truncating long column values.
ustxnDf.show(5, truncate=False)

+-----+----------+-------+------+------------------+---------------------------------+-----------+----------+-------+
|txnno|txndate   |custno |amount|category          |product                          |city       |state     |spendby|
+-----+----------+-------+------+------------------+---------------------------------+-----------+----------+-------+
|0    |06-26-2011|4007024|40.33 |Exercise & Fitness|Cardio Machine Accessories       |Clarksville|Tennessee |credit |
|1    |05-26-2011|4006742|198.44|Exercise & Fitness|Weightlifting Gloves             |Long Beach |California|credit |
|2    |06-01-2011|4009775|5.58  |Exercise & Fitness|Weightlifting Machine Accessories|Anaheim    |California|credit |
|3    |06-05-2011|4002199|198.19|Gymnastics        |Gymnastics Rings                 |Milwaukee  |Wisconsin |credit |
|4    |12-17-2011|4002613|98.81 |Team Sports       |Field Hockey                     |Nashville  |Tennessee |credit |
+-----+----------+-------+------+------------------+----

In [8]:
# Print the column names, types and nullability of ustxnDf.
# NOTE(review): this cell was labelled "Before Imposing Schema", but the load
# cell above already passes txnSchema, so on a fresh run this prints the
# imposed schema (the recorded output matches the "after" cell) -- confirm
# whether a schema-less read was meant to be shown here.
ustxnDf.printSchema()

root
 |-- txnno: integer (nullable = true)
 |-- txndate: string (nullable = true)
 |-- custno: integer (nullable = true)
 |-- amount: double (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- spendby: string (nullable = true)



In [9]:
# After imposing the schema: print ustxnDf's column names, types and
# nullability as defined by txnSchema.
ustxnDf.printSchema()

root
 |-- txnno: integer (nullable = true)
 |-- txndate: string (nullable = true)
 |-- custno: integer (nullable = true)
 |-- amount: double (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- spendby: string (nullable = true)



# Filter condition: category = 'Exercise & Fitness' and city = 'Clarksville'

In [11]:
# Expose the loaded transactions to Spark SQL under the view name `txn`.
ustxnDf.createOrReplaceTempView('txn')

# Keep only 'Exercise & Fitness' transactions made in Clarksville.
# The same filter in DataFrame-DSL form, kept for reference:
#   ustxnDf.filter((col('category') == 'Exercise & Fitness') &
#                  (col('city') == 'Clarksville'))
fitness_clarksville_sql = """select * from txn 
                        where category = 'Exercise & Fitness'
                        and city = 'Clarksville'
                """
resDf = spark.sql(fitness_clarksville_sql)

# Display the filtered rows (default row limit and truncation).
resDf.show()

+-----+----------+-------+------+------------------+--------------------+-----------+---------+-------+
|txnno|   txndate| custno|amount|          category|             product|       city|    state|spendby|
+-----+----------+-------+------+------------------+--------------------+-----------+---------+-------+
|    0|06-26-2011|4007024| 40.33|Exercise & Fitness|Cardio Machine Ac...|Clarksville|Tennessee| credit|
|   84|05-31-2011|4000293| 49.97|Exercise & Fitness|         Stopwatches|Clarksville|Tennessee|   cash|
|  907|08-02-2011|4000899|141.76|Exercise & Fitness|      Yoga & Pilates|Clarksville|Tennessee| credit|
| 1729|05-07-2011|4009949|104.96|Exercise & Fitness|        Free Weights|Clarksville|Tennessee| credit|
| 3486|08-14-2011|4003437|  75.3|Exercise & Fitness|Weightlifting Mac...|Clarksville|Tennessee| credit|
| 3895|06-27-2011|4005463|156.13|Exercise & Fitness|      Exercise Bands|Clarksville|Tennessee| credit|
| 4576|12-11-2011|4003900|170.73|Exercise & Fitness|      Yoga &

In [5]:
# List the databases visible to this Hive-enabled session.
spark.sql('show databases').show()

+------------+
|databaseName|
+------------+
|     default|
|     saif_db|
+------------+



In [15]:
# Make saif_db the current database for the session; later unqualified table
# names are resolved against it.
spark.sql('use saif_db')
# List the tables in the current database, plus session temp views such as `txn`.
spark.sql('show tables').show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| saif_db|             emp_mgd|      false|
| saif_db|partition_emp_static|      false|
| saif_db|partition_emp_sta...|      false|
|        |                 txn|       true|
+--------+--------------------+-----------+



In [16]:
# Persist the filtered result as a table in saif_db, stored in CSV format.
# mode('overwrite') replaces any existing table of the same name: the default
# save mode raises when the table already exists, which made this cell fail on
# every re-run of the notebook.
# The table is listed by `show tables` under the lower-cased name `txntbl`.
resDf.write.format('csv').mode('overwrite').saveAsTable('saif_db.txnTbl')

In [17]:
# List the tables again to confirm the newly saved table now appears.
spark.sql('show tables').show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| saif_db|             emp_mgd|      false|
| saif_db|partition_emp_static|      false|
| saif_db|partition_emp_sta...|      false|
| saif_db|              txntbl|      false|
|        |                 txn|       true|
+--------+--------------------+-----------+



In [18]:
# Read back the saved table and show its first five rows.
# The unqualified name `txntbl` relies on the earlier `use saif_db` having set
# the current database; it is the lower-cased form of the name used when saving.
spark.sql('select * from txntbl').show(5)

+-----+----------+-------+------+------------------+--------------------+-----------+---------+-------+
|txnno|   txndate| custno|amount|          category|             product|       city|    state|spendby|
+-----+----------+-------+------+------------------+--------------------+-----------+---------+-------+
|    0|06-26-2011|4007024| 40.33|Exercise & Fitness|Cardio Machine Ac...|Clarksville|Tennessee| credit|
|   84|05-31-2011|4000293| 49.97|Exercise & Fitness|         Stopwatches|Clarksville|Tennessee|   cash|
|  907|08-02-2011|4000899|141.76|Exercise & Fitness|      Yoga & Pilates|Clarksville|Tennessee| credit|
| 1729|05-07-2011|4009949|104.96|Exercise & Fitness|        Free Weights|Clarksville|Tennessee| credit|
| 3486|08-14-2011|4003437|  75.3|Exercise & Fitness|Weightlifting Mac...|Clarksville|Tennessee| credit|
+-----+----------+-------+------+------------------+--------------------+-----------+---------+-------+
only showing top 5 rows

