In [1]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master("local[*]") \
        .getOrCreate()

In [4]:
import pandas as pd
from pyspark.sql import functions as F

In [5]:
path = "data-files/customers-tab-delimited/part-m-00000"

In [10]:
from pyspark.sql import types

schema = types.StructType([
types.StructField('id', types.IntegerType(), True),
types.StructField('first_name', types.StringType(), True), 
types.StructField('last_name', types.StringType(), True), 
types.StructField('email', types.StringType(), True), 
types.StructField('password', types.StringType(), True),
types.StructField('street', types.StringType(), True),
types.StructField('city', types.StringType(), True), 
types.StructField('state', types.StringType(), True),
types.StructField('zipcode', types.IntegerType(), True)

])

In [11]:
df =  spark.read.format('csv')\
.schema(schema)\
.option('sep', r'\t')\
.load(path)

In [12]:
df.schema

StructType([StructField('id', IntegerType(), True), StructField('first_name', StringType(), True), StructField('last_name', StringType(), True), StructField('email', StringType(), True), StructField('password', StringType(), True), StructField('street', StringType(), True), StructField('city', StringType(), True), StructField('state', StringType(), True), StructField('zipcode', IntegerType(), True)])

In [13]:
result = df.filter(df.city == "Caguas")

In [14]:
result.show()

+---+----------+---------+---------+---------+--------------------+------+-----+-------+
| id|first_name|last_name|    email| password|              street|  city|state|zipcode|
+---+----------+---------+---------+---------+--------------------+------+-----+-------+
|  3|       Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|Caguas|   PR|    725|
|  5|    Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|Caguas|   PR|    725|
|  7|   Melissa|   Wilcox|XXXXXXXXX|XXXXXXXXX|9453 High Concession|Caguas|   PR|    725|
|  9|      Mary|    Perez|XXXXXXXXX|XXXXXXXXX| 3616 Quaking Street|Caguas|   PR|    725|
| 11|      Mary|  Huffman|XXXXXXXXX|XXXXXXXXX|    3169 Stony Woods|Caguas|   PR|    725|
| 13|      Mary|  Baldwin|XXXXXXXXX|XXXXXXXXX|7922 Iron Oak Gar...|Caguas|   PR|    725|
| 16|   Tiffany|    Smith|XXXXXXXXX|XXXXXXXXX|      6651 Iron Port|Caguas|   PR|    725|
| 19| Stephanie| Mitchell|XXXXXXXXX|XXXXXXXXX|3543 Red Treasure...|Caguas|   PR|    725|
| 21|   William|Zimme

In [15]:
result.write.mode('Overwrite') \
.option("compression", "snappy") \
.orc('result/scenario3/solution')

In [16]:
cust_orc =   spark.read.format('orc')\
                .load('result/scenario3/solution/*.orc')

In [17]:
cust_orc.show()

+---+----------+---------+---------+---------+--------------------+------+-----+-------+
| id|first_name|last_name|    email| password|              street|  city|state|zipcode|
+---+----------+---------+---------+---------+--------------------+------+-----+-------+
|  3|       Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|Caguas|   PR|    725|
|  5|    Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|Caguas|   PR|    725|
|  7|   Melissa|   Wilcox|XXXXXXXXX|XXXXXXXXX|9453 High Concession|Caguas|   PR|    725|
|  9|      Mary|    Perez|XXXXXXXXX|XXXXXXXXX| 3616 Quaking Street|Caguas|   PR|    725|
| 11|      Mary|  Huffman|XXXXXXXXX|XXXXXXXXX|    3169 Stony Woods|Caguas|   PR|    725|
| 13|      Mary|  Baldwin|XXXXXXXXX|XXXXXXXXX|7922 Iron Oak Gar...|Caguas|   PR|    725|
| 16|   Tiffany|    Smith|XXXXXXXXX|XXXXXXXXX|      6651 Iron Port|Caguas|   PR|    725|
| 19| Stephanie| Mitchell|XXXXXXXXX|XXXXXXXXX|3543 Red Treasure...|Caguas|   PR|    725|
| 21|   William|Zimme