## 02-DataFrame_Basic_Operations_Filtering_Data

In [0]:
# 02-DataFrame_Basic_Operations_Filtering_Data
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise create one
# under the application name "PySparkExamples".
spark = (
    SparkSession.builder
    .appName("PySparkExamples")
    .getOrCreate()
)

In [0]:
# Load the Apple stock CSV.
#   header=True      -> take the column names from the first line of the file
#   inferSchema=True -> have the CSV reader derive column types from the data
# inferSchema is an option of the CSV reader; spark.read.json works out the
# schema on its own and does not need this flag.
df = spark.read.csv(
    'dbfs:/FileStore/tables/appl_stock.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

# One-line summary: column names, number of columns, number of rows.
print(
    "DataFrame columns are:", df.columns,
    "with column count:", len(df.columns),
    "and with row count:", df.count(),
)

# head(2) hands back the first two rows as a Python list of Row objects,
# while show(2) prints the same rows as a formatted table.
print(df.head(2))
df.show(2)

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)

DataFrame columns are: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'] with column count: 7 and with row count: 1762
[Row(Date=datetime.date(2010, 1, 4), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039), Row(Date=datetime.date(2010, 1, 5), Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002)]
+----------+----------+----------+------------------+----------+---------+------------------+
|      Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04|213

In [0]:
# Filtering Data
# filter() accepts a SQL-style condition string. The filtered frame is built
# once and reused to compare the two ways of passing columns to select().
close_under_500 = df.filter("Close < 500")
close_under_500.show(2)
close_under_500.select(['Open', 'Close']).show(2)  # Syntax-1: column names as a list
close_under_500.select('Open', 'Close').show(2)    # Syntax-2: column names as separate arguments

+----------+----------+----------+------------------+----------+---------+------------------+
|      Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|
+----------+----------+----------+------------------+----------+---------+------------------+
only showing top 2 rows

+----------+----------+
|      Open|     Close|
+----------+----------+
|213.429998|214.009998|
|214.599998|214.379993|
+----------+----------+
only showing top 2 rows

+----------+----------+
|      Open|     Close|
+----------+----------+
|213.429998|214.009998|
|214.599998|214.379993|
+----------+----------+
only showing top 2 rows



In [0]:
# Column names are resolved case-insensitively (Spark's default), so the
# lowercase "close" below still refers to the "Close" column.
df.filter("close < 500").select('Open').show(2)

# The same kind of condition written as a Column expression instead of a SQL string.
df.filter(df["Close"] < 200).show(2)

# where() is an alias of filter(); it is shown here both after and before select().
close_lt_500 = df['close'] < 500
df.select('Open').where(close_lt_500).show(2)
df.where(close_lt_500).select('Open').show(2)

+----------+
|      Open|
+----------+
|213.429998|
|214.599998|
+----------+
only showing top 2 rows

+----------+------------------+----------+----------+----------+---------+------------------+
|      Date|              Open|      High|       Low|     Close|   Volume|         Adj Close|
+----------+------------------+----------+----------+----------+---------+------------------+
|2010-01-22|206.78000600000001|207.499996|    197.16|    197.75|220441900|         25.620401|
|2010-01-28|        204.930004|205.500004|198.699995|199.289995|293375600|25.819922000000002|
+----------+------------------+----------+----------+----------+---------+------------------+
only showing top 2 rows

+----------+
|      Open|
+----------+
|213.429998|
|214.599998|
+----------+
only showing top 2 rows

+----------+
|      Open|
+----------+
|213.429998|
|214.599998|
+----------+
only showing top 2 rows



In [0]:
# Exact-match condition on the "Low" column. filter() and where() are
# interchangeable, so both calls print the same single row.
low_equals_197_16 = df["Low"] == 197.16
df.filter(low_equals_197_16).show()
df.where(low_equals_197_16).show()

+----------+------------------+----------+------+------+---------+---------+
|      Date|              Open|      High|   Low| Close|   Volume|Adj Close|
+----------+------------------+----------+------+------+---------+---------+
|2010-01-22|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|
+----------+------------------+----------+------+------+---------+---------+

+----------+------------------+----------+------+------+---------+---------+
|      Date|              Open|      High|   Low| Close|   Volume|Adj Close|
+----------+------------------+----------+------+------+---------+---------+
|2010-01-22|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|
+----------+------------------+----------+------+------+---------+---------+



In [0]:
# Multiple condition filtering
# Column conditions are combined with & (and), | (or) and ~ (not).
# When written inline, each comparison must be wrapped in parentheses because
# these operators bind more tightly than < and > in Python.
close_below_200 = df["Close"] < 200
open_above_200 = df['Open'] > 200
open_below_200 = df['Open'] < 200

df.filter(close_below_200 & open_above_200).show(2)
df.filter(close_below_200 | open_above_200).show(2)
# ~ Not operator
df.filter(close_below_200 & ~open_below_200).show(2)

+----------+------------------+----------+----------+----------+---------+------------------+
|      Date|              Open|      High|       Low|     Close|   Volume|         Adj Close|
+----------+------------------+----------+----------+----------+---------+------------------+
|2010-01-22|206.78000600000001|207.499996|    197.16|    197.75|220441900|         25.620401|
|2010-01-28|        204.930004|205.500004|198.699995|199.289995|293375600|25.819922000000002|
+----------+------------------+----------+----------+----------+---------+------------------+
only showing top 2 rows

+----------+----------+----------+------------------+----------+---------+------------------+
|      Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994| 

In [0]:
# "show() to see the output <br>
# "collect()" to save the filtered output to save to a variable
# Collecting results as Python objects
result = df.filter(df["Low"] == 197.16).collect()
print(result)
print(type(result), type(result[0]))
row = result[0]
print(row)
print(result[0][2])

[Row(Date=datetime.date(2010, 1, 22), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]
<class 'list'> <class 'pyspark.sql.types.Row'>
Row(Date=datetime.date(2010, 1, 22), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)
207.499996


Rows can be converted into dictionaries by calling `asDict()`.

In [0]:
# Rows can be converted into dictionaries
# Convert the Row once and reuse the resulting dict, instead of rebuilding
# it with asDict() for every print and again for the loop.
row_dict = row.asDict()
print(len(row_dict))
print(row_dict)
print(row_dict['Volume'])   # dictionary keys are case sensitive ('volume' would raise KeyError)
# items() yields (key, value) pairs; unpack them instead of indexing the tuple.
for key, value in row_dict.items():
    print("Key: <", key, "> and Value: <", value, ">")

7
{'Date': datetime.date(2010, 1, 22), 'Open': 206.78000600000001, 'High': 207.499996, 'Low': 197.16, 'Close': 197.75, 'Volume': 220441900, 'Adj Close': 25.620401}
220441900
Key: < Date > and Value: < 2010-01-22 >
Key: < Open > and Value: < 206.78000600000001 >
Key: < High > and Value: < 207.499996 >
Key: < Low > and Value: < 197.16 >
Key: < Close > and Value: < 197.75 >
Key: < Volume > and Value: < 220441900 >
Key: < Adj Close > and Value: < 25.620401 >
