In [1]:
#import libraries 
import pandas as pd
import numpy as np

In [2]:
# Initialise findspark ahead of the pyspark imports in the following cell.
# NOTE(review): presumably this makes the local Spark installation importable
# as `pyspark` — confirm for this environment.
import findspark
findspark.init()

In [3]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType

# Start a Spark session named "SparkSQL", or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [4]:
# Load the turbidity CSV into a Spark DataFrame.
# The file is referenced by a relative filename, registered with the
# SparkContext via addFile, and then read from the path SparkFiles resolves.
from pyspark import SparkFiles
url = "turbidity_df6.csv"
spark.sparkContext.addFile(url)
df = (
    spark.read
    .options(header=True, inferSchema=True)  # first row is the header; column types are inferred
    .csv(SparkFiles.get("turbidity_df6.csv"))
)

# Preview the loaded rows
df.show()

+----------+------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+---------------+
|      Date|    LKSPOMET_ATemp|    LKSPOMET_TotPrcp|      LKSBAWQ_Temp|        LKSBAWQ_Sal|     LKSBAWQ_Depth|        LKSBAWQ_pH|      LKSBAWQ_Turb|Turbidity_Range|
+----------+------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+---------------+
|05/11/2022|           8.36875| 0.36874999999999997|12.458333333333334|0.10000000000000002|           1.21625| 7.706250000000001|           11.8125|          </=20|
|05/12/2022|10.997916666666667| 0.18020833333333333|11.485416666666666|0.10000000000000002|           1.15375|            7.6125|19.645833333333332|          </=20|
|05/13/2022|11.973958333333334| 0.06041666666666667|11.833333333333334|0.10000000000000002|1.1392708333333335| 7.602083333333333|18.802083333333332|          </=20|
|05/14/201

In [5]:
from pyspark.sql.functions import split

# Break the MM/DD/YYYY text in `Date` apart on '/'
split_col = split(df.Date, '/')

# Attach the three pieces as separate columns: element 0 is the month,
# element 1 the day and element 2 the year.
df = (
    df
    .withColumn('Month', split_col.getItem(0))
    .withColumn('Day', split_col.getItem(1))
    .withColumn('Year', split_col.getItem(2))
)
df.show()

+----------+------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+---------------+-----+---+----+
|      Date|    LKSPOMET_ATemp|    LKSPOMET_TotPrcp|      LKSBAWQ_Temp|        LKSBAWQ_Sal|     LKSBAWQ_Depth|        LKSBAWQ_pH|      LKSBAWQ_Turb|Turbidity_Range|Month|Day|Year|
+----------+------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+---------------+-----+---+----+
|05/11/2022|           8.36875| 0.36874999999999997|12.458333333333334|0.10000000000000002|           1.21625| 7.706250000000001|           11.8125|          </=20|   05| 11|2022|
|05/12/2022|10.997916666666667| 0.18020833333333333|11.485416666666666|0.10000000000000002|           1.15375|            7.6125|19.645833333333332|          </=20|   05| 12|2022|
|05/13/2022|11.973958333333334| 0.06041666666666667|11.833333333333334|0.10000000000000002|1.1392708

In [6]:
from pyspark.sql.functions import col

# Preview only the derived Month column
df.select('Month').show()

+-----+
|Month|
+-----+
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
+-----+
only showing top 20 rows



In [7]:
# Check the data type of the new Month column, which was built by splitting
# the Date string; the month filter further down compares it to string literals.
df.schema['Month'].dataType

StringType()

In [8]:
import pyspark.sql.functions as F

# Month is a zero-padded string column, so the summer months (June through
# August) are matched as the literals '06', '07' and '08'. The leading spaces
# seen in show() output are display padding only, not part of the values.
summer_values = ['06','07','08']
summer_df = df.where(F.col('Month').isin(summer_values))
summer_df.show()

+----------+------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+---------------+-----+---+----+
|      Date|    LKSPOMET_ATemp|    LKSPOMET_TotPrcp|      LKSBAWQ_Temp|        LKSBAWQ_Sal|     LKSBAWQ_Depth|        LKSBAWQ_pH|      LKSBAWQ_Turb|Turbidity_Range|Month|Day|Year|
+----------+------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+---------------+-----+---+----+
|06/01/2018| 7.012903225806452|                 0.0|16.662365591397847|                0.1|1.6734408602150537| 7.896774193548387|14.473118279569892|          </=20|   06| 01|2018|
|06/01/2019|12.393749999999999|                 0.0|15.069791666666667|0.10000000000000002|         2.0665625| 7.795833333333333|             8.875|          </=10|   06| 01|2019|
|06/01/2022|14.470833333333333|                 0.0|14.061458333333334|0.10000000000000002|1.2292708

In [9]:
# Collect the summer rows to the driver as a pandas DataFrame and save them as CSV.
# NOTE(review): to_csv is called without index=False, so the pandas row index is
# written as an extra unnamed first column — confirm downstream readers expect it.
(
    summer_df
    .toPandas()
    .to_csv('summer_turbidity_6.csv')
)