In [1]:
#import libraries 
import pandas as pd
import numpy as np

In [2]:
# Let findspark locate the Spark installation and make `pyspark` importable.
# This cell runs before the `pyspark` imports in the following cell.
import findspark
findspark.init()

In [3]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType

# Start a Spark session named "SparkSQL", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [4]:
# Load the turbidity CSV into a Spark DataFrame.
# `url` is a plain relative path (no bucket URL): the file is registered with
# the SparkContext and then read back through SparkFiles.
from pyspark import SparkFiles

url = "turbidity_df1.csv"
spark.sparkContext.addFile(url)

# The first row is the header; column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(SparkFiles.get("turbidity_df1.csv"))
)

# Preview the first rows.
df.show()

+-----+----------+-------------------+--------------+----------------+----------------+------------------+------------+--------------+-----------+-------------+-------------+---------------+----------+------------+------------+--------------+---------------+
|  _c0|      Date|               Time|LKSPOMET_ATemp|LKSPOMET_F_ATemp|LKSPOMET_TotPrcp|LKSPOMET_F_TotPrcp|LKSBAWQ_Temp|LKSBAWQ_F_Temp|LKSBAWQ_Sal|LKSBAWQ_F_Sal|LKSBAWQ_Depth|LKSBAWQ_F_Depth|LKSBAWQ_pH|LKSBAWQ_F_pH|LKSBAWQ_Turb|LKSBAWQ_F_Turb|Turbidity_Range|
+-----+----------+-------------------+--------------+----------------+----------------+------------------+------------+--------------+-----------+-------------+-------------+---------------+----------+------------+------------+--------------+---------------+
|14454|05/31/2018|2023-06-04 13:00:00|          23.9|            <0> |             0.0|              <0> |        17.2|          <0> |        0.1|         <0> |         1.49|           <0> |       7.8|        <0> |        1

In [5]:
from pyspark.sql.functions import split

# Date values look like MM/DD/YYYY; break them apart on '/'.
split_col = split(df['Date'], '/')

# Append the three pieces as string columns, in Month, Day, Year order.
df = (
    df
    .withColumn('Month', split_col.getItem(0))
    .withColumn('Day', split_col.getItem(1))
    .withColumn('Year', split_col.getItem(2))
)
df.show()

+-----+----------+-------------------+--------------+----------------+----------------+------------------+------------+--------------+-----------+-------------+-------------+---------------+----------+------------+------------+--------------+---------------+-----+---+----+
|  _c0|      Date|               Time|LKSPOMET_ATemp|LKSPOMET_F_ATemp|LKSPOMET_TotPrcp|LKSPOMET_F_TotPrcp|LKSBAWQ_Temp|LKSBAWQ_F_Temp|LKSBAWQ_Sal|LKSBAWQ_F_Sal|LKSBAWQ_Depth|LKSBAWQ_F_Depth|LKSBAWQ_pH|LKSBAWQ_F_pH|LKSBAWQ_Turb|LKSBAWQ_F_Turb|Turbidity_Range|Month|Day|Year|
+-----+----------+-------------------+--------------+----------------+----------------+------------------+------------+--------------+-----------+-------------+-------------+---------------+----------+------------+------------+--------------+---------------+-----+---+----+
|14454|05/31/2018|2023-06-04 13:00:00|          23.9|            <0> |             0.0|              <0> |        17.2|          <0> |        0.1|         <0> |         1.49|    

In [6]:
from pyspark.sql.functions import col

# Preview just the Month column to sanity-check the date split.
month_column = col('Month')
df.select(month_column).show()

+-----+
|Month|
+-----+
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
|   05|
+-----+
only showing top 20 rows



In [7]:
# Check the new column's data type.
# The result is StringType: Month holds zero-padded text such as '05',
# so any filter on it must compare against strings like '06', not integers.
df.schema['Month'].dataType

StringType()

In [9]:
import pyspark.sql.functions as F

# Keep only the summer months (June-August).
# Month is a zero-padded string column, so the values are matched as exact
# two-character strings with no surrounding whitespace.
summer_values = ['06', '07', '08']
summer_df = df.filter(df['Month'].isin(summer_values))
summer_df.show()

+-----+----------+-------------------+--------------+----------------+----------------+------------------+------------+--------------+-----------+-------------+-------------+---------------+----------+------------+------------+--------------+---------------+-----+---+----+
|  _c0|      Date|               Time|LKSPOMET_ATemp|LKSPOMET_F_ATemp|LKSPOMET_TotPrcp|LKSPOMET_F_TotPrcp|LKSBAWQ_Temp|LKSBAWQ_F_Temp|LKSBAWQ_Sal|LKSBAWQ_F_Sal|LKSBAWQ_Depth|LKSBAWQ_F_Depth|LKSBAWQ_pH|LKSBAWQ_F_pH|LKSBAWQ_Turb|LKSBAWQ_F_Turb|Turbidity_Range|Month|Day|Year|
+-----+----------+-------------------+--------------+----------------+----------------+------------------+------------+--------------+-----------+-------------+-------------+---------------+----------+------------+------------+--------------+---------------+-----+---+----+
|14498|06/01/2018|2023-06-04 00:00:00|          10.3|            <0> |             0.0|              <0> |        18.0|          <0> |        0.1|         <0> |         1.67|    

In [10]:
# Export the summer subset to CSV via pandas.
# toPandas() brings the whole Spark DataFrame into local memory first.
# index=False keeps the pandas RangeIndex out of the file: without it every
# export adds another unnamed leading column (the `_c0` column in the input
# is the same artifact from an earlier export).
summer_df.toPandas().to_csv('summer_turbidity_4.csv', index=False)

  series = series.astype(t, copy=False)
