## 01-DataFrame_Basics

In [0]:
# 01-DataFrame_Basics
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise create a new
# session with the application name "PySparkExamples".
spark = SparkSession.builder.appName("PySparkExamples").getOrCreate()

In [0]:
# Load a JSON file into a DataFrame. The path here points at DBFS; the same
# reader also accepts paths on other distributed stores such as HDFS or S3.
df = spark.read.json("dbfs:/FileStore/tables/people.json")
df.printSchema()
print("DataFrame columns are:", df.columns, "with column count:", len(df.columns), "and with row count:", df.count())
df.show()  # with a schema inferred from JSON, the columns come out in alphabetical order
df.display()  # Databricks notebook renderer: interactive table with CSV download and chart options

In [0]:
# describe() returns a new DataFrame of summary statistics; printing it shows
# only the DataFrame's schema summary, so call show() to see the values.
print(df.describe())
df.describe().show()   # with no arguments, summarises every numeric and string column
df.select('age').describe().show()
df.select('name').describe().show()  # also works on a string column (mean/stddev come back null)

In [0]:
# Defining a schema to assign appropriate data types to the DataFrame columns
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
# Describe each column explicitly: its name, Spark data type, and whether nulls are allowed.
data_schema = [
    StructField(name="age", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="gender", dataType=StringType(), nullable=True),
]
final_struc = StructType(data_schema)  # wrap the field list in a StructType
# Hand the schema to the reader up front instead of letting Spark infer it.
df = spark.read.schema(final_struc).json("dbfs:/FileStore/tables/people.json")
print("DataFrame columns are:", df.columns, "with column count:", len(df.columns))
df.printSchema()
df.show()  # columns appear in the order given in the schema

In [0]:
# Displaying data column-wise
print(df['age'])  # indexing by name gives a Column expression, not the data itself
print(type(df['age']))
print(type(df.select('age')))  # select() gives back a DataFrame
df.select('age').show()  # To display the column content use select method
df.select('name', 'age').show()    # Multiple Columns - syntax-1: names as separate arguments
df.select(['age','gender']).show()   # Multiple Columns - syntax-2: names in a single list

In [0]:
# Splitting column content based on a delimiter
# The split() is used to split a string column of the dataframe into multiple columns.
# This function is applied to the dataframe with the help of withColumn() and select().
from pyspark.sql.functions import split
# Split "name" on a single space: element 0 becomes first_name, element 1 becomes last_name.
df1 = (
    df.withColumn('first_name', split(df['name'], " ")[0])
      .withColumn('last_name', split(df['name'], " ")[1])
)
df1.show()

In [0]:
# lit() is used to add a new column to the dataframe that contains literals or some constant value
from pyspark.sql.functions import lit, col
# Keep "name" and attach the same literal string to every row under the name "expected_age".
df2 = df.select(col("name"), lit("40 years").alias("expected_age"))
df2.show()

In [0]:
# head(n) and tail(n) return a list of Row objects; first() returns a single Row.
# Fetch the first two rows once and reuse them: every head() call runs its own
# Spark job, and without an explicit ordering separate calls are not guaranteed
# to hand back the same rows.
first_rows = df.head(2)
print(first_rows)
print(first_rows[0], first_rows[0]['name'])  # a Row supports lookup by column name
print(df.tail(2))
print(df.first())

In [0]:
# Creating new columns: withColumn returns a new DataFrame with the extra column
df2 = df.withColumn('newage', df['age'])  # 'newage' is a copy of 'age'
df2.show()
df2.withColumn('double_age', df['age'] * 2).show()
df2.show()  # df2 itself is unchanged: the 'double_age' result above was never assigned

In [0]:
# Rename a column with withColumnRenamed; the renamed DataFrame is only shown here, not assigned
df.withColumnRenamed('age', 'supernewage').show()

In [0]:
# Filtering rows: where() is an alias for filter(), so both calls give the same result
df.filter(df['age'] == 30).show()
df.where(df['age'] == 30).show()

In [0]:
from pyspark.sql.functions import when
df.show()
# when(condition, value) without otherwise() yields null for rows that do not match.
# Without alias() the conditional column gets an auto-generated name.
df.select("age", "name", "gender", when(df.gender == "Male", "Strong")).show()
# alias() gives the conditional column a readable name.
df.select("age", "name", "gender", when(df.gender == "Male", "Strong").alias("expected_strength")).show()

In [0]:
from pyspark.sql.functions import when, col
# Chain when() calls for several branches; otherwise() supplies the value for rows matching none of them.
df2 = df.withColumn("salutation", when(df.gender == "Male", "Mr.")
                                 .when(df.gender == "Female", "Ms.")
                                 .otherwise("Unknown"))
df2.show()

+----+------------+------+----------+
| age|        name|gender|salutation|
+----+------------+------+----------+
|null|Michael Ryan|  Male|       Mr.|
|  30|  Andy Jones|Female|       Ms.|
|  19| Justin Cook|  Male|       Mr.|
+----+------------+------+----------+



In [0]:
# Using UDF
import pyspark.sql.functions as F
from pyspark.sql.types import *
def personSalutation(gender):
    """Return the salutation that goes with a gender value.

    Args:
        gender: Gender string taken from the DataFrame; may be None when
            the column is null.

    Returns:
        "Mr." for "Male", "Ms." for "Female", and "Unknown" for any other
        value (including None), so nulls and unexpected values are no
        longer labelled "Ms.". This matches the when()/otherwise()
        expression used earlier in this notebook.
    """
    if gender == "Male":
        return 'Mr.'
    if gender == "Female":
        return 'Ms.'
    # Null or unrecognised gender: do not guess a salutation.
    return 'Unknown'
    
# Convert to a UDF by passing in the Python function and its return type
personSalutationUDF = F.udf(personSalutation, StringType())
# Apply the UDF to the "gender" column to build the new "salutation" column
df1 = df.withColumn("salutation", personSalutationUDF("gender"))
df1.show()

+----+------------+------+----------+
| age|        name|gender|salutation|
+----+------------+------+----------+
|null|Michael Ryan|  Male|       Mr.|
|  30|  Andy Jones|Female|       Ms.|
|  19| Justin Cook|  Male|       Mr.|
+----+------------+------+----------+



In [0]:
# Using SQL
# Register the DataFrame as a temporary view so it can be queried by name
df.createOrReplaceTempView("people")
sql_results = spark.sql("SELECT * FROM people")  # spark.sql() returns a DataFrame
print(sql_results)  # prints the DataFrame's schema summary, not the rows
sql_results.show()
print(type(sql_results))
spark.sql("SELECT * FROM people WHERE age = 30").show()  # SQL form of df.filter(df['age'] == 30)