In [3]:
# Locate the local Spark installation before importing pyspark.
# NOTE(review): findspark.init() is expected to make pyspark importable (see the
# findspark docs), so it has to stay ahead of the pyspark imports below.
import findspark
findspark.init()
# The return value of find() is neither stored nor displayed by this cell.
findspark.find()

from pyspark.sql import SparkSession
# Spark column functions are referenced through the short alias `f` in later cells.
import pyspark.sql.functions as f
# Wildcard import: supplies StructType, StructField and the column types used by the schema cell.
from pyspark.sql.types import *

In [4]:
# Start the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('mycourse')
    .getOrCreate()
)

In [5]:
# Explicit schema for the course CSV, written as (column name, Spark type) pairs;
# every column is declared nullable.
custome_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('date_time', StringType()),
        ('userid', StringType()),
        ('domain', StringType()),
        ('dlbytes', IntegerType()),
        ('ulbytes', IntegerType()),
        ('clientip', StringType()),
        ('serverip', StringType()),
        ('country', StringType()),
        ('txn_time', FloatType()),
        ('http_method', StringType()),
        ('user_agent', StringType()),
        ('platform', StringType()),
    ]
])


# Read the CSV with the explicit schema; header=True treats the first line as a header row.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
data_path = r'C:\Users\alex\Desktop\PySpark Crash Course Learn Spark Quickly\1 - Introduction\2 - course-file\course_file.csv'
df = spark.read.csv(path=data_path, header=True, schema=custome_schema)

# Preview the first five rows of the loaded DataFrame.
df.show(5)

+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|68.171.236.18|Scotland|    1.32|       HTTP|Mozilla/5.0 (Wind...|   Linux|
|2023-10-03 09:43:11|9

# when() function

In [6]:
# Label each row by download size: 'large' when dlbytes exceeds 500, otherwise 'small'.
df.withColumn(
    'case_example',
    f.when(df['dlbytes'] > 500, 'large').otherwise('small'),
).show(5)

+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+------------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|case_example|
+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+------------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|       large|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|       large|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|68.171.236.18|Scotland|    1.32|

# isnull() function

In [10]:
# Flag rows without a user agent: 1 when user_agent is NULL, 0 otherwise.
# Both branches are integer literals so the new column has a single numeric type
# instead of mixing the int 1 with the string '0'.
df.withColumn(
    'isnull_example',
    f.when(f.isnull(f.col('user_agent')), 1).otherwise(0),
).show(5)

+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+--------------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|isnull_example|
+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+--------------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|             0|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|             0|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|68.171.236.18|Scotland

# Multiple conditions

In [14]:
# Combine two conditions with &: a row is 'large' only when both the download
# and the upload byte counts exceed 500000; every other row is 'small'.
df.withColumn(
    'transaction_size',
    f.when(
        (f.col('dlbytes') > 500000) & (f.col('ulbytes') > 500000),
        'large',
    ).otherwise('small'),
).show(5)

+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+----------------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|transaction_size|
+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+----------------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|           large|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|           large|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|68.171.236.1

In [16]:
# Tiered classification of a transaction by its download AND upload byte counts.
# The when() branches are checked in the order written, so the thresholds go from
# highest to lowest; rows matching none of them fall through to 'tiny'.
# The column is named 'transaction_size' (previously misspelled 'trasaction_size'),
# matching the name used by the two-way example earlier in the notebook.
df.withColumn(
    'transaction_size',
    f.when((f.col('dlbytes') > 500000) & (f.col('ulbytes') > 500000), 'large')
    .when((f.col('dlbytes') > 250000) & (f.col('ulbytes') > 250000), 'medium')
    .when((f.col('dlbytes') > 150000) & (f.col('ulbytes') > 150000), 'small')
    .otherwise('tiny'),
).show(5)

+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+---------------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|trasaction_size|
+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+---------------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|          large|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|           tiny|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|68.171.236.18|Sco

In [20]:
# case challenge
# Label a row 'Active' when txn_time is strictly greater than 1.6, else 'Inactive'.
# txn_time is declared as FloatType in the schema, while a bare Python 1.6 is a
# double literal. Comparing the two widens the column to double, where a stored
# float 1.6 becomes 1.600000023841858 and would wrongly count as > 1.6.
# Casting the threshold to float keeps the comparison in the column's own precision.
df.withColumn(
    'txn_time_activity',
    f.when(f.col('txn_time') > f.lit(1.6).cast('float'), 'Active').otherwise('Inactive'),
).show(5)

+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+-----------------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|txn_time_activity|
+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+-----------------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|           Active|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|         Inactive|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|68.171.