In [1]:
import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.window as w
import pyspark.sql.types as t

In [2]:
spark = SparkSession.builder.appName('mycourse').getOrCreate()

In [3]:
#define a custom schema
custome_schema = t.StructType([
    t.StructField('date_time',   t.StringType(), True),
    t.StructField('userid',      t.StringType(), True),
    t.StructField('domain',      t.StringType(), True),
    t.StructField('dlbytes',     t.IntegerType(), True),
    t.StructField('ulbytes',     t.IntegerType(), True),
    t.StructField('clientip',    t.StringType(), True),
    t.StructField('serverip',    t.StringType(), True),
    t.StructField('country',     t.StringType(), True),
    t.StructField('txn_time',    t.FloatType(), True),
    t.StructField('http_method', t.StringType(), True),
    t.StructField('user_agent',  t.StringType(), True),
    t.StructField('platform',    t.StringType(), True)
])

# load the data with the custom schema
data_path = r'C:\Users\alex\Desktop\PySpark Crash Course Learn Spark Quickly\1 - Introduction\2 - course-file\course_file.csv'
df = spark.read.csv(data_path, schema=custome_schema, header=True)

# show the loaded Datafarme
df.show(5)

+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|68.171.236.18|Scotland|    1.32|       HTTP|Mozilla/5.0 (Wind...|   Linux|
|2023-10-03 09:43:11|9

In [4]:
df2 = df.select('date_time')
df2.show(5)

+-------------------+
|          date_time|
+-------------------+
|2023-10-04 11:37:11|
|2023-10-04 12:17:07|
|2023-10-02 23:25:12|
|2023-10-03 09:43:11|
|2023-10-01 08:16:46|
+-------------------+
only showing top 5 rows



In [5]:
df2.withColumn('year', f.year(df2['date_time'])).show(5)

+-------------------+----+
|          date_time|year|
+-------------------+----+
|2023-10-04 11:37:11|2023|
|2023-10-04 12:17:07|2023|
|2023-10-02 23:25:12|2023|
|2023-10-03 09:43:11|2023|
|2023-10-01 08:16:46|2023|
+-------------------+----+
only showing top 5 rows



In [7]:
df2.withColumn('year', f.year(df2['date_time']))\
    .withColumn('month', f.month(df2['date_time']))\
    .withColumn('day', f.day(f.col('date_time')))\
    .withColumn('new_date', f.date_format(f.col('date_time'), 'yyyy-MM-dd')).show(5)

+-------------------+----+-----+---+----------+
|          date_time|year|month|day|  new_date|
+-------------------+----+-----+---+----------+
|2023-10-04 11:37:11|2023|   10|  4|2023-10-04|
|2023-10-04 12:17:07|2023|   10|  4|2023-10-04|
|2023-10-02 23:25:12|2023|   10|  2|2023-10-02|
|2023-10-03 09:43:11|2023|   10|  3|2023-10-03|
|2023-10-01 08:16:46|2023|   10|  1|2023-10-01|
+-------------------+----+-----+---+----------+
only showing top 5 rows



In [8]:
df2.withColumn('day_of_week', f.dayofweek(df2['date_time']))\
    .withColumn(
        'is_weekend',
        f.when(f.dayofweek(df2['date_time']).isin([6,7]), 'weekend').otherwise('weekday')
    ).show()

+-------------------+-----------+----------+
|          date_time|day_of_week|is_weekend|
+-------------------+-----------+----------+
|2023-10-04 11:37:11|          4|   weekday|
|2023-10-04 12:17:07|          4|   weekday|
|2023-10-02 23:25:12|          2|   weekday|
|2023-10-03 09:43:11|          3|   weekday|
|2023-10-01 08:16:46|          1|   weekday|
|2023-10-01 13:49:39|          1|   weekday|
|2023-10-02 02:22:30|          2|   weekday|
|2023-10-03 11:48:19|          3|   weekday|
|2023-10-02 12:45:41|          2|   weekday|
|2023-10-03 08:59:16|          3|   weekday|
|2023-10-05 21:36:53|          5|   weekday|
|2023-10-02 03:26:59|          2|   weekday|
|2023-10-04 03:09:58|          4|   weekday|
|2023-10-02 10:16:25|          2|   weekday|
|2023-10-05 02:22:40|          5|   weekday|
|2023-10-01 18:12:17|          1|   weekday|
|2023-10-01 20:09:39|          1|   weekday|
|2023-10-03 04:45:13|          3|   weekday|
|2023-10-05 18:48:05|          5|   weekday|
|2023-10-0