In [4]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=93df8ed0e75a90313a4dabf6409b0e7d3f61d8ab50e9edcb0d5c54047fdd96fc
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("mycourse").getOrCreate()

In [5]:
df = spark.read.csv('./course_file.csv', header = True)

In [6]:
df.show(2)

+-------------------+-------------+----------------+-------+-------+---------------+-------------+-------+--------+-----------+--------------------+--------+
|          date_time|       userid|          domain|dlbytes|ulbytes|       clientip|     serverip|country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+----------------+-------+-------+---------------+-------------+-------+--------+-----------+--------------------+--------+
|2023-10-04 11:37:11|7773153683656|ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180|England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|
|2023-10-04 12:17:07|1886351675683|     hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|  Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|
+-------------------+-------------+----------------+-------+-------+---------------+-------------+-------+--------+-----------+--------------------+--------+
only showing top 2 rows



In [7]:
df.printSchema()

root
 |-- date_time: string (nullable = true)
 |-- userid: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- dlbytes: string (nullable = true)
 |-- ulbytes: string (nullable = true)
 |-- clientip: string (nullable = true)
 |-- serverip: string (nullable = true)
 |-- country: string (nullable = true)
 |-- txn_time: string (nullable = true)
 |-- http_method: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- platform: string (nullable = true)



In [8]:
df.describe().show()

+-------+-------------------+--------------------+-----------------+------------------+-----------------+-----------+------------+-------+------------------+-----------+--------------------+--------+
|summary|          date_time|              userid|           domain|           dlbytes|          ulbytes|   clientip|    serverip|country|          txn_time|http_method|          user_agent|platform|
+-------+-------------------+--------------------+-----------------+------------------+-----------------+-----------+------------+-------+------------------+-----------+--------------------+--------+
|  count|             100000|              100000|           100000|            100000|           100000|     100000|      100000| 100000|            100000|     100000|              100000|  100000|
|   mean|               NULL|5.010135665120539E12|             NULL|      499993.58573|     499925.16314|       NULL|        NULL|   NULL|1.4980519999999975|       NULL|                NULL|    NULL|


In [9]:
df.count()

100000

In [10]:
df.columns

['date_time',
 'userid',
 'domain',
 'dlbytes',
 'ulbytes',
 'clientip',
 'serverip',
 'country',
 'txn_time',
 'http_method',
 'user_agent',
 'platform']

In [11]:
df.dtypes

[('date_time', 'string'),
 ('userid', 'string'),
 ('domain', 'string'),
 ('dlbytes', 'string'),
 ('ulbytes', 'string'),
 ('clientip', 'string'),
 ('serverip', 'string'),
 ('country', 'string'),
 ('txn_time', 'string'),
 ('http_method', 'string'),
 ('user_agent', 'string'),
 ('platform', 'string')]

In [12]:
countries = df.select('country','dlbytes')

In [13]:
countries.show(2)

+-------+-------+
|country|dlbytes|
+-------+-------+
|England| 872807|
|  Wales|  50898|
+-------+-------+
only showing top 2 rows



In [14]:
from pyspark.sql.types import *
#Define custom schema
custom_schema = StructType([
    StructField("date_time", StringType(), True),
    StructField("userid", StringType(), True),
    StructField("domain", StringType(), True),
    StructField("dlbytes", IntegerType(), True),
    StructField("ulbytes", IntegerType(), True),
    StructField("clientip", StringType(), True),
    StructField("serverip", StringType(), True),
    StructField("country", StringType(), True),
    StructField("txn_time", FloatType(), True),
    StructField("http_method", StringType(), True),
    StructField("user_agent", StringType(), True),
    StructField("platform", StringType(), True)
])

#Load the data with the custom schema
data_path = "/content/course_file.csv"
df = spark.read.csv(data_path, schema = custom_schema, header = True)

#Show the loaded DataFrame
df.show()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/course_file.csv.

In [None]:
df.printSchema()

In [None]:
df.fillna(
    {
        'dlbytes' : 0,
        'ulbytes' : 0,
        'country' : 'Unknown'
    }
).show()

In [None]:
df.fillna('1').show()

In [None]:
df.dropna(subset = ['country']).show()

In [None]:
from pyspark.sql import functions as F

In [None]:
average = df.agg(F.avg('dlbytes')).collect()[0][0]

In [None]:
print(average)

In [None]:
df.fillna(
    {
        'dlbytes' : average,
        'ulbytes' : 0,
        'country' : 'Unknown'
    }
).show()

In [None]:
df.show(2)

In [None]:
df.createOrReplaceTempView("dfsql")

In [None]:
spark.sql("select country, count(*) from dfsql where dlbytes > 50000 group by country ").show()

In [None]:
grouped_df_1 = df.groupBy('country').agg(
    {
        'dlbytes': 'sum',
        'ulbytes': 'avg'
    }
)

grouped_df_1.withColumnRenamed('avg(ulbytes)','AvgUL').show()

In [None]:
from pyspark.sql.functions import *

df.groupBy('country', 'platform').agg(
    sum('dlbytes').alias('DL'),
    avg('ulbytes').alias('UL'),
    count('*').alias('row count')
).show()

In [None]:
df.withColumn('totalbytes', col('dlbytes') + col('ulbytes')).show(5)

In [None]:
df.withColumn('year', year('date_time')).show()

In [None]:
df.withColumn('txntime_milliseconds', col('txn_time') * 1000).show()

In [None]:
df.withColumn('device_type',
              when(
                  col('platform').isin('ios', 'Android'), \
                  'Mobile') \
              .otherwise('desktop')).show()

In [None]:
df = df.dropDuplicates(['userid'])

In [None]:
df.groupBy('userid').count().show()

In [None]:
df.write.csv('mycsvfile')

In [None]:
df = df.coalesce(1) #only one file

In [None]:
df.write.csv('mycsvfile2')

In [None]:
df.write.json('myjsonfile')

In [15]:
from pyspark.sql.functions import when

In [16]:
df.withColumn("caseexample", when(df["dlbytes"] > 500000, "large").otherwise("small")).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|caseexample|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|      large|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|  152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|      small|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|  68.171.236.18|Scotland|  

In [17]:
from pyspark.sql.functions import isnull

In [18]:
df.withColumn("nullexample", when(isnull(df["user_agent"]),1).otherwise(0)).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|nullexample|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|          0|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|  152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|          0|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|  68.171.236.18|Scotland|  

In [19]:
df.withColumn("transactionsize",

              when((df["dlbytes"] > 500000) & (df["ulbytes"] > 500000), "large")  \

              .when((df["dlbytes"] > 250000) & (df["ulbytes"] > 250000), "medium") \

              .when((df["dlbytes"] > 150000) & (df["ulbytes"] > 150000), "small")

    .otherwise("tiny")).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+---------------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|transactionsize|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+---------------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|          large|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|  152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|           tiny|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|  68.17

In [20]:
df.withColumn("solution", when(df["txn_time"] > 1.6, "active").otherwise("inactive")).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+--------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|solution|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+--------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|  active|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|  152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|inactive|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|  68.171.236.18|Scotland|    1.32|       H

In [21]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, lead, lag, row_number, avg, sum

In [22]:
window_spec = Window.partitionBy("country").orderBy(df["dlbytes"].desc())

In [23]:
df.withColumn("rank", rank().over(window_spec)).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+-------+--------+-----------+--------------------+--------+----+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip|country|txn_time|http_method|          user_agent|platform|rank|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+-------+--------+-----------+--------------------+--------+----+
|2023-10-04 01:21:13|3144906514401|            page.com| 999946| 769910| 133.226.106.69|  43.150.184.67|  Wales|    2.73|       HTTP|Mozilla/5.0 (comp...| Windows|   1|
|2023-10-05 14:52:07|2736616479279|         watkins.com| 999912| 181045|  192.85.230.16|134.179.193.195|  Wales|    2.31|       HTTP|Mozilla/5.0 (comp...|     Mac|   2|
|2023-10-01 03:19:03|2236747991119|   edwards-patel.com| 999910| 283407|   78.178.15.26|   132.242.86.8|  Wales|    1.92|       HTTP|Mozilla/5.0 (iPod...| 

In [24]:
window_spec = Window.partitionBy("country").orderBy("date_time")

In [25]:
df.withColumn("row_number", row_number().over(window_spec)).show()

+-------------------+-------------+-------------------+-------+-------+---------------+---------------+-------+--------+-----------+--------------------+--------+----------+
|          date_time|       userid|             domain|dlbytes|ulbytes|       clientip|       serverip|country|txn_time|http_method|          user_agent|platform|row_number|
+-------------------+-------------+-------------------+-------+-------+---------------+---------------+-------+--------+-----------+--------------------+--------+----------+
|2023-10-01 00:00:01|9935183354270|   white-murray.biz| 196289| 359714|223.212.132.129| 204.235.216.11|  Wales|    1.72|       HTTP|Mozilla/5.0 (comp...|     Mac|         1|
|2023-10-01 00:00:01|0434470039855|    pritchard.co.uk| 350605| 860883|  70.236.57.196| 33.243.179.115|  Wales|     0.4|       HTTP|Mozilla/5.0 (Maci...| Android|         2|
|2023-10-01 00:00:37|0253415742384|      wheeler.co.uk| 613899|  56659| 137.71.136.215| 128.248.57.164|  Wales|    0.67|      HTTP

In [26]:
window_spec = Window.partitionBy().orderBy("date_time")

In [27]:
df.withColumn("next_bytes", lead("dlbytes").over(window_spec)).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+----------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|next_bytes|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+----------+
|2023-10-01 00:00:01|6242602789270|      ellis-hale.com|  60865| 378320|    65.219.1.13|      2.4.27.23| Ireland|    1.89|      HTTPS|Opera/9.61.(X11; ...|   Linux|    196289|
|2023-10-01 00:00:01|9935183354270|    white-murray.biz| 196289| 359714|223.212.132.129| 204.235.216.11|   Wales|    1.72|       HTTP|Mozilla/5.0 (comp...|     Mac|    350605|
|2023-10-01 00:00:01|0434470039855|     pritchard.co.uk| 350605| 860883|  70.236.57.196| 33.243.179.115|   Wales|     0.

In [28]:
window_spec = Window.partitionBy("userid")

In [29]:
df.withColumn("sumfunction", sum("dlbytes").over(window_spec)).show()

+-------------------+-------------+-------------------+-------+-------+---------------+--------------+--------+--------+-----------+--------------------+--------+-----------+
|          date_time|       userid|             domain|dlbytes|ulbytes|       clientip|      serverip| country|txn_time|http_method|          user_agent|platform|sumfunction|
+-------------------+-------------+-------------------+-------+-------+---------------+--------------+--------+--------+-----------+--------------------+--------+-----------+
|2023-10-01 11:05:24|0004910615244|         robson.org| 756778| 236550| 35.242.245.157| 85.188.139.63|Scotland|    0.83|      HTTPS|Opera/9.66.(Windo...|     iOS|  1172217.0|
|2023-10-05 11:36:59|0004910615244|         robson.org| 415439| 639504| 35.242.245.157| 85.188.139.63|Scotland|    2.37|       HTTP|Opera/9.66.(Windo...|     iOS|  1172217.0|
|2023-10-02 18:02:00|0007469702985|          smith.biz| 988049| 116727|  86.195.34.204|153.106.210.57| Ireland|    1.18|     

In [30]:
window_spec = Window.partitionBy("domain")

In [31]:
df.withColumn("avgbytes", avg("dlbytes").over(window_spec)).show()

+-------------------+-------------+------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------------+
|          date_time|       userid|            domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|         avgbytes|
+-------------------+-------------+------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------------+
|2023-10-03 12:31:47|3238665726626| adams-harding.com| 474109| 925181| 17.241.245.116|  75.184.220.53| Ireland|    1.15|       HTTP|Mozilla/5.0 (comp...| Android|         649536.0|
|2023-10-04 20:38:52|3238665726626| adams-harding.com| 824963| 350996| 17.241.245.116|  75.184.220.53| Ireland|    1.75|      HTTPS|Mozilla/5.0 (comp...|     Mac|         649536.0|
|2023-10-02 13:58:58|7615838967472|  adams-talbot.org| 996023| 196039|   83.227.84.18|  189.198

In [37]:
df.filter(   ~(df["country"] == 'Wales')   & ~(df["domain"] == "hopkins.org")  ).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|  68.171.236.18|Scotland|    1.32|       HTTP|Mozilla/5.0 (Wind...|   Linux|
|2023-10-03 09:43:11|9766845800247|           kelly.com| 212529| 811887|   13.144.79.35|  63.141.80.109| England|    2.58|      HTTPS|Opera/8.54.(X11; ...| Android|
|2023-10-0

In [40]:
countries = ["England", "Scotland"]

df.filter(df["country"].isin(countries)).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|  68.171.236.18|Scotland|    1.32|       HTTP|Mozilla/5.0 (Wind...|   Linux|
|2023-10-03 09:43:11|9766845800247|           kelly.com| 212529| 811887|   13.144.79.35|  63.141.80.109| England|    2.58|      HTTPS|Opera/8.54.(X11; ...| Android|
|2023-10-0

In [41]:
df.filter(df["country"].rlike("tland")).show()

+-------------------+-------------+--------------------+-------+-------+--------------+---------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|              domain|dlbytes|ulbytes|      clientip|       serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+--------------------+-------+-------+--------------+---------------+--------+--------+-----------+--------------------+--------+
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420| 189.30.60.163|  68.171.236.18|Scotland|    1.32|       HTTP|Mozilla/5.0 (Wind...|   Linux|
|2023-10-03 11:48:19|3400788003398|           watts.com| 342378| 715125| 142.37.27.131| 41.152.174.228|Scotland|    2.92|       HTTP|Opera/8.53.(Windo...|   Linux|
|2023-10-02 12:45:41|7007039082075|   kemp-robinson.org| 613214| 239494|80.245.249.129|198.186.190.149|Scotland|    1.78|      HTTPS|Mozilla/5.0 (X11;...|     Mac|
|2023-10-02 13:4

In [47]:
from pyspark.sql.functions import udf

In [48]:
from pyspark.sql.types import IntegerType

In [49]:
def calculate_total_bytes(dlbytes, ulbytes):
    return dlbytes + ulbytes

total_bytes_udf = udf(calculate_total_bytes, IntegerType())

df.withColumn("total_bytes", total_bytes_udf(df["dlbytes"], df["ulbytes"])).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|total_bytes|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|       NULL|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|  152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|       NULL|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|  68.171.236.18|Scotland|  

In [50]:
from pyspark.sql.types import StringType

In [55]:
def custom_text_processing(http_method):
    return http_method.lower()

custom_text_processing_udf = udf(custom_text_processing, StringType())

df.withColumn("processed_http", custom_text_processing_udf(df["http_method"])).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+--------------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|processed_http|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+--------------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|         https|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|  152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|          http|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|  68.171.236

In [61]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import round

In [62]:
def new_txn_time(txn_time):
    return txn_time*1000

new_txn_time_udf = udf(new_txn_time, DoubleType())

df.withColumn("txn_time_millisec", round(new_txn_time(df["txn_time"]),0)).show()

+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|       serverip| country|txn_time|http_method|          user_agent|platform|txn_time_millisec|
+-------------------+-------------+--------------------+-------+-------+---------------+---------------+--------+--------+-----------+--------------------+--------+-----------------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|  62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|           2330.0|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|  152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|           1200.0|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.1

In [68]:
from pyspark.sql.functions import year, month, dayofmonth, date_format

In [64]:
df2 = df.select("date_time")
df2.show()

+-------------------+
|          date_time|
+-------------------+
|2023-10-04 11:37:11|
|2023-10-04 12:17:07|
|2023-10-02 23:25:12|
|2023-10-03 09:43:11|
|2023-10-01 08:16:46|
|2023-10-01 13:49:39|
|2023-10-02 02:22:30|
|2023-10-03 11:48:19|
|2023-10-02 12:45:41|
|2023-10-03 08:59:16|
|2023-10-05 21:36:53|
|2023-10-02 03:26:59|
|2023-10-04 03:09:58|
|2023-10-02 10:16:25|
|2023-10-05 02:22:40|
|2023-10-01 18:12:17|
|2023-10-01 20:09:39|
|2023-10-03 04:45:13|
|2023-10-05 18:48:05|
|2023-10-02 13:47:58|
+-------------------+
only showing top 20 rows



In [70]:
df2.withColumn("year" , year(df2["date_time"]))\
.withColumn("month" , month(df2["date_time"]))\
.withColumn("day" , dayofmonth(df2["date_time"]))\
.withColumn("new_date", date_format(df2["date_time"], "yyyy-MM-dd"))\
.show()

+-------------------+----+-----+---+----------+
|          date_time|year|month|day|  new_date|
+-------------------+----+-----+---+----------+
|2023-10-04 11:37:11|2023|   10|  4|2023-10-04|
|2023-10-04 12:17:07|2023|   10|  4|2023-10-04|
|2023-10-02 23:25:12|2023|   10|  2|2023-10-02|
|2023-10-03 09:43:11|2023|   10|  3|2023-10-03|
|2023-10-01 08:16:46|2023|   10|  1|2023-10-01|
|2023-10-01 13:49:39|2023|   10|  1|2023-10-01|
|2023-10-02 02:22:30|2023|   10|  2|2023-10-02|
|2023-10-03 11:48:19|2023|   10|  3|2023-10-03|
|2023-10-02 12:45:41|2023|   10|  2|2023-10-02|
|2023-10-03 08:59:16|2023|   10|  3|2023-10-03|
|2023-10-05 21:36:53|2023|   10|  5|2023-10-05|
|2023-10-02 03:26:59|2023|   10|  2|2023-10-02|
|2023-10-04 03:09:58|2023|   10|  4|2023-10-04|
|2023-10-02 10:16:25|2023|   10|  2|2023-10-02|
|2023-10-05 02:22:40|2023|   10|  5|2023-10-05|
|2023-10-01 18:12:17|2023|   10|  1|2023-10-01|
|2023-10-01 20:09:39|2023|   10|  1|2023-10-01|
|2023-10-03 04:45:13|2023|   10|  3|2023

In [71]:
from pyspark.sql.functions import dayofweek, when

In [72]:
challenge = df.select("date_time")

In [75]:
step = challenge.withColumn("daynumber", dayofweek(challenge["date_time"]))

In [80]:
step.withColumn("weekend", when(step["daynumber"].between(2,6), "weekday").otherwise("weekend")).show()

+-------------------+---------+-------+
|          date_time|daynumber|weekend|
+-------------------+---------+-------+
|2023-10-04 11:37:11|        4|weekday|
|2023-10-04 12:17:07|        4|weekday|
|2023-10-02 23:25:12|        2|weekday|
|2023-10-03 09:43:11|        3|weekday|
|2023-10-01 08:16:46|        1|weekend|
|2023-10-01 13:49:39|        1|weekend|
|2023-10-02 02:22:30|        2|weekday|
|2023-10-03 11:48:19|        3|weekday|
|2023-10-02 12:45:41|        2|weekday|
|2023-10-03 08:59:16|        3|weekday|
|2023-10-05 21:36:53|        5|weekday|
|2023-10-02 03:26:59|        2|weekday|
|2023-10-04 03:09:58|        4|weekday|
|2023-10-02 10:16:25|        2|weekday|
|2023-10-05 02:22:40|        5|weekday|
|2023-10-01 18:12:17|        1|weekend|
|2023-10-01 20:09:39|        1|weekend|
|2023-10-03 04:45:13|        3|weekday|
|2023-10-05 18:48:05|        5|weekday|
|2023-10-02 13:47:58|        2|weekday|
+-------------------+---------+-------+
only showing top 20 rows



END