In [1]:
# findspark locates a local Spark installation so that pyspark can be imported.
import findspark
# Call init() before the pyspark imports below; per the findspark docs it adds
# pyspark to sys.path at runtime.
findspark.init()
# find() returns the Spark home it detected. The value is not stored, and since
# this is not the last expression of the cell it is not displayed either.
findspark.find()

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.window as w
from pyspark.sql.types import *

In [2]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('mycourse')
    .getOrCreate()
)

In [3]:
import os

# Explicit schema for the course CSV, so every column gets a declared type.
# userid stays a string: some ids start with a zero, which a numeric type
# would drop.
# NOTE: the variable keeps its original spelling ("custome") because other
# cells may reference it by this name.
custome_schema = StructType([
    StructField('date_time', StringType(), True),
    StructField('userid', StringType(), True),
    StructField('domain', StringType(), True),
    StructField('dlbytes', IntegerType(), True),
    StructField('ulbytes', IntegerType(), True),
    StructField('clientip', StringType(), True),
    StructField('serverip', StringType(), True),
    StructField('country', StringType(), True),
    StructField('txn_time', FloatType(), True),
    StructField('http_method', StringType(), True),
    StructField('user_agent', StringType(), True),
    StructField('platform', StringType(), True)
])


# Location of the course CSV. The default is the original local path; set the
# COURSE_FILE_CSV environment variable to run the notebook on another machine
# without editing this cell.
data_path = os.environ.get(
    'COURSE_FILE_CSV',
    r'C:\Users\alex\Desktop\PySpark Crash Course Learn Spark Quickly\1 - Introduction\2 - course-file\course_file.csv',
)

# Load the data with the custom schema; header=True reads the first line as
# column names rather than as a data row.
df = spark.read.csv(data_path, schema=custome_schema, header=True)

# Show the first 5 rows of the loaded DataFrame.
df.show(5)

+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|              domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+--------------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+
|2023-10-04 11:37:11|7773153683656|    ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|
|2023-10-04 12:17:07|1886351675683|         hopkins.org|  50898| 529504|  184.205.48.78|152.123.41.39|   Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|
|2023-10-02 23:25:12|1597721345356|           evans.com| 964276| 952420|  189.30.60.163|68.171.236.18|Scotland|    1.32|       HTTP|Mozilla/5.0 (Wind...|   Linux|
|2023-10-03 09:43:11|9

In [8]:
# Rows where the country is Wales AND the domain is hopkins.org (first 5 shown).
(
    df
    .where((f.col('country') == 'Wales') & (f.col('domain') == 'hopkins.org'))
    .show(5)
)

+-------------------+-------------+-----------+-------+-------+--------------+--------------+-------+--------+-----------+--------------------+--------+
|          date_time|       userid|     domain|dlbytes|ulbytes|      clientip|      serverip|country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+-----------+-------+-------+--------------+--------------+-------+--------+-----------+--------------------+--------+
|2023-10-04 12:17:07|1886351675683|hopkins.org|  50898| 529504| 184.205.48.78| 152.123.41.39|  Wales|     1.2|       HTTP|Mozilla/5.0 (Wind...| Android|
|2023-10-02 21:27:25|8517662001107|hopkins.org| 953556| 779944|191.186.99.253|175.253.185.17|  Wales|    2.63|       HTTP|Mozilla/5.0 (iPod...| Android|
|2023-10-04 08:17:21|1886351675683|hopkins.org| 614136| 800711| 184.205.48.78| 152.123.41.39|  Wales|    1.67|      HTTPS|Mozilla/5.0 (Wind...|   Linux|
+-------------------+-------------+-----------+-------+-------+--------------+----

In [11]:
# Rows for hopkins.org from every country except Wales (first 5 shown).
# Only the country test is negated; the domain test must still hold.
(
    df
    .where((f.col('country') != 'Wales') & (f.col('domain') == 'hopkins.org'))
    .show(5)
)

+-------------------+-------------+-----------+-------+-------+--------------+--------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|     domain|dlbytes|ulbytes|      clientip|      serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+-----------+-------+-------+--------------+--------------+--------+--------+-----------+--------------------+--------+
|2023-10-03 21:37:50|0712794005320|hopkins.org| 994213| 624035|  77.161.80.58| 11.235.43.205| Ireland|     2.9|      HTTPS|Mozilla/5.0 (Maci...| Android|
|2023-10-02 12:57:51|0172878252447|hopkins.org| 777168| 271098|  85.7.192.180|113.66.143.204|Scotland|    2.85|      HTTPS|Mozilla/5.0 (Maci...|     iOS|
|2023-10-03 20:33:59|0712794005320|hopkins.org| 627746| 960874|  77.161.80.58| 11.235.43.205|Scotland|    2.23|      HTTPS|Mozilla/5.0 (Maci...| Android|
|2023-10-01 20:05:38|0172878252447|hopkins.org| 380849| 170122|  85.7.192.18

In [12]:
# Keep rows whose country is any of the listed values (the SQL IN operator).
countries = ['England', 'Scotland']
(
    df
    .where(f.col('country').isin(countries))
    .show(5)
)

+-------------------+-------------+----------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|          domain|dlbytes|ulbytes|       clientip|     serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+----------------+-------+-------+---------------+-------------+--------+--------+-----------+--------------------+--------+
|2023-10-04 11:37:11|7773153683656|ryan-wells.co.uk| 872807| 741526|142.132.219.110|62.42.184.180| England|    2.33|      HTTPS|Mozilla/5.0 (Wind...| Android|
|2023-10-02 23:25:12|1597721345356|       evans.com| 964276| 952420|  189.30.60.163|68.171.236.18|Scotland|    1.32|       HTTP|Mozilla/5.0 (Wind...|   Linux|
|2023-10-03 09:43:11|9766845800247|       kelly.com| 212529| 811887|   13.144.79.35|63.141.80.109| England|    2.58|      HTTPS|Opera/8.54.(X11; ...| Android|
|2023-10-01 13:49:39|8949163845658|        low

In [13]:
# rlike is a regular-expression match (SQL RLIKE), not the SQL LIKE operator:
# the pattern may match anywhere in the value, so 'cotland' selects 'Scotland'.
(
    df
    .where(f.col('country').rlike('cotland'))
    .show(5)
)

+-------------------+-------------+------------------+-------+-------+--------------+---------------+--------+--------+-----------+--------------------+--------+
|          date_time|       userid|            domain|dlbytes|ulbytes|      clientip|       serverip| country|txn_time|http_method|          user_agent|platform|
+-------------------+-------------+------------------+-------+-------+--------------+---------------+--------+--------+-----------+--------------------+--------+
|2023-10-02 23:25:12|1597721345356|         evans.com| 964276| 952420| 189.30.60.163|  68.171.236.18|Scotland|    1.32|       HTTP|Mozilla/5.0 (Wind...|   Linux|
|2023-10-03 11:48:19|3400788003398|         watts.com| 342378| 715125| 142.37.27.131| 41.152.174.228|Scotland|    2.92|       HTTP|Opera/8.53.(Windo...|   Linux|
|2023-10-02 12:45:41|7007039082075| kemp-robinson.org| 613214| 239494|80.245.249.129|198.186.190.149|Scotland|    1.78|      HTTPS|Mozilla/5.0 (X11;...|     Mac|
|2023-10-02 13:47:58|0150443