In [1]:
import os
import pyspark
import sys

# Point both the Spark workers and the driver at the interpreter running this
# kernel, so they use the same Python installation.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

conf = pyspark.SparkConf()

# Use the getOrCreate() entry points instead of calling the constructors
# directly: constructing a second SparkContext in a live kernel raises
# "Cannot run multiple SparkContexts at once", which made this cell fail when
# re-run. getOrCreate() returns the running context/session if there is one
# and creates it otherwise.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
spark

In [2]:
# Wildcard import of the Spark SQL function helpers; the cells below rely on
# it for col, lower and split.
# NOTE(review): a star import hides where names come from and can shadow
# other names in the notebook namespace — consider importing the used
# functions explicitly once all cells have been checked.
from pyspark.sql.functions import *

In [3]:
# Read the 2015 flight summary CSV: the first row is the header and column
# types are inferred by Spark.
# NOTE(review): coalesce(5) does not raise the partition count here (the
# partition check further down reports 1); repartition(5) would be needed if
# five partitions are actually wanted — confirm intent.
df = (
    spark.read
    .option("header", True)
    .option("InferSchema", True)
    .csv("flight-data/csv/2015-summary.csv")
    .coalesce(5)
)

In [4]:
# Print the column names, types and nullability of the loaded frame.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [5]:
# Distinct origin countries, collected to the driver as a pandas DataFrame.
# NOTE(review): the alias "Coutry" looks like a typo for "Country"; it is
# kept unchanged because it is the column header of the displayed result.
(
    df
    .select(col("ORIGIN_COUNTRY_NAME").alias("Coutry"))
    .distinct()
    .toPandas()
)

Unnamed: 0,Coutry
0,Paraguay
1,Russia
2,Anguilla
3,Senegal
4,Sweden
...,...
120,Hungary
121,Pakistan
122,United Kingdom
123,Vietnam


In [6]:
# Preview: cap the frame at 20 rows before converting it to pandas for display.
df.limit(20).toPandas()

Unnamed: 0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
0,United States,Romania,15
1,United States,Croatia,1
2,United States,Ireland,344
3,Egypt,United States,15
4,United States,India,62
5,United States,Singapore,1
6,United States,Grenada,62
7,Costa Rica,United States,588
8,Senegal,United States,40
9,Moldova,United States,1


In [7]:
# Number of partitions of the RDD underlying df.
df.rdd.getNumPartitions()

1

In [8]:
# repartition() returns a new DataFrame (its repr is what this cell echoes);
# the result is not bound to any name, so df itself is left unchanged.
# NOTE(review): if the goal was to change df's partitioning, the result has
# to be assigned (e.g. to a new name) and used from there — confirm intent.
df.repartition(5)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

In [9]:
# Re-check of df's partition count after the repartition(5) cell; it is the
# same as before because that cell's result was never assigned back to df.
df.rdd.getNumPartitions()

1

In [10]:
from pyspark.sql.functions import current_date, current_timestamp

# Ten-row frame from spark.range(10), with a "today" column holding the
# current date and a "now" column holding the current timestamp.
dateDF = (
    spark.range(10)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)

# Expose the frame to Spark SQL under the view name "dateTable".
dateDF.createOrReplaceTempView("dateTable")

In [11]:
# Convert dateDF to a pandas DataFrame so the notebook renders it as a table.
dateDF.toPandas()

  series = series.astype(t, copy=False)


Unnamed: 0,id,today,now
0,0,2023-03-02,2023-03-02 19:56:04.432
1,1,2023-03-02,2023-03-02 19:56:04.432
2,2,2023-03-02,2023-03-02 19:56:04.432
3,3,2023-03-02,2023-03-02 19:56:04.432
4,4,2023-03-02,2023-03-02 19:56:04.432
5,5,2023-03-02,2023-03-02 19:56:04.432
6,6,2023-03-02,2023-03-02 19:56:04.432
7,7,2023-03-02,2023-03-02 19:56:04.432
8,8,2023-03-02,2023-03-02 19:56:04.432
9,9,2023-03-02,2023-03-02 19:56:04.432


In [15]:
from pyspark.sql.functions import date_add, date_sub

# For every row, show the date five days before and five days after "today".
dateDF.select(
    date_sub(col("today"), 5),
    date_add(col("today"), 5),
).toPandas()

Unnamed: 0,"date_sub(today, 5)","date_add(today, 5)"
0,2023-02-25,2023-03-07
1,2023-02-25,2023-03-07
2,2023-02-25,2023-03-07
3,2023-02-25,2023-03-07
4,2023-02-25,2023-03-07
5,2023-02-25,2023-03-07
6,2023-02-25,2023-03-07
7,2023-02-25,2023-03-07
8,2023-02-25,2023-03-07
9,2023-02-25,2023-03-07


In [16]:
# Convert the entire frame to pandas for display (no row limit applied).
# NOTE(review): fine for this small summary file; prefer limit()/head() on
# larger inputs so the whole dataset is not pulled to the driver.
df.toPandas()

Unnamed: 0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
0,United States,Romania,15
1,United States,Croatia,1
2,United States,Ireland,344
3,Egypt,United States,15
4,United States,India,62
...,...,...,...
251,United States,Saint Kitts and Nevis,145
252,Uruguay,United States,43
253,United States,Haiti,225
254,"Bonaire, Sint Eustatius, and Saba",United States,58


In [32]:
# Show each destination name next to its lower-cased form split on single
# spaces (column "list"). Splitting is on the space only, so punctuation stays
# attached to the tokens (e.g. "bonaire," keeps its comma).
df.select(
    col("DEST_COUNTRY_NAME"),
    split(lower(col("DEST_COUNTRY_NAME")), " ").alias("list"),
).toPandas()

Unnamed: 0,DEST_COUNTRY_NAME,list
0,United States,"[united, states]"
1,United States,"[united, states]"
2,United States,"[united, states]"
3,Egypt,[egypt]
4,United States,"[united, states]"
...,...,...
251,United States,"[united, states]"
252,Uruguay,[uruguay]
253,United States,"[united, states]"
254,"Bonaire, Sint Eustatius, and Saba","[bonaire,, sint, eustatius,, and, saba]"
