## Getting Started with PySpark and RDDs

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
    .appName("Getting Started with PySpark") \
    .master("local") \
    .getOrCreate()

In [3]:
spark

### Understanding Resilient Distributed Datasets


In [4]:
nums = [1,2,3,4,5,6,7]
num_rdd = spark.sparkContext.parallelize(nums)

In [6]:
num_rdd.collect()

[1, 2, 3, 4, 5, 6, 7]

In [17]:
minus = num_rdd.map(lambda x: x-1)

In [10]:
csv_rdd = spark.sparkContext.textFile("world_happiness_data/2020.csv")

## Reading a csv file into a DataFrame

In [19]:
df = spark.read.csv('world_happiness_data/2020.csv', header=True,inferSchema=True)

In [20]:
print("The count of the rows in the dataframe is :", df.count())

The count of the rows in the dataframe is : 153


In [21]:
print("The schema of the dataframe is :")
df.printSchema()

The schema of the dataframe is :
root
 |-- Country name: string (nullable = true)
 |-- Happiness Rank: integer (nullable = true)
 |-- Happiness score: double (nullable = true)
 |-- Upperwhisker: double (nullable = true)
 |-- Lowerwhisker: double (nullable = true)
 |-- Economy (GDP per Capita)\t: double (nullable = true)
 |-- Social support: double (nullable = true)
 |-- Healthy life expectancy: double (nullable = true)
 |-- Freedom to make life choices: double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Perceptions of corruption: double (nullable = true)



In [22]:
selected_df = df.select("Country name", "Generosity", "Happiness score")

In [23]:
selected_df.show(5)

+------------+----------+---------------+
|Country name|Generosity|Happiness score|
+------------+----------+---------------+
|     Finland|      0.16|           7.81|
|     Denmark|      0.24|           7.65|
| Switzerland|      0.27|           7.56|
|     Iceland|      0.36|            7.5|
|      Norway|      0.29|           7.49|
+------------+----------+---------------+
only showing top 5 rows



In [25]:
filtered_df = selected_df.filter(selected_df["Happiness score"] > 7.5)
filtered_df.show(5)

+------------+----------+---------------+
|Country name|Generosity|Happiness score|
+------------+----------+---------------+
|     Finland|      0.16|           7.81|
|     Denmark|      0.24|           7.65|
| Switzerland|      0.27|           7.56|
+------------+----------+---------------+



In [32]:
max_df = df.groupBy("Country name").max("Happiness score")

In [None]:
# The correlation between "Happiness score" and "Generosity"
df.corr("Happiness score", "Generosity")

0.07176263525033995

In [36]:
df.createOrReplaceTempView("table1")

In [37]:
spark.sql("SELECT * FROM table1").show(5)

+------------+--------------+---------------+------------+------------+--------------------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|Country name|Happiness Rank|Happiness score|Upperwhisker|Lowerwhisker|Economy (GDP per Capita)\t|Social support|Healthy life expectancy|Freedom to make life choices|Generosity|Perceptions of corruption|
+------------+--------------+---------------+------------+------------+--------------------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|     Finland|             1|           7.81|        7.87|        7.75|                      1.29|           1.5|                   0.96|                        0.66|      0.16|                     0.48|
|     Denmark|             2|           7.65|        7.71|        7.58|                      1.33|           1.5|                   0.98|                        0.67|      0.24|       

In [42]:
spark.sql("SELECT `Country name`, MAX(`Happiness score`) as max_happiness_score FROM table1 GROUP BY `Country name` order by max_happiness_score desc").show()

+--------------+-------------------+
|  Country name|max_happiness_score|
+--------------+-------------------+
|       Finland|               7.81|
|       Denmark|               7.65|
|   Switzerland|               7.56|
|       Iceland|                7.5|
|        Norway|               7.49|
|   Netherlands|               7.45|
|        Sweden|               7.35|
|   New Zealand|                7.3|
|       Austria|               7.29|
|    Luxembourg|               7.24|
|        Canada|               7.23|
|     Australia|               7.22|
|United Kingdom|               7.16|
|        Israel|               7.13|
|    Costa Rica|               7.12|
|       Ireland|               7.09|
|       Germany|               7.08|
| United States|               6.94|
|Czech Republic|               6.91|
|       Belgium|               6.86|
+--------------+-------------------+
only showing top 20 rows



In [59]:
query = """
SELECT `Country name`, MIN(`Happiness score`) as max_happiness_score
 FROM table1 
 GROUP BY `Country name` 
 order by max_happiness_score asc
"""

In [60]:
results_df = spark.sql(query)

In [61]:
results_df.show(5)

+--------------------+-------------------+
|        Country name|max_happiness_score|
+--------------------+-------------------+
|         Afghanistan|               2.57|
|         South Sudan|               2.82|
|            Zimbabwe|                3.3|
|              Rwanda|               3.31|
|Central African R...|               3.48|
+--------------------+-------------------+
only showing top 5 rows



In [64]:
results_df.explain(mode="simple")

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [max_happiness_score#589 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(max_happiness_score#589 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=613]
      +- HashAggregate(keys=[Country name#17], functions=[min(Happiness score#19)])
         +- Exchange hashpartitioning(Country name#17, 200), ENSURE_REQUIREMENTS, [plan_id=610]
            +- HashAggregate(keys=[Country name#17], functions=[partial_min(Happiness score#19)])
               +- FileScan csv [Country name#17,Happiness score#19] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/HeskeyAmoakoFordjour/OneDrive - AmaliTech gGmbH/DE-labs..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Country name:string,Happiness score:double>


