###### A DataFrame is an immutable data structure - you cannot change it.
###### So how are you going to process it?
###### You apply transformations.
###### Tell the driver what you want to do, and let the driver decide how to do it.

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from lib.logger import Log4j
from lib.utils import get_spark_app_config
from lib.utils import load_survey_df

In [2]:
# Fetch the app's Spark settings from the project helper, then get (or create)
# the session configured with them.
conf = get_spark_app_config()
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
# Set up the project logger (lib.logger.Log4j) for this session and record
# that the application has started.
logger = Log4j(spark)
logger.info("Starting HelloSpark")

In [4]:
# Load the sample survey file through the project helper.
survey_df = load_survey_df(spark, "data/sample.csv")

In [5]:
# Chained form of the pipeline: filter -> project -> group.
# The predicate is "Age<40" so that this cell matches the step-by-step version
# of the same pipeline later in the notebook.
# groupBy() with no aggregation evaluates to a GroupedData object, not rows.
(
    survey_df
    .where("Age<40")
    .select("Age", "Gender", "Country", "state")
    .groupBy("Country")
)

<pyspark.sql.group.GroupedData at 0x1d970f60a00>

In [7]:
# Print the loaded survey data as a text table, with show()'s defaults spelled
# out: at most 20 rows, long cell values truncated.
survey_df.show(n=20, truncate=True)

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+----------+-----------------------+---------------------+------------------+---------------+--------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+--

In [8]:
# Stop the Spark session used by the chained example.
spark.stop()

# Instead of chaining everything together, you can break the pipeline into smaller steps and inspect each one separately.

In [10]:
import findspark
findspark.init()

import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from lib.logger import Log4j
from lib.utils import get_spark_app_config
from lib.utils import load_survey_df

In [11]:
# The earlier session was stopped, so build one again from the project's
# Spark configuration for the step-by-step walkthrough.
conf = get_spark_app_config()
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [12]:
# Re-create the project logger against the new session and record the start.
logger = Log4j(spark)
logger.info("Starting HelloSpark")

In [14]:
# One named step at a time, so each intermediate result can be inspected.
survey_df = load_survey_df(spark, "data/sample.csv")

# Step 1: keep rows with Age below 40 and only the four columns of interest.
filtered_df = (
    survey_df
    .where("Age<40")
    .select("Age", "Gender", "Country", "state")
)

# Step 2: group by country; nothing is aggregated yet.
grouped_df = filtered_df.groupBy("Country")

# Step 3: count the rows in each group and print the result.
count_df = grouped_df.count()
count_df.show()

+--------------+-----+
|       Country|count|
+--------------+-----+
| United States|    4|
|        Canada|    2|
|United Kingdom|    1|
+--------------+-----+



In [15]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()