In [2]:
# findspark.init() is deliberately called before the pyspark imports below
# (presumably so that `import pyspark` can resolve the local Spark
# installation -- confirm against the environment setup).
import findspark
findspark.init()

import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Project-local helpers: Log4j builds a logger from the Spark session, and
# get_spark_app_config() supplies the configuration passed to the session builder.
from lib.logger import Log4j
from lib.utils import get_spark_app_config

In [3]:
# Fetch the application configuration from the project helper, then build a
# SparkSession from it (getOrCreate() reuses an already-active session if any).
conf = get_spark_app_config()
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [4]:
# Build the project's Log4j logger from the active session and record the
# start of the run.
logger = Log4j(spark)
logger.info("Starting HelloSpark")

Spark DataFrame - A two-dimensional data structure inspired by the pandas DataFrame. DataFrames are distributed tables with named columns and a well-defined schema, meaning each column has a specific data type. In summary: a DataFrame is a distributed table with column names and a schema.
1. How do we get the column names?

In [5]:
# Read the CSV with no reader options set; the output below shows the columns
# coming back with generated names (_c0, _c1, ...) instead of the header names.
df1 = spark.read.csv("data/sample.csv")

In [6]:
# Print the leading rows of df1 as a text table.
df1.show()

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+--------------------+--------------------+------------+----------+--------------------+--------------------+------------------+---------------+--------+
|                _c0|_c1|   _c2|           _c3|  _c4|          _c5|           _c6|      _c7|           _c8|           _c9|       _c10|        _c11|      _c12|        _c13|            _c14|      _c15|      _c16|              _c17|                _c18|                _c19|        _c20|      _c21|                _c22|                _c23|              _c24|           _c25|    _c26|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------

A CSV file is a plain-text data file, which makes it flexible but harder to read correctly.
In this CSV the first row is the header row and the remaining rows are data rows, but the DataFrameReader does not know this.
We need to pass this information to the DataFrameReader so that it reads the CSV correctly. You can use the option() method for this.

In [8]:
# header=true: treat the first line of the file as column names, not as data.
df2 = (
    spark.read
    .option("header", "true")
    .csv("data/sample.csv")
)

In [9]:
# Print the leading rows; the column names now come from the CSV header row.
df2.show()

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+----------+-----------------------+---------------------+------------------+---------------+--------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+--

We got the column names from the header row, but what about the schema?
2. How do we infer the schema? - For this, set the inferSchema option using the option() method. Spark reads a portion of the file and makes an intelligent guess about each column's data type.

In [10]:
# header=true      -> first line supplies the column names
# inferSchema=true -> let Spark guess each column's data type from the data
df3 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("data/sample.csv")
)

In [11]:
# show() prints rows only, not column types; call df3.printSchema() to inspect
# the data types chosen by inferSchema.
df3.show()

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+----------+-----------------------+---------------------+------------------+---------------+--------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+--

In [12]:
# Log the Spark configuration attached to the running context, then record the
# end of the run.
# NOTE(review): the debug string lists configured key/value pairs -- confirm no
# secrets are set in the config before sharing these logs.
conf_out = spark.sparkContext.getConf()
logger.info(conf_out.toDebugString())
logger.info("Finished HelloSpark")

In [13]:
# Shut down the session; later cells build a new one before using Spark again.
spark.stop()

# Modular Structure - Let's create a separate function for reusability

In [15]:
# Self-contained setup for the modular version of the example: the previous
# session was stopped above, so a new one is built here.
# findspark.init() is called before the pyspark imports on purpose.
import findspark
findspark.init()

import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from lib.logger import Log4j
from lib.utils import get_spark_app_config

# Build the session from the project-supplied configuration.
conf = get_spark_app_config()
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

# Build the project logger from the new session and record the start of the run.
logger = Log4j(spark)
logger.info("Starting HelloSpark")

In [16]:
# Load the survey CSV through the reusable helper from lib.utils instead of
# configuring the reader inline in the notebook.
from lib.utils import load_survey_df

# Fixed the misspelled variable name (was `suvey_df`); it is only used within
# this cell.
survey_df = load_survey_df(spark, "data/sample.csv")

# Print the leading rows to confirm the data loaded with named columns.
survey_df.show()

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+----------+-----------------------+---------------------+------------------+---------------+--------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+--

In [17]:
# Log the Spark configuration attached to the running context, then record the
# end of the run.
# NOTE(review): the debug string lists configured key/value pairs -- confirm no
# secrets are set in the config before sharing these logs.
conf_out = spark.sparkContext.getConf()
logger.info(conf_out.toDebugString())
logger.info("Finished HelloSpark")

In [18]:
# Shut down the Spark session now that the example has finished.
spark.stop()