# PySpark Intro

#### PySpark Session
- The first step is to create a Spark session

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession, the entry point to the DataFrame API.
# getOrCreate() reuses a session that is already running, otherwise it starts a new one.
# NOTE(review): no master is configured here, so Spark's default is used — presumably local mode; confirm for your environment.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/01 22:42:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [4]:
# Read the CSV with default options: the first line of the file is treated as
# data and the columns get default names (_c0, _c1, ...).
# Path normalized from '.././data/...' to '../data/...' (same file), matching
# the later read of sample_1.csv.
df_pyspark = spark.read.csv('../data/sample_1.csv')

                                                                                

In [5]:
# The repr lists column names and types only; no rows are shown.
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string]

In [6]:
# Print the rows as a text table. Note the header line (Name, age, ...) shows up as the first data row.
df_pyspark.show()

+---------+---+----------+------+
|      _c0|_c1|       _c2|   _c3|
+---------+---+----------+------+
|     Name|age|Experience|Salary|
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



- In this case, PySpark does not treat the first row as the header: it is read as a data row and the columns get default names (`_c0`, `_c1`, …)
    - Solution: `spark.read.option('header', 'true')`

In [7]:
# Re-read the same file, this time declaring that the first line holds the column names.
# inferSchema is not set, so every column is still read as a string.
df_pyspark = spark.read.option('header', 'true').csv('../data/sample_1.csv')

In [8]:
# The first line of the file is now used for the column names instead of appearing as data.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [9]:
# Confirm the object is a Spark DataFrame (pyspark.sql.dataframe.DataFrame).
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [10]:
# head(3) returns the first three rows as a list of Row objects; the values are strings here.
df_pyspark.head(3)

[Row(Name='Krish', age='31', Experience='10', Salary='30000'),
 Row(Name='Sudhanshu', age='30', Experience='8', Salary='25000'),
 Row(Name='Sunny', age='29', Experience='4', Salary='20000')]

In [13]:
# Number of partitions backing the DataFrame's underlying RDD.
# Left as the cell's last expression so the notebook displays it; no print() needed.
df_pyspark.rdd.getNumPartitions()

1


### Meta Info

In [16]:
# Print the column names, types and nullability as a tree.
# Every column is a string because the file was read without inferSchema.
df_pyspark.printSchema()  # similar to pandas' DataFrame.info()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [25]:
# The same type information as a list of (column name, type name) tuples.
df_pyspark.dtypes

[('Name', 'string'),
 ('age', 'string'),
 ('Experience', 'string'),
 ('Salary', 'string')]

In [26]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# Caveat: the columns are strings here. mean/stddev are still computed numerically,
# but min/max compare the values as text, which is why max(Experience) is
# reported as 8 even though a row with Experience 10 exists.
df_pyspark.describe().show()

[Stage 9:>                                                          (0 + 1) / 1]

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|                8|             30000|
+-------+------+------------------+-----------------+------------------+



                                                                                

### Columns

#### Select Columns

In [17]:
# Column names as a plain Python list.
df_pyspark.columns

['Name', 'age', 'Experience', 'Salary']

In [19]:
# select() returns a new DataFrame that holds only the "Name" column.
name_col = df_pyspark.select('Name')

In [20]:
# The selection is itself a DataFrame, not a standalone column object.
type(name_col)

pyspark.sql.dataframe.DataFrame

In [22]:
# Display the single-column DataFrame.
name_col.show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



#### Select Multiple Columns

In [23]:
# Pick several columns at once and display the resulting two-column DataFrame.
# select() accepts the column names either as a list or as separate arguments.
df_pyspark.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



#### Add Columns

In [29]:
# Add a derived column to the DataFrame; withColumn() returns a new DataFrame,
# so the result is assigned back to df_pyspark.
# Experience is a string column here (the file was read without inferSchema), so
# Spark coerces it to a floating-point number for the addition — hence 12.0
# rather than 12 in the output below.
df_pyspark = df_pyspark.withColumn('Exp After 2 Years', df_pyspark['Experience'] + 2)

In [30]:
# The new 'Exp After 2 Years' column appears as the last column.
df_pyspark.show()

+---------+---+----------+------+-----------------+
|     Name|age|Experience|Salary|Exp After 2 Years|
+---------+---+----------+------+-----------------+
|    Krish| 31|        10| 30000|             12.0|
|Sudhanshu| 30|         8| 25000|             10.0|
|    Sunny| 29|         4| 20000|              6.0|
|     Paul| 24|         3| 20000|              5.0|
|   Harsha| 21|         1| 15000|              3.0|
|  Shubham| 23|         2| 18000|              4.0|
+---------+---+----------+------+-----------------+



#### Drop Columns

In [31]:
# drop() returns a DataFrame without the named column; reassign to keep the result.
df_pyspark = df_pyspark.drop('Exp After 2 Years')

In [32]:
# Back to the original four columns.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



#### Rename Columns

In [34]:
# Rename the 'Name' column to 'First Name'; the renamed DataFrame is assigned back to df_pyspark.
df_pyspark = df_pyspark.withColumnRenamed('Name', 'First Name')

In [35]:
# The first column is now labelled 'First Name'; the data is unchanged.
df_pyspark.show()

+----------+---+----------+------+
|First Name|age|Experience|Salary|
+----------+---+----------+------+
|     Krish| 31|        10| 30000|
| Sudhanshu| 30|         8| 25000|
|     Sunny| 29|         4| 20000|
|      Paul| 24|         3| 20000|
|    Harsha| 21|         1| 15000|
|   Shubham| 23|         2| 18000|
+----------+---+----------+------+



### Handling Missing Values
- Dropping Columns
- Dropping Rows
- Various parameters of the dropping functions
- Handling Missing Values by Mean, Median & Mode

In [37]:
# Load the dataset that contains missing values.
# header=True uses the first line as column names; inferSchema=True asks Spark to
# infer each column's type instead of reading everything as strings.
# Path normalized from ".././data/..." to "../data/..." (it resolves to the same file).
df_pyspark = spark.read.csv("../data/sample_2_missing_values.csv", header=True, inferSchema=True)

In [38]:
# Missing entries are displayed as null (see the last three rows).
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|  Mahesh |null|      null| 40000|
|     null|  34|        10|  null|
|     null|  36|      null|  null|
+---------+----+----------+------+



#### Drop the Columns

In [39]:
# Drop a whole column. The result is only displayed, not assigned, so
# df_pyspark itself keeps its Name column for the cells that follow.
df_pyspark.drop("Name").show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10|  null|
|  36|      null|  null|
+----+----------+------+



#### Drop NA Rows 
- `.na.drop(how=...)` drops the rows containing `null` values

In [40]:
# With no arguments, na.drop() removes every row that contains at least one null;
# only the six complete rows remain.
df_pyspark.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [43]:
# how='any' is the default: a row is dropped if any of its values is null,
# so the result is the same as calling na.drop() with no arguments.
df_pyspark.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [41]:
# how='all' drops a row only when every value in it is null.
# No such row exists in this dataset, so all nine rows are kept.
df_pyspark.na.drop(how='all').show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|  Mahesh |null|      null| 40000|
|     null|  34|        10|  null|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [14]:
# End of the notebook: stop the Spark session.
spark.stop()