In [67]:
# %pip install pyspark  # uncomment to install PySpark into this kernel's environment

# Part 1

In [68]:
import pyspark

In [69]:
# Peek at the raw CSV with pandas first, for comparison with the Spark reads below.
import pandas as pd 
pd.read_csv('test1.csv')

Unnamed: 0,name,age
0,dave,31
1,bob,30
2,phil,29


In [70]:
# Before we can load data with Spark, we need to start a SparkSession

In [71]:
from pyspark.sql import SparkSession

In [72]:
# Build a session with app name 'Practice'; getOrCreate() reuses an existing session if one is already running.
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [73]:
spark

In [74]:
# No options given: the output below shows the header row read as data and the columns auto-named _c0, _c1.
df_pyspark = spark.read.csv('test1.csv')

In [75]:
df_pyspark

DataFrame[_c0: string, _c1: string]

In [76]:
df_pyspark.show()

+----+---+
| _c0|_c1|
+----+---+
|name|age|
|dave| 31|
| bob| 30|
|phil| 29|
+----+---+



In [77]:
# header='true' makes Spark use the first line of the file as the column names.
spark.read.option('header', 'true').csv('test1.csv').show()

+----+---+
|name|age|
+----+---+
|dave| 31|
| bob| 30|
|phil| 29|
+----+---+



In [78]:
df_pyspark = spark.read.option('header', 'true').csv('test1.csv')

In [79]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [80]:
df_pyspark.head(3)

[Row(name='dave', age='31'),
 Row(name='bob', age='30'),
 Row(name='phil', age='29')]

In [81]:
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)



# Part 2

In [82]:
from pyspark.sql import SparkSession

In [84]:
# NOTE(review): getOrCreate() returns the already-running session if Part 1 was executed in this kernel,
# in which case the 'Dataframe' app name may not take effect — confirm by inspecting `spark`.
spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [85]:
spark

In [86]:
# When Spark runs on a single machine (e.g. my own PC), the master is shown as 'local', i.e. a single node

In [87]:
## Read the data set. Per the output, the third header is 'experience ' (with a trailing space),
## so that exact name must be used whenever the column is referenced.

spark.read.option('header', 'true').csv('test1.csv')

DataFrame[name: string, age: string, experience : string]

In [88]:
spark.read.option('header', 'true').csv('test1.csv').show()

+----+---+-----------+
|name|age|experience |
+----+---+-----------+
|dave| 31|         10|
| bob| 30|          8|
|phil| 29|          4|
+----+---+-----------+



In [89]:
df_pyspark = spark.read.option('header', 'true').csv('test1.csv')

In [90]:
### check the schema

df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- experience : string (nullable = true)



In [92]:
# by default every column is read as a string

# to change this, pass inferSchema=True to the reader, e.g. `.csv('test1.csv', inferSchema=True)`

In [93]:
df_pyspark = spark.read.option('header', 'true').csv('test1.csv',inferSchema = True)

In [94]:
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience : integer (nullable = true)



In [95]:
# another way of reading data 

In [96]:
df_pyspark = spark.read.csv('test1.csv', header = True, inferSchema = True)
df_pyspark.show()

+----+---+-----------+
|name|age|experience |
+----+---+-----------+
|dave| 31|         10|
| bob| 30|          8|
|phil| 29|          4|
+----+---+-----------+



In [97]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [99]:
# i.e. what is a DataFrame? - it is a data structure made up of rows and columns

In [109]:
# how to select columns

In [100]:
df_pyspark.columns

['name', 'age', 'experience ']

In [101]:
df_pyspark.head(3)

[Row(name='dave', age=31, experience =10),
 Row(name='bob', age=30, experience =8),
 Row(name='phil', age=29, experience =4)]

In [102]:
df_pyspark.show()

+----+---+-----------+
|name|age|experience |
+----+---+-----------+
|dave| 31|         10|
| bob| 30|          8|
|phil| 29|          4|
+----+---+-----------+



In [103]:
df_pyspark.select('name')

DataFrame[name: string]

In [104]:
df_pyspark.select('name').show()

+----+
|name|
+----+
|dave|
| bob|
|phil|
+----+



In [106]:
type(df_pyspark.select('name'))

pyspark.sql.dataframe.DataFrame

In [107]:
df_pyspark.select('name', 'age')

DataFrame[name: string, age: int]

In [110]:
df_pyspark.select(['name', 'age']).show()

+----+---+
|name|age|
+----+---+
|dave| 31|
| bob| 30|
|phil| 29|
+----+---+



In [112]:
# how to check data types 

In [116]:
df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('experience ', 'int')]

In [117]:
df_pyspark.describe().show()

+-------+----+----+-----------------+
|summary|name| age|      experience |
+-------+----+----+-----------------+
|  count|   3|   3|                3|
|   mean|NULL|30.0|7.333333333333333|
| stddev|NULL| 1.0|3.055050463303893|
|    min| bob|  29|                4|
|    max|phil|  31|               10|
+-------+----+----+-----------------+



In [118]:
# adding columns

In [120]:
# withColumn returns a new DataFrame; df_pyspark itself is unchanged unless the result is assigned back.
df_pyspark.withColumn('experience after 2 years', df_pyspark['experience ']+2)

DataFrame[name: string, age: int, experience : int, experience after 2 years: int]

In [122]:
df_pyspark.withColumn('experience after 2 years', df_pyspark['experience ']+2).show()

+----+---+-----------+------------------------+
|name|age|experience |experience after 2 years|
+----+---+-----------+------------------------+
|dave| 31|         10|                      12|
| bob| 30|          8|                      10|
|phil| 29|          4|                       6|
+----+---+-----------+------------------------+



In [123]:
df_pyspark = df_pyspark.withColumn('experience after 2 years', df_pyspark['experience ']+2)

In [124]:
df_pyspark.show()

+----+---+-----------+------------------------+
|name|age|experience |experience after 2 years|
+----+---+-----------+------------------------+
|dave| 31|         10|                      12|
| bob| 30|          8|                      10|
|phil| 29|          4|                       6|
+----+---+-----------+------------------------+



In [125]:
# drop columns 

In [126]:
df_pyspark.drop('experience after 2 years').show()

+----+---+-----------+
|name|age|experience |
+----+---+-----------+
|dave| 31|         10|
| bob| 30|          8|
|phil| 29|          4|
+----+---+-----------+



In [127]:
df_pyspark = df_pyspark.drop('experience after 2 years')

In [128]:
df_pyspark.show()

+----+---+-----------+
|name|age|experience |
+----+---+-----------+
|dave| 31|         10|
| bob| 30|          8|
|phil| 29|          4|
+----+---+-----------+



In [129]:
# rename the columns 

In [130]:
df_pyspark.withColumnRenamed('name', 'new name').show()

+--------+---+-----------+
|new name|age|experience |
+--------+---+-----------+
|    dave| 31|         10|
|     bob| 30|          8|
|    phil| 29|          4|
+--------+---+-----------+



# Part 3

In [131]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [134]:
# Preview the updated file (it now has a salary column and missing values).
# .show() is required to print the rows; a bare DataFrame expression only displays its schema repr.
spark.read.csv('test1.csv', header = True, inferSchema = True).show()

+-----+----+-----------+------+
| name| age|experience |salary|
+-----+----+-----------+------+
| dave|  31|         10| 30000|
|  bob|  30|          8| 25000|
| phil|  29|          4| 20000|
|gemma|  24|       NULL| 20000|
|  rob|  51|       NULL| 15000|
|  gab|  28|       NULL| 18000|
| toby|NULL|       NULL| 40000|
| NULL|  34|         10| 38000|
| NULL|  36|       NULL|  NULL|
+-----+----+-----------+------+



In [135]:
df = spark.read.csv('test1.csv', header = True, inferSchema = True)

In [136]:
df.show()

+-----+----+-----------+------+
| name| age|experience |salary|
+-----+----+-----------+------+
| dave|  31|         10| 30000|
|  bob|  30|          8| 25000|
| phil|  29|          4| 20000|
|gemma|  24|       NULL| 20000|
|  rob|  51|       NULL| 15000|
|  gab|  28|       NULL| 18000|
| toby|NULL|       NULL| 40000|
| NULL|  34|         10| 38000|
| NULL|  36|       NULL|  NULL|
+-----+----+-----------+------+



In [137]:
# drop a column

In [138]:
df.drop('name').show()

+----+-----------+------+
| age|experience |salary|
+----+-----------+------+
|  31|         10| 30000|
|  30|          8| 25000|
|  29|          4| 20000|
|  24|       NULL| 20000|
|  51|       NULL| 15000|
|  28|       NULL| 18000|
|NULL|       NULL| 40000|
|  34|         10| 38000|
|  36|       NULL|  NULL|
+----+-----------+------+



In [140]:
# drop rows depending on nulls

In [142]:
# With no arguments, na.drop() removes every row that contains at least one null
# (equivalent to how='any', the default).

df.na.drop().show()

+----+---+-----------+------+
|name|age|experience |salary|
+----+---+-----------+------+
|dave| 31|         10| 30000|
| bob| 30|          8| 25000|
|phil| 29|          4| 20000|
+----+---+-----------+------+



In [None]:
# the `how` parameter of na.drop()

In [144]:
# how='all': a row is dropped only if ALL of its values are null
# (no row in this data is entirely null, so nothing is removed).

df.na.drop(how = 'all').show()

+-----+----+-----------+------+
| name| age|experience |salary|
+-----+----+-----------+------+
| dave|  31|         10| 30000|
|  bob|  30|          8| 25000|
| phil|  29|          4| 20000|
|gemma|  24|       NULL| 20000|
|  rob|  51|       NULL| 15000|
|  gab|  28|       NULL| 18000|
| toby|NULL|       NULL| 40000|
| NULL|  34|         10| 38000|
| NULL|  36|       NULL|  NULL|
+-----+----+-----------+------+



In [145]:
# how='any': a row is dropped if ANY of its values is null

df.na.drop(how = 'any').show()

+----+---+-----------+------+
|name|age|experience |salary|
+----+---+-----------+------+
|dave| 31|         10| 30000|
| bob| 30|          8| 25000|
|phil| 29|          4| 20000|
+----+---+-----------+------+



In [146]:
# the `thresh` parameter: keep only rows that have at least `thresh` non-null values

In [149]:
# thresh=2 keeps rows that have at least 2 non-null values. `how` is ignored whenever
# thresh is given, so it is omitted to avoid suggesting that how='all' has any effect here.
df.na.drop(thresh = 2).show()

+-----+----+-----------+------+
| name| age|experience |salary|
+-----+----+-----------+------+
| dave|  31|         10| 30000|
|  bob|  30|          8| 25000|
| phil|  29|          4| 20000|
|gemma|  24|       NULL| 20000|
|  rob|  51|       NULL| 15000|
|  gab|  28|       NULL| 18000|
| toby|NULL|       NULL| 40000|
| NULL|  34|         10| 38000|
+-----+----+-----------+------+

