In [1]:
# Point findspark at the local Spark installation so that the pyspark import
# in the next cell can be resolved.
# NOTE(review): the install path is hard-coded for this machine; confirm it
# (or rely on SPARK_HOME) before running elsewhere.
import findspark
findspark.init('/usr/local/spark/')

In [2]:
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the notebook's Spark session, running locally on all cores.
spark = (
    SparkSession.builder
    .appName('RW DFs')
    .master('local[*]')
    .getOrCreate()
)

# 1st Approach

In [17]:
# Approach 1: generic reader -- choose the format, set options one at a time,
# then load the path. The first line is treated as the header and column
# types are inferred from the data.
moviesDf = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('delimiter', ',')
    .load('hdfs://localhost:54310/user/hduser/HFS/Input/movies.csv')
)
moviesDf.show(5, truncate=False)

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows



# 2nd Approach

In [16]:
# Approach 2: pass every option in a single options(...) call and use the
# csv() shortcut instead of format(...).load(...).
moviesDf1 = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        delimiter=',',
    )
    .csv('hdfs://localhost:54310/user/hduser/HFS/Input/movies.csv')
)
moviesDf1.show(5, truncate=False)

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows



In [9]:
# Before inferSchema: load the same file with schema inference switched off so
# every column comes back as a string. A separate DataFrame is used because
# moviesDf was read with inferSchema='true'; printing moviesDf here would show
# the inferred (integer) schema on a fresh Restart & Run All.
moviesDfNoInfer = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'false')
    .option('delimiter', ',')
    .load('hdfs://localhost:54310/user/hduser/HFS/Input/movies.csv')
)
moviesDfNoInfer.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [11]:
# After inferSchema: moviesDf was loaded with inferSchema='true', so movieId
# is reported as integer rather than string.
moviesDf.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
# Write Modes:
# 1) error: If the dir is not available it will create the dir, else it will raise an error
#           (this is the default mode).
# 2) ignore: If the dir is not available it will create the dir, else it will skip the write
#           and leave the existing data untouched.
# 3) append: If the dir is not available it will create the dir, else it will append the data
#           to the existing dir.
# 4) overwrite: If the dir is not available it will create the dir, else it will overwrite the
#           existing dir.

In [37]:
# Save moviesDf as CSV, replacing anything already present at the target dir.
(
    moviesDf.write
    .format('csv')
    .mode('overwrite')
    .save('hdfs://localhost:54310/user/hduser/HFS/Output/wm_op3')
)

In [38]:
# List the output directory written by the previous cell; expect a _SUCCESS
# marker plus the part-* CSV file(s).
# NOTE(review): assumes the default HDFS filesystem is the same
# hdfs://localhost:54310 namenode used in the write path -- confirm.
!hdfs dfs -ls /user/hduser/HFS/Output/wm_op3

Found 2 items
-rw-r--r--   3 hduser supergroup          0 2024-10-22 12:34 /user/hduser/HFS/Output/wm_op3/_SUCCESS
-rw-r--r--   3 hduser supergroup     484662 2024-10-22 12:34 /user/hduser/HFS/Output/wm_op3/part-00000-70aad07c-e413-4f30-926b-9fb854e4736b-c000.csv


In [39]:
# Count the lines across all part files of wm_op3 to check how many records
# were written.
# To preview the first rows instead, pipe the same cat command to `head -5`.
!hdfs dfs -cat /user/hduser/HFS/Output/wm_op3/part* | wc -l

9742
