In [0]:
# Reading text files: every line of the file becomes one row in a single "value" column
text_path = "dbfs:/FileStore/tables/sample.txt"
text_df = spark.read.format("text").load(text_path)
# truncate=False prints each line in full instead of cutting long values short
text_df.show(truncate=False)

+-----------------------------+
|value                        |
+-----------------------------+
|Name,Age,Gender,City         |
|John,25,Male,New York        |
|Alice,30,Female,Los Angeles  |
|Bob,35,Male,Chicago          |
|Emily,28,Female,San Francisco|
|David,40,Male,Boston         |
|Sarah,32,Female,Miami        |
|Michael,45,Male,Seattle      |
+-----------------------------+



In [0]:
# Read the same CSV file through two equivalent reader syntaxes;
# in both cases the first line of the file is used as the column names.
csv_path = "dbfs:/FileStore/tables/sample_data.csv"

# Format-specific shortcut, with the header flag supplied as a reader option
csv_df = spark.read.option("header", True).csv(csv_path)

# Generic loader, with the format and header flag passed as keyword arguments
csv_df1 = spark.read.load(csv_path, format="csv", header="true")

# Render both frames so the results can be compared side by side
for csv_frame in (csv_df, csv_df1):
    display(csv_frame)

Name,age,Experience
Basheer,23,2
Krishna,22,5
Basheer,23,2
Kuldeep,25,3


Name,age,Experience
Basheer,23,2
Krishna,22,5
Basheer,23,2
Kuldeep,25,3


In [0]:
# Read a JSON file with the "multiline" reader option switched on
json_path = "dbfs:/FileStore/tables/multiline.json"
json_df = spark.read.option("multiline", "true").json(json_path)
display(json_df)

address,email,hexcolor,name,phoneNumber,userAgent
"989 Hoeger Motorway Izabellaburgh, WI 22677",urunte@swaniawski.com,#1af9a5,Edmond Weissnat,+1 (929) 622-3177,Opera/9.40 (Windows CE; sl-SI) Presto/2.11.209 Version/11.00
"86837 Dickens Fields Port Lemuel, WA 28565",lemard@parisian.org,#773865,Dr. Jamison Macejkovic IV,224.928.9760,Mozilla/5.0 (compatible; MSIE 6.0; Windows 98; Win 9x 4.90; Trident/4.0)
"9019 Powlowski Roads North Gudrun, AL 00428-2999",hessel.fausto@bradtke.com,#8b8195,Lupe Crooks,(850) 461-5436,Mozilla/5.0 (compatible; MSIE 8.0; Windows CE; Trident/3.1)
"9917 Lonie View Bayerfurt, AK 60241-0760",heller.brooks@hotmail.com,#c48c5a,Verona Harvey,+1-520-763-7772,"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_6_2 rv:2.0; sl-SI) AppleWebKit/531.36.3 (KHTML, like Gecko) Version/5.0 Safari/531.36.3"


In [0]:
# Write the CSV-backed DataFrame out as Parquet.
# mode("overwrite") replaces any output left behind by a previous run; the
# default save mode raises an error once the target directory already exists,
# which makes the cell fail on Restart & Run All.
csv_df.write.mode("overwrite").parquet('dbfs:/FileStore/tables/parquet')

In [0]:
# List the files Spark produced in the Parquet output directory
parquet_files = dbutils.fs.ls("dbfs:/FileStore/tables/parquet")
display(parquet_files)

path,name,size,modificationTime
dbfs:/FileStore/tables/parquet/_SUCCESS,_SUCCESS,0,1712915409000
dbfs:/FileStore/tables/parquet/_committed_4640827672335448792,_committed_4640827672335448792,123,1712915409000
dbfs:/FileStore/tables/parquet/_started_4640827672335448792,_started_4640827672335448792,0,1712915404000
dbfs:/FileStore/tables/parquet/part-00000-tid-4640827672335448792-c7ed62ce-687d-459b-b898-e87009fc3a63-36-1-c000.snappy.parquet,part-00000-tid-4640827672335448792-c7ed62ce-687d-459b-b898-e87009fc3a63-36-1-c000.snappy.parquet,1075,1712915408000


In [0]:
# Read the Parquet data back.
# Load the output directory rather than a single part-* file: part file names
# embed an id that is different for every write, so a hard-coded file name
# stops existing as soon as the data is written again. Spark skips the
# underscore-prefixed marker files (_SUCCESS, _committed_*, _started_*).
parquet_path = "dbfs:/FileStore/tables/parquet"
parquet_df = spark.read.format("parquet").load(parquet_path)
display(parquet_df)

Name,age,Experience
Basheer,23,2
Krishna,22,5
Basheer,23,2
Kuldeep,25,3


In [0]:
# Write ORC (Optimised Row Columnar).
# mode("overwrite") keeps the cell re-runnable: without it the write raises
# an error whenever the target directory already exists from an earlier run.
csv_df.write.mode("overwrite").orc("dbfs:/FileStore/tables/orc")

In [0]:
# List the files Spark produced in the ORC output directory
orc_files = dbutils.fs.ls("dbfs:/FileStore/tables/orc")
display(orc_files)

path,name,size,modificationTime
dbfs:/FileStore/tables/orc/_SUCCESS,_SUCCESS,0,1712916164000
dbfs:/FileStore/tables/orc/_committed_3057184157558628298,_committed_3057184157558628298,119,1712916163000
dbfs:/FileStore/tables/orc/_started_3057184157558628298,_started_3057184157558628298,0,1712916162000
dbfs:/FileStore/tables/orc/part-00000-tid-3057184157558628298-e2d4ca4f-78fc-4825-b990-8ae4061654c6-47-1-c000.snappy.orc,part-00000-tid-3057184157558628298-e2d4ca4f-78fc-4825-b990-8ae4061654c6-47-1-c000.snappy.orc,575,1712916163000


In [0]:
# Read from ORC (Optimised Row Columnar).
# The whole output directory is loaded instead of one hard-coded part-* file,
# because the part file name contains an id that changes on every write.
orc_df = spark.read.orc("dbfs:/FileStore/tables/orc")
orc_df.show()

+-------+---+----------+
|   Name|age|Experience|
+-------+---+----------+
|Basheer| 23|         2|
|Krishna| 22|         5|
|Basheer| 23|         2|
|Kuldeep| 25|         3|
+-------+---+----------+



In [0]:
# Write avro format
# Avro is a row-oriented remote procedure call and data serialization framework developed within Apache's Hadoop project.
# mode("overwrite") keeps the cell re-runnable: without it the write raises
# an error whenever the target directory already exists from an earlier run.
csv_df.write.format("avro").mode("overwrite").save("dbfs:/FileStore/tables/avro")

# Show what was written (marker files plus the part-* data file)
display(dbutils.fs.ls("dbfs:/FileStore/tables/avro"))

path,name,size,modificationTime
dbfs:/FileStore/tables/avro/_SUCCESS,_SUCCESS,0,1712916444000
dbfs:/FileStore/tables/avro/_committed_17609906668230748,_committed_17609906668230748,111,1712916444000
dbfs:/FileStore/tables/avro/_started_17609906668230748,_started_17609906668230748,0,1712916443000
dbfs:/FileStore/tables/avro/part-00000-tid-17609906668230748-ace25dc2-5694-451f-8f50-96b3d5a5d7d8-57-1-c000.avro,part-00000-tid-17609906668230748-ace25dc2-5694-451f-8f50-96b3d5a5d7d8-57-1-c000.avro,338,1712916444000


In [0]:
# Read the Avro data back.
# The whole output directory is loaded instead of one hard-coded part-* file,
# because the part file name contains an id that changes on every write.
avro_df = spark.read.format("avro").load('dbfs:/FileStore/tables/avro')
avro_df.show()

+-------+---+----------+
|   Name|age|Experience|
+-------+---+----------+
|Basheer| 23|         2|
|Krishna| 22|         5|
|Basheer| 23|         2|
|Kuldeep| 25|         3|
+-------+---+----------+



In [0]:
# Build a small DataFrame from in-memory rows
people_rows = [('basheer', 23), ('ahmed', 24), ('kuldeep', 25)]
people_columns = ["name", 'age']
df = spark.createDataFrame(people_rows, people_columns)
df.show()

+-------+---+
|   name|age|
+-------+---+
|basheer| 23|
|  ahmed| 24|
|kuldeep| 25|
+-------+---+



In [0]:
# Write the DataFrame as CSV.
# Note that the target path is a directory, not a single file: Spark writes
# one part-* file per partition into it.
# mode("overwrite") keeps the cell re-runnable: without it the write raises
# an error whenever the target directory already exists from an earlier run.
df.write.format("csv").mode("overwrite").save('dbfs:/FileStore/tables/temp.csv')

# Show what was written (marker files plus the part-* data files)
display(dbutils.fs.ls("dbfs:/FileStore/tables/temp.csv"))

path,name,size,modificationTime
dbfs:/FileStore/tables/temp.csv/_SUCCESS,_SUCCESS,0,1712916985000
dbfs:/FileStore/tables/temp.csv/_committed_1184673270642846323,_committed_1184673270642846323,376,1712916985000
dbfs:/FileStore/tables/temp.csv/_started_1184673270642846323,_started_1184673270642846323,0,1712916984000
dbfs:/FileStore/tables/temp.csv/part-00000-tid-1184673270642846323-d2309bf0-56f3-4f91-a75f-1ca1984706f9-83-1-c000.csv,part-00000-tid-1184673270642846323-d2309bf0-56f3-4f91-a75f-1ca1984706f9-83-1-c000.csv,0,1712916984000
dbfs:/FileStore/tables/temp.csv/part-00002-tid-1184673270642846323-d2309bf0-56f3-4f91-a75f-1ca1984706f9-85-1-c000.csv,part-00002-tid-1184673270642846323-d2309bf0-56f3-4f91-a75f-1ca1984706f9-85-1-c000.csv,11,1712916984000
dbfs:/FileStore/tables/temp.csv/part-00005-tid-1184673270642846323-d2309bf0-56f3-4f91-a75f-1ca1984706f9-88-1-c000.csv,part-00005-tid-1184673270642846323-d2309bf0-56f3-4f91-a75f-1ca1984706f9-88-1-c000.csv,9,1712916984000
dbfs:/FileStore/tables/temp.csv/part-00007-tid-1184673270642846323-d2309bf0-56f3-4f91-a75f-1ca1984706f9-90-1-c000.csv,part-00007-tid-1184673270642846323-d2309bf0-56f3-4f91-a75f-1ca1984706f9-90-1-c000.csv,11,1712916984000


In [0]:
# Show the contents of every non-empty CSV part file.
# The part files are discovered from a directory listing instead of being
# hard-coded: their names contain an id that changes on every write, so fixed
# names break as soon as the CSV output is regenerated.
csv_part_files = sorted(
    (
        entry
        for entry in dbutils.fs.ls("dbfs:/FileStore/tables/temp.csv")
        # skip the _SUCCESS/_committed_*/_started_* markers and empty partitions
        if entry.name.startswith("part-") and entry.size > 0
    ),
    key=lambda entry: entry.name,
)
display(*[dbutils.fs.head(entry.path) for entry in csv_part_files])

'basheer,23\n'

'ahmed,24\n'

'kuldeep,25\n'

In [0]:
# Write the CSV output as a single part file.
# coalesce(1) collapses the DataFrame to one partition, so only one part-*
# data file is produced in the target directory.
# mode("overwrite") keeps the cell re-runnable: without it the write raises
# an error whenever the target directory already exists from an earlier run.
df.coalesce(1).write.format("csv").mode("overwrite").save("dbfs:/FileStore/tables/csv_single_file")

# Show what was written (marker files plus the single part-* data file)
display(dbutils.fs.ls("dbfs:/FileStore/tables/csv_single_file"))

path,name,size,modificationTime
dbfs:/FileStore/tables/csv_single_file/_SUCCESS,_SUCCESS,0,1712917248000
dbfs:/FileStore/tables/csv_single_file/_committed_5784927779037835640,_committed_5784927779037835640,112,1712917248000
dbfs:/FileStore/tables/csv_single_file/_started_5784927779037835640,_started_5784927779037835640,0,1712917247000
dbfs:/FileStore/tables/csv_single_file/part-00000-tid-5784927779037835640-5caa7c05-348d-4d04-8227-c0f3934785ed-99-1-c000.csv,part-00000-tid-5784927779037835640-5caa7c05-348d-4d04-8227-c0f3934785ed-99-1-c000.csv,31,1712917248000


In [0]:
# Read the written CSV back.
# The whole output directory is loaded instead of one hard-coded part-* file,
# because the part file name contains an id that changes on every write.
# No header option is set here, so Spark assigns the default column names
# _c0, _c1.
csv_read_df = spark.read.csv("dbfs:/FileStore/tables/csv_single_file")
csv_read_df.show()

+-------+---+
|    _c0|_c1|
+-------+---+
|basheer| 23|
|  ahmed| 24|
|kuldeep| 25|
+-------+---+



In [0]:
# Write the DataFrame as JSON in a single part file (coalesce(1) -> one partition).
# mode("overwrite") keeps the cell re-runnable: without it the write raises
# an error whenever the target directory already exists from an earlier run.
df.coalesce(1).write.format("json").mode("overwrite").save('dbfs:/FileStore/tables/json_single_file')

# Show what was written (marker files plus the single part-* data file)
display(dbutils.fs.ls("dbfs:/FileStore/tables/json_single_file"))

path,name,size,modificationTime
dbfs:/FileStore/tables/json_single_file/_SUCCESS,_SUCCESS,0,1712920205000
dbfs:/FileStore/tables/json_single_file/_committed_3953217717203999284,_committed_3953217717203999284,114,1712920205000
dbfs:/FileStore/tables/json_single_file/_started_3953217717203999284,_started_3953217717203999284,0,1712920204000
dbfs:/FileStore/tables/json_single_file/part-00000-tid-3953217717203999284-3ab954f0-1b16-4443-b179-5f6ea74cdde9-110-1-c000.json,part-00000-tid-3953217717203999284-3ab954f0-1b16-4443-b179-5f6ea74cdde9-110-1-c000.json,82,1712920205000


In [0]:
# Read the written JSON back.
# The whole output directory is loaded instead of one hard-coded part-* file,
# because the part file name contains an id that changes on every write.
json_read_df = spark.read.json("dbfs:/FileStore/tables/json_single_file")
json_read_df.show()

+---+-------+
|age|   name|
+---+-------+
| 23|basheer|
| 24|  ahmed|
| 25|kuldeep|
+---+-------+

