In [0]:
# List the raw input files on the mount. The explicit dbfs:/ scheme matches
# the fully-qualified paths used when the CSV files are loaded.
dbutils.fs.ls("dbfs:/mnt/rawdataset")

[FileInfo(path='dbfs:/mnt/rawdataset/Athletes.csv', name='Athletes.csv', size=484927, modificationTime=1742979787000),
 FileInfo(path='dbfs:/mnt/rawdataset/Coaches.csv', name='Coaches.csv', size=19752, modificationTime=1742979802000),
 FileInfo(path='dbfs:/mnt/rawdataset/Entries.csv', name='Entries.csv', size=1491, modificationTime=1742979817000),
 FileInfo(path='dbfs:/mnt/rawdataset/Medals.csv', name='Medals.csv', size=3711, modificationTime=1742979832000),
 FileInfo(path='dbfs:/mnt/rawdataset/Teams.csv', name='Teams.csv', size=41208, modificationTime=1742979848000)]

In [0]:
# Base location of the raw CSV files (trailing slash required: file names are
# appended directly).
RAW_PATH = "dbfs:/mnt/rawdataset/"


def _read_raw_csv(file_name):
    """Load one raw CSV file from RAW_PATH into a Spark DataFrame.

    The reader is configured with ``header=true`` (first row supplies the
    column names) and ``inferSchema=true`` (column types are inferred by
    Spark instead of being declared here).

    Args:
        file_name: Name of the file inside RAW_PATH, e.g. "Athletes.csv".

    Returns:
        The DataFrame returned by the Spark CSV reader.
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(RAW_PATH + file_name)
    )


# One DataFrame per raw file; these names are used by the later cells.
athletes_df = _read_raw_csv("Athletes.csv")
coaches_df = _read_raw_csv("Coaches.csv")
entries_df = _read_raw_csv("Entries.csv")
medals_df = _read_raw_csv("Medals.csv")
teams_df = _read_raw_csv("Teams.csv")


In [0]:
# Preview the first five rows and the schema of each DataFrame, in the same
# order the files were loaded.
for frame in (athletes_df, coaches_df, entries_df, medals_df, teams_df):
    frame.show(5)
    frame.printSchema()

+-----------------+------+-------------------+
|             Name|   NOC|         Discipline|
+-----------------+------+-------------------+
|  AALERUD Katrine|Norway|       Cycling Road|
|      ABAD Nestor| Spain|Artistic Gymnastics|
|ABAGNALE Giovanni| Italy|             Rowing|
|   ABALDE Alberto| Spain|         Basketball|
|    ABALDE Tamara| Spain|         Basketball|
+-----------------+------+-------------------+
only showing top 5 rows

root
 |-- Name: string (nullable = true)
 |-- NOC: string (nullable = true)
 |-- Discipline: string (nullable = true)

+---------------+-------------+----------+-----+
|           Name|          NOC|Discipline|Event|
+---------------+-------------+----------+-----+
|ABDELMAGID Wael|        Egypt|  Football| NULL|
|      ABE Junya|        Japan|Volleyball| NULL|
|  ABE Katsuhiko|        Japan|Basketball| NULL|
|   ADAMA Cherif|Côte d'Ivoire|  Football| NULL|
|     AGEBA Yuya|        Japan|Volleyball| NULL|
+---------------+-------------+----------

In [0]:
# List the output mount. The explicit dbfs:/ scheme matches the
# fully-qualified target path used when the transformed files are written.
dbutils.fs.ls("dbfs:/mnt/transformeddataset")

[]

In [0]:
# Step 1: Base location for the transformed output (folder names are appended
# directly, so the trailing slash matters)
target_path = "dbfs:/mnt/transformeddataset/"

# Step 2: Write every DataFrame as CSV into its own folder under target_path.
# Each frame is coalesced to a single partition before writing; Spark names
# the data file 'part-*.csv' inside the folder.
datasets = (
    ("Athletes", athletes_df),
    ("Coaches", coaches_df),
    ("Entries", entries_df),
    ("Medals", medals_df),
    ("Teams", teams_df),
)
for folder, frame in datasets:
    single_partition = frame.coalesce(1)
    writer = single_partition.write.mode("overwrite").option("header", "true")
    writer.csv(target_path + folder)

print("CSV files written successfully!")

# Step 3: Rename files after writing
def rename_csv_file(folder_name, new_name, base_path=None):
    """Rename the 'part-*.csv' file inside an output folder to ``new_name``.

    The folder is listed with ``dbutils.fs.ls`` and the single entry whose
    name starts with "part-" and ends with ".csv" is moved, within the same
    folder, to ``new_name``. Other entries in the folder are left untouched.

    Args:
        folder_name: Folder (relative to the base path) to look in.
        new_name: New file name for the part file.
        base_path: Base directory that ``folder_name`` is appended to. It is
            concatenated as-is, so it must end with "/". Defaults to the
            notebook-level ``target_path``.

    Raises:
        FileNotFoundError: The folder contains no 'part-*.csv' file, so there
            is nothing to rename.
        ValueError: The folder contains more than one 'part-*.csv' file;
            moving all of them to the same destination would leave only one.
    """
    base = target_path if base_path is None else base_path
    folder = base + folder_name

    part_files = [
        info
        for info in dbutils.fs.ls(folder)
        if info.name.startswith("part-") and info.name.endswith(".csv")
    ]

    # Fail loudly instead of reporting success when nothing was renamed.
    if not part_files:
        raise FileNotFoundError(f"No part-*.csv file found in {folder}")
    # Several part files cannot share one destination name without data loss.
    if len(part_files) > 1:
        found = ", ".join(info.name for info in part_files)
        raise ValueError(
            f"Expected exactly one part-*.csv file in {folder}, found: {found}"
        )

    part = part_files[0]
    dbutils.fs.mv(part.path, folder + "/" + new_name)
    print(f"Renamed {part.name} to {new_name}")

# Give the part file in every output folder a '<Dataset>.csv' name.
for dataset in ("Athletes", "Coaches", "Entries", "Medals", "Teams"):
    rename_csv_file(dataset, f"{dataset}.csv")

print("All files renamed successfully!")


CSV files written successfully!
Renamed part-00000-tid-1364689936785503009-e9fcb526-f0f8-4e9b-bca8-c99b88624b4b-156-1-c000.csv to Athletes.csv
Renamed part-00000-tid-8085162593155327292-cb96fe8f-9cba-4f15-bdbf-31c026f5ab8d-157-1-c000.csv to Coaches.csv
Renamed part-00000-tid-1911099204342266522-823cc716-bf8d-4ea2-b503-b161b21ec5c5-158-1-c000.csv to Entries.csv
Renamed part-00000-tid-1804874808333497218-300ce626-7d86-430f-aca1-9bc178192efb-159-1-c000.csv to Medals.csv
Renamed part-00000-tid-8869482876843796884-5a937e51-5155-47d8-ae9a-7f061cc06b7b-160-1-c000.csv to Teams.csv
All files renamed successfully!
