In [14]:
#Importing Libs and Creating Session
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring,to_date,concat,lit,col,split

# Build the session through the fluent builder: getOrCreate() returns the
# session registered under the application name "DataTransformation",
# creating it when none exists yet.
spark = (
    SparkSession.builder
    .appName("DataTransformation")
    .getOrCreate()
)

In [15]:
#Reading CSV file
# The header option makes the first line of the file supply the column names.
# NOTE(review): the path is an absolute location on one user's machine; the
# notebook only runs elsewhere after this literal is adjusted.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("C:\\Users\\Ahmed\\OneDrive\\Desktop\\SingleTable\\cleanDS")
)

In [16]:
#Printing Schema
# Lists the name, type and nullability of every column in the loaded frame.
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Type: string (nullable = true)



In [17]:
#Printing first 20 rows
# Quick look at the raw data before any transformation is applied.
df.show(20)

+----------+-----+--------------------+--------------------+
|      Date| Time|            Location|                Type|
+----------+-----+--------------------+--------------------+
|01/19/1930|18:23|Oceanside, Califo...|Ford 5-AT-C Tri M...|
|03/31/1931|10:45|      Bazaar, Kansas|Fokker F10A Trimotor|
|08/31/1934|23:42|  Amazonia, Missouri|   Stinson  SM-6000B|
|05/06/1935|03:30|   Atlanta, Missouri|    Douglas DC-2-112|
|08/14/1935|23:45|  Near Gilmer, Texas|     Stinson Model A|
|10/07/1935|02:19|Near Cheyenne, Wy...|         Boeing 247D|
|01/14/1936|19:32|   Goodwin, Arkansas|    Douglas DC-2-120|
|04/07/1936|10:20|Uniontown, Pennsy...|    Douglas DC-2-112|
|12/15/1936|03:14|Near Salt Lake Ci...|         Boeing 247D|
|12/19/1936|20:47|Near Milford, Pen...|    Douglas DC-2-112|
|12/27/1936|19:38| Newhall, California|        Boeing 247-D|
|01/12/1937|11:07|Near Newhall, Cal...|         Boeing 247B|
|02/09/1937|20:50|Off San Francisco...|       Douglas DC-3A|
|03/25/1937|18:40|Clifto

In [18]:
#Reducing the Date column from MM/DD/YYYY format to just the year (YYYY) for easier grouping
# A negative start position counts from the end of the string, so this keeps
# the last four characters. For "01/19/1930" that is "1930" (the same result
# as a fixed offset of 7), and running the cell again on an already shortened
# value such as "1930" leaves it unchanged instead of producing an empty string.
df = df.withColumn("Date", substring(df["Date"], -4, 4))


In [19]:
# Preview the frame now that the Date column holds only the four-digit year
df.show()

+----+-----+--------------------+--------------------+
|Date| Time|            Location|                Type|
+----+-----+--------------------+--------------------+
|1930|18:23|Oceanside, Califo...|Ford 5-AT-C Tri M...|
|1931|10:45|      Bazaar, Kansas|Fokker F10A Trimotor|
|1934|23:42|  Amazonia, Missouri|   Stinson  SM-6000B|
|1935|03:30|   Atlanta, Missouri|    Douglas DC-2-112|
|1935|23:45|  Near Gilmer, Texas|     Stinson Model A|
|1935|02:19|Near Cheyenne, Wy...|         Boeing 247D|
|1936|19:32|   Goodwin, Arkansas|    Douglas DC-2-120|
|1936|10:20|Uniontown, Pennsy...|    Douglas DC-2-112|
|1936|03:14|Near Salt Lake Ci...|         Boeing 247D|
|1936|20:47|Near Milford, Pen...|    Douglas DC-2-112|
|1936|19:38| Newhall, California|        Boeing 247-D|
|1937|11:07|Near Newhall, Cal...|         Boeing 247B|
|1937|20:50|Off San Francisco...|       Douglas DC-3A|
|1937|18:40|Clifton, Pennsylv...|    Douglas DC-2-112|
|1937|19:25|Lakehurst, New Je...|     Zeppelin LZ-129|
|1937|04:4

In [20]:


# Count the crashes recorded for each year (the Date column holds the year here)
CrashesInYear = (
    df.groupBy("Date")
    .count()
    .withColumnRenamed("count", "CrashesInYear")
)
# Attach the per-year total to every row of the original DataFrame.
# A CrashesInYear column left over from an earlier run of this cell is dropped
# first (a no-op when it is absent); otherwise the join would leave two columns
# with the same name and later references to it would be ambiguous.
df = df.drop("CrashesInYear").join(CrashesInYear, "Date", "left_outer")
# Show the resulting DataFrame
df.show()

+----+-----+--------------------+--------------------+-------------+
|Date| Time|            Location|                Type|CrashesInYear|
+----+-----+--------------------+--------------------+-------------+
|1930|18:23|Oceanside, Califo...|Ford 5-AT-C Tri M...|            1|
|1931|10:45|      Bazaar, Kansas|Fokker F10A Trimotor|            1|
|1934|23:42|  Amazonia, Missouri|   Stinson  SM-6000B|            1|
|1935|03:30|   Atlanta, Missouri|    Douglas DC-2-112|            3|
|1935|23:45|  Near Gilmer, Texas|     Stinson Model A|            3|
|1935|02:19|Near Cheyenne, Wy...|         Boeing 247D|            3|
|1936|19:32|   Goodwin, Arkansas|    Douglas DC-2-120|            5|
|1936|10:20|Uniontown, Pennsy...|    Douglas DC-2-112|            5|
|1936|03:14|Near Salt Lake Ci...|         Boeing 247D|            5|
|1936|20:47|Near Milford, Pen...|    Douglas DC-2-112|            5|
|1936|19:38| Newhall, California|        Boeing 247-D|            5|
|1937|11:07|Near Newhall, Cal...| 

In [21]:
#Split the Location Column and Only Kept the State's Name
# The pattern is a regular expression: splitting on a comma plus any whitespace
# after it means the kept part no longer starts with a space
# ("Bazaar, Kansas" -> "Kansas" rather than " Kansas").
# NOTE(review): only the second comma-separated part is kept, so a location
# with more than two parts does not yield its last part, and a location with no
# comma has no second part at all — confirm this matches the data.
# NOTE(review): this cell overwrites Location and is not safe to run twice.
df = df.withColumn("Location", split("Location", r",\s*").getItem(1))
df.show(10)

+----+-----+-------------+--------------------+-------------+
|Date| Time|     Location|                Type|CrashesInYear|
+----+-----+-------------+--------------------+-------------+
|1930|18:23|   California|Ford 5-AT-C Tri M...|            1|
|1931|10:45|       Kansas|Fokker F10A Trimotor|            1|
|1934|23:42|     Missouri|   Stinson  SM-6000B|            1|
|1935|03:30|     Missouri|    Douglas DC-2-112|            3|
|1935|23:45|        Texas|     Stinson Model A|            3|
|1935|02:19|      Wyoming|         Boeing 247D|            3|
|1936|19:32|     Arkansas|    Douglas DC-2-120|            5|
|1936|10:20| Pennsylvania|    Douglas DC-2-112|            5|
|1936|03:14|         Utah|         Boeing 247D|            5|
|1936|20:47| Pennsylvania|    Douglas DC-2-112|            5|
+----+-----+-------------+--------------------+-------------+
only showing top 10 rows



In [22]:
# Count the crashes recorded for each value of the Location column
CrashesInPlace = (
    df.groupBy("Location")
    .count()
    .withColumnRenamed("count", "CrashesInPlace")
)
# Attach the per-location total to every row of the original DataFrame.
# A CrashesInPlace column left over from an earlier run of this cell is dropped
# first (a no-op when it is absent); otherwise the join would leave two columns
# with the same name and later references to it would be ambiguous.
df = df.drop("CrashesInPlace").join(CrashesInPlace, "Location", "left_outer")
# Show the resulting DataFrame
df.show()

+-------------+----+-----+--------------------+-------------+--------------+
|     Location|Date| Time|                Type|CrashesInYear|CrashesInPlace|
+-------------+----+-----+--------------------+-------------+--------------+
|   California|1930|18:23|Ford 5-AT-C Tri M...|            1|            43|
|       Kansas|1931|10:45|Fokker F10A Trimotor|            1|             1|
|     Missouri|1934|23:42|   Stinson  SM-6000B|            1|             9|
|     Missouri|1935|03:30|    Douglas DC-2-112|            3|             9|
|        Texas|1935|23:45|     Stinson Model A|            3|            13|
|      Wyoming|1935|02:19|         Boeing 247D|            3|             8|
|     Arkansas|1936|19:32|    Douglas DC-2-120|            5|             3|
| Pennsylvania|1936|10:20|    Douglas DC-2-112|            5|            14|
|         Utah|1936|03:14|         Boeing 247D|            5|             9|
| Pennsylvania|1936|20:47|    Douglas DC-2-112|            5|            14|

In [23]:
# Count the crashes recorded for each aircraft Type.
# Despite its name, FailureRate is a plain row count per Type, not a ratio.
FailureRate = (
    df.groupBy("Type")
    .count()
    .withColumnRenamed("count", "FailureRate")
)
# Attach the per-type total to every row of the original DataFrame.
# A FailureRate column left over from an earlier run of this cell is dropped
# first (a no-op when it is absent); otherwise the join would leave two columns
# with the same name and later references to it would be ambiguous.
df = df.drop("FailureRate").join(FailureRate, "Type", "left_outer")
# Show the resulting DataFrame
df.show(10)

+--------------------+-------------+----+-----+-------------+--------------+-----------+
|                Type|     Location|Date| Time|CrashesInYear|CrashesInPlace|FailureRate|
+--------------------+-------------+----+-----+-------------+--------------+-----------+
|Ford 5-AT-C Tri M...|   California|1930|18:23|            1|            43|          1|
|Fokker F10A Trimotor|       Kansas|1931|10:45|            1|             1|          1|
|   Stinson  SM-6000B|     Missouri|1934|23:42|            1|             9|          1|
|    Douglas DC-2-112|     Missouri|1935|03:30|            3|             9|          6|
|     Stinson Model A|        Texas|1935|23:45|            3|            13|          1|
|         Boeing 247D|      Wyoming|1935|02:19|            3|             8|          2|
|    Douglas DC-2-120|     Arkansas|1936|19:32|            5|             3|          1|
|    Douglas DC-2-112| Pennsylvania|1936|10:20|            5|            14|          6|
|         Boeing 247D

In [24]:
#Cleaning the Data and Dropping Unnecessary Columns
# Steps, in order: remove rows containing a null in any column (Time is still
# present at that point), discard the listed columns, then order by year.
# NOTE(review): no "Route" column appears in the schema printed earlier in the
# notebook, so that name presumably matches nothing here — confirm.
finalDF = (
    df.dropna()
    .drop("Route", "Time")
    .orderBy("Date", ascending=True)
)
#Displaying the Transformed Dataset
# Each source column is placed next to the count derived from it.
finalDF.select(
    "Date", "CrashesInYear",
    "Location", "CrashesInPlace",
    "Type", "FailureRate",
).show()

+----+-------------+-------------+--------------+--------------------+-----------+
|Date|CrashesInYear|     Location|CrashesInPlace|                Type|FailureRate|
+----+-------------+-------------+--------------+--------------------+-----------+
|1930|            1|   California|            43|Ford 5-AT-C Tri M...|          1|
|1931|            1|       Kansas|             1|Fokker F10A Trimotor|          1|
|1934|            1|     Missouri|             9|   Stinson  SM-6000B|          1|
|1935|            3|     Missouri|             9|    Douglas DC-2-112|          6|
|1935|            3|        Texas|            13|     Stinson Model A|          1|
|1935|            3|      Wyoming|             8|         Boeing 247D|          2|
|1936|            5|     Arkansas|             3|    Douglas DC-2-120|          1|
|1936|            5| Pennsylvania|            14|    Douglas DC-2-112|          6|
|1936|            5|         Utah|             9|         Boeing 247D|          2|
|193

In [25]:
#Splitting the new dataset to 2 separate datasets
# Equal weights request two parts of similar size; the seed is fixed at 21.
percent = [0.5, 0.5]
splitCSV = finalDF.randomSplit(percent, 21)

#Assigning the two parts of the split to Variables
DS1, DS2 = splitCSV

#Saving new datasets
# Both outputs are written with a header row and replace whatever already
# exists at the target path.
DS1.write.mode("overwrite").option("header", True).csv("C:\\Users\\Ahmed\\OneDrive\\Desktop\\SingleTable\\DS1")
DS2.write.mode("overwrite").option("header", True).csv("C:\\Users\\Ahmed\\OneDrive\\Desktop\\SingleTable\\DS2")

In [None]:
#Stopping spark session
# Final step of the notebook: cells that use `spark` cannot run after this
# until the session-creation cell is executed again.
spark.stop()