In [None]:
pip install findspark

In [None]:
pip install pyspark

In [3]:
import pyspark
from pyspark.sql import SparkSession
# Reuse the already-running SparkSession if there is one; otherwise start a new
# session with the default configuration.
spark = SparkSession.builder.getOrCreate()

# Smoke test: run a one-row SQL query and print it to confirm the session works.
df = spark.sql("select 'spark' as hello ")
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [4]:
# Display the active SparkSession object as the cell output.
spark

In [5]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
import os

# NOTE(review): sql_sc is not used by any later cell shown in this notebook; it is
# kept only so the name stays defined. SQLContext's first positional argument is a
# SparkContext, so pass the session's context and hand over the session explicitly.
sql_sc = SQLContext(spark.sparkContext, sparkSession=spark)

# Location of the trips CSV. Defaults to the original local path; set the
# AUSTIN_TRIPS_CSV environment variable to point at the file on another machine.
repository = os.environ.get(
    'AUSTIN_TRIPS_CSV',
    'C:/Users/adam_/Desktop/Adam_Work_Spark/trips_spark/Shared_Micromobility_Vehicle_Trips.csv',
)

# The first line of the file holds the column names; column types are inferred from the data.
Austin_Trips_Df = spark.read.csv(repository, header=True, inferSchema=True)


In [6]:
# Number of rows read from the CSV file.
Austin_Trips_Df.count()

2115445

In [7]:
# Show the column names and the types produced by schema inference.
Austin_Trips_Df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Device ID: string (nullable = true)
 |-- Vehicle Type: string (nullable = true)
 |-- Trip Duration: integer (nullable = true)
 |-- Trip Distance: integer (nullable = true)
 |-- Start Time: string (nullable = true)
 |-- End Time: string (nullable = true)
 |-- Modified Date: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Day of Week: integer (nullable = true)
 |-- Council District (Start): integer (nullable = true)
 |-- Council District (End): integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Census Tract Start: string (nullable = true)
 |-- Census Tract End: string (nullable = true)



# Data cleansing and transformations:

We start the cleansing by running some basic PySpark functions, as shown below:


In [8]:
 # Keep a single copy of rows that are identical in every column.
 Austin_Trips_Df = Austin_Trips_Df.dropDuplicates()

In [9]:
# Count the rows that have no null in any column. The result of dropna() is not
# assigned, so Austin_Trips_Df itself is left unchanged by this cell.
Austin_Trips_Df.dropna().count()

2115437

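We find 8 rows that contain at least one missing value (2115445 rows in total, 2115437 without nulls). These rows are not duplicates, and they are not removed here because the result of `dropna()` is not assigned.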
Next, we convert the Start Time and End Time columns from string type to timestamp type.

In [10]:
from pyspark.sql.functions import unix_timestamp
# Pattern: month/day/year, 12-hour clock, AM/PM marker.
# A single 'a' is used for the AM/PM field: Spark 3's datetime patterns require a
# letter count of 1 for it and reject 'aa', while the single letter is also
# accepted by the older parser.
pattern = 'MM/dd/yyyy hh:mm:ss a'

# Parse each string column to epoch seconds, then cast to a timestamp, replacing
# the column under its original name.
for time_column in ('Start Time', 'End Time'):
    Austin_Trips_Df = Austin_Trips_Df.withColumn(
        time_column,
        unix_timestamp(Austin_Trips_Df[time_column], pattern).cast("timestamp"),
    )

We continue the transformations by converting the trip distance from meters to kilometers and the trip duration from seconds to hours.

In [11]:
# Conversion factors: meters -> kilometers and seconds -> hours.
METERS_PER_KM = 1000
SECONDS_PER_HOUR = 3600

# Rescale each column under its original name first, then rename it so the unit
# becomes part of the column name.
Austin_Trips_Df = (
    Austin_Trips_Df
    .withColumn('Trip Distance', Austin_Trips_Df['Trip Distance'] / METERS_PER_KM)
    .withColumnRenamed('Trip Distance', 'Trip_Distance(km)')
    .withColumn('Trip Duration', Austin_Trips_Df['Trip Duration'] / SECONDS_PER_HOUR)
    .withColumnRenamed('Trip Duration', 'Trip_Duration(H)')
)

We remove the unused columns Year and Modified Date.

In [12]:
# Discard the two columns that the rest of the notebook does not use.
Austin_Trips_Df = Austin_Trips_Df.drop('Modified Date', 'Year')

We check the new schema resulting from the previous operations.

In [13]:
# Show the schema after the timestamp casts, unit conversions and column drops.
Austin_Trips_Df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Device ID: string (nullable = true)
 |-- Vehicle Type: string (nullable = true)
 |-- Trip_Duration(H): double (nullable = true)
 |-- Trip_Distance(km): double (nullable = true)
 |-- Start Time: timestamp (nullable = true)
 |-- End Time: timestamp (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Day of Week: integer (nullable = true)
 |-- Council District (Start): integer (nullable = true)
 |-- Council District (End): integer (nullable = true)
 |-- Census Tract Start: string (nullable = true)
 |-- Census Tract End: string (nullable = true)



In [14]:
# Summary statistics (count, mean, stddev, min, max) for duration and distance.
trip_metrics = Austin_Trips_Df.select("Trip_Duration(H)", "Trip_Distance(km)")
trip_metrics.describe().show(truncate=True)

+-------+-------------------+------------------+
|summary|   Trip_Duration(H)| Trip_Distance(km)|
+-------+-------------------+------------------+
|  count|            2115445|           2115445|
|   mean|0.16955681625694208|45.609249623601634|
| stddev| 0.2429257070840978|1936.5365902717479|
|    min|             -0.315|               0.0|
|    max|            23.5475|       1113117.831|
+-------+-------------------+------------------+



In [15]:
# Keep only trips whose duration is strictly positive (the summary above shows a
# negative minimum duration).
has_positive_duration = Austin_Trips_Df['Trip_Duration(H)'] > 0
Austin_Trips_Df = Austin_Trips_Df.where(has_positive_duration)

In [16]:
# Row count after keeping only trips with a positive duration.
Austin_Trips_Df.count()

2115406

In [17]:
# Trip counts per vehicle type (rows) and month (columns).
Austin_Trips_Df.crosstab('Vehicle Type', 'Month').show()

+------------------+------+---+------+------+------+
|Vehicle Type_Month|    10| 11|     7|     8|     9|
+------------------+------+---+------+------+------+
|           scooter|503906| 88|471763|471527|557695|
|           bicycle| 26521|  0| 31633| 25036| 27237|
+------------------+------+---+------+------+------+



In [18]:
# Trip counts per day of week (rows) and month (columns).
Austin_Trips_Df.crosstab('Day of Week', 'Month').show()

+-----------------+------+---+-----+------+------+
|Day of Week_Month|    10| 11|    7|     8|     9|
+-----------------+------+---+-----+------+------+
|                0| 78276|  0|67526| 60053| 92386|
|                5| 82200| 88|83536| 94825| 91179|
|                1| 58880|  0|57008| 45884| 76615|
|                6|101599|  0|96908|118696|126083|
|                2| 67055|  0|62289| 49164| 61974|
|                3| 67917|  0|65831| 55822| 65591|
|                4| 74500|  0|70298| 72119| 71104|
+-----------------+------+---+-----+------+------+



In [19]:
# Speed = distance / duration; with kilometers and hours the result is in km/h.
distance_km = Austin_Trips_Df['Trip_Distance(km)']
duration_h = Austin_Trips_Df['Trip_Duration(H)']
Austin_Trips_Df = Austin_Trips_Df.withColumn('Speed', distance_km / duration_h)

In [20]:
# Summary statistics for the derived Speed column.
Austin_Trips_Df.describe(['Speed']).show()

+-------+------------------+
|summary|             Speed|
+-------+------------------+
|  count|           2115406|
|   mean|431.76059691297746|
| stddev|17208.996380778306|
|    min|               0.0|
|    max| 5242221.507692307|
+-------+------------------+



In [24]:
MINUTES_PER_HOUR = 60
# NOTE(review): 0.15 and 1 look like a per-minute rate and a flat per-trip fee in
# dollars — confirm against the pricing source.
PER_MINUTE_RATE = 0.15
FLAT_FEE = 1

# Revenue = duration in minutes * rate + flat fee.
duration_minutes = Austin_Trips_Df['Trip_Duration(H)'] * MINUTES_PER_HOUR
Austin_Trips_Df = Austin_Trips_Df.withColumn(
    'Revenue($)',
    duration_minutes * PER_MINUTE_RATE + FLAT_FEE,
)

In [25]:
# Summary statistics for the derived Revenue($) column.
Austin_Trips_Df.describe(['Revenue($)']).show()

+-------+------------------+
|summary|        Revenue($)|
+-------+------------------+
|  count|           2115406|
|   mean| 2.526046323968071|
| stddev|2.1863338249025457|
|    min|            1.0025|
|    max|212.92749999999998|
+-------+------------------+



In [26]:
# Return the first two rows as Row objects to inspect all columns, including the
# derived Speed and Revenue($).
Austin_Trips_Df.head(2)

[Row(ID='000f0d96-dd73-452d-bb01-e817adaeab0d', Device ID='4a22632e-89db-42e9-88d4-f8bfff68b677', Vehicle Type='scooter', Trip_Duration(H)=0.5366666666666666, Trip_Distance(km)=4.675, Start Time=datetime.datetime(2019, 7, 7, 20, 15), End Time=datetime.datetime(2019, 7, 7, 20, 45), Month=7, Hour=20, Day of Week=0, Council District (Start)=5, Council District (End)=5, Census Tract Start='48453001303', Census Tract End='48453001303', Speed=8.711180124223603, Revenue($)=5.829999999999999),
 Row(ID='008286fd-ad90-4611-b44b-bd169fa37964', Device ID='23716c9d-2821-4a7f-8bd7-3593212f69cd', Vehicle Type='scooter', Trip_Duration(H)=0.10361111111111111, Trip_Distance(km)=2.139, Start Time=datetime.datetime(2019, 7, 10, 9, 45), End Time=datetime.datetime(2019, 7, 10, 9, 45), Month=7, Hour=9, Day of Week=3, Council District (Start)=9, Council District (End)=9, Census Tract Start='48453000603', Census Tract End='48453000601', Speed=20.64450402144772, Revenue($)=1.9325)]

In [None]:
# Preview trip duration next to the revenue derived from it.
# Austin_Trips_Df is used because it is the frame that carries 'Revenue($)'; no
# PriceDf is defined in this notebook. A plain select replaces crosstab, which
# builds one output column per distinct value and is impractical for continuous columns.
Austin_Trips_Df.select('Trip_Duration(H)', 'Revenue($)').show(3)