In [1]:
import pyspark

In [2]:
spark = pyspark.sql.SparkSession.builder.appName("Strava Analysis").getOrCreate()

In [3]:
activites_df = spark.read.csv("data/activities.csv", header=True, inferSchema=True)

In [4]:
activites_df.show()

+-----------+--------------------+--------------------+-----------------+--------------------+-------------+---------+---------------+----------------+--------+---------------------+--------------------+--------------------+--------------+-----------+--------------+-----------+----------+---------+-------------+--------------+--------------+-------------+--------------+---------+-------------+----------------------+----------------------+-----------+---------------+----------------+------------------+---------+-------------+--------+---------------+-------------------+-----------------+----------+--------------+-----------+-------------+----------+------------------+----+----------+----------------------+-----------+-------------------------+-------------------------+---------+-------------------+-----------+-----------------------+------------------------+-----------------+-------------------+--------------------+--------+--------+----------------+----------+---------+------------+---

In [5]:
from pyspark.sql import functions as F

In [6]:
activites_df = activites_df.withColumn('Activity Date', F.to_timestamp('Activity Date', 'MMM d, yyyy, h:mm:ss a'))

In [7]:
from pyspark.sql.functions import year, month, day, weekday

In [8]:
activites_df = activites_df.withColumn('Year', year('Activity Date')).withColumn('Month', month('Activity Date')).withColumn('Day', day('Activity Date')).withColumn('Weekday', weekday('Activity Date'))

In [9]:
activites_df.columns

['Activity ID',
 'Activity Date',
 'Activity Name',
 'Activity Type',
 'Activity Description',
 'Elapsed Time5',
 'Distance6',
 'Max Heart Rate7',
 'Relative Effort8',
 'Commute9',
 'Activity Private Note',
 'Activity Gear',
 'Filename',
 'Athlete Weight',
 'Bike Weight',
 'Elapsed Time15',
 'Moving Time',
 'Distance17',
 'Max Speed',
 'Average Speed',
 'Elevation Gain',
 'Elevation Loss',
 'Elevation Low',
 'Elevation High',
 'Max Grade',
 'Average Grade',
 'Average Positive Grade',
 'Average Negative Grade',
 'Max Cadence',
 'Average Cadence',
 'Max Heart Rate30',
 'Average Heart Rate',
 'Max Watts',
 'Average Watts',
 'Calories',
 'Max Temperature',
 'Average Temperature',
 'Relative Effort37',
 'Total Work',
 'Number of Runs',
 'Uphill Time',
 'Downhill Time',
 'Other Time',
 'Perceived Exertion',
 'Type',
 'Start Time',
 'Weighted Average Power',
 'Power Count',
 'Prefer Perceived Exertion',
 'Perceived Relative Effort',
 'Commute50',
 'Total Weight Lifted',
 'From Upload',
 'Grad

In [10]:
activites_df = activites_df.drop('Elapsed Time15', 'Distance17', 'Max Heart Rate30', 'Relative Effort37', 'Commute50') \
    .withColumnRenamed('Elapsed Time5', 'Elapsed Time') \
    .withColumnRenamed('Distance6', 'Distance') \
    .withColumnRenamed('Max Heart Rate7', 'Max Heart Rate') \
    .withColumnRenamed('Relative Effort8', 'Relative Effort') \
    .withColumnRenamed('Commute9', 'Commute')

In [21]:
activites_df = activites_df.withColumn('Max Speed', activites_df['Max Speed'] * 3.6) \
    .withColumn('Average Speed', activites_df['Average Speed'] * 3.6)

In [22]:
activites_df.show()

+-----------+-------------------+--------------------+-----------------+--------------------+------------+--------+--------------+---------------+-------+---------------------+--------------------+--------------------+--------------+-----------+-----------+------------------+------------------+--------------+--------------+-------------+--------------+---------+-------------+----------------------+----------------------+-----------+---------------+------------------+---------+-------------+--------+---------------+-------------------+----------+--------------+-----------+-------------+----------+------------------+----+----------+----------------------+-----------+-------------------------+-------------------------+-------------------+-----------+-----------------------+------------------------+-----------------+-------------------+--------------------+--------+--------+----------------+----------+---------+------------+-----------------------+------------+-----------+----------+------

In [23]:
run_df = activites_df.filter(activites_df['Activity Type'] == 'Run') \
    .select('Activity Date', 'Year', 'Month', 'day', 'weekday', 'Elapsed Time', 'Moving Time', 'Distance', 'Max Heart Rate', 'Average Heart Rate', 'Activity Gear', 'Max Speed', 'Average Speed', 'Max Cadence', 'Average Cadence', 'Calories')

In [24]:
run_df.show()

+-------------------+----+-----+---+-------+------------+-----------+--------+--------------+------------------+--------------------+------------------+------------------+-----------+---------------+--------+
|      Activity Date|Year|Month|day|weekday|Elapsed Time|Moving Time|Distance|Max Heart Rate|Average Heart Rate|       Activity Gear|         Max Speed|     Average Speed|Max Cadence|Average Cadence|Calories|
+-------------------+----+-----+---+-------+------------+-----------+--------+--------------+------------------+--------------------+------------------+------------------+-----------+---------------+--------+
|2024-02-26 16:43:31|2024|    2| 26|      0|        2721|     2721.0|    7.06|          NULL|              NULL|HOKA Clifton 9 Po...|           15.0624|            9.3492|       88.0|           81.0|   550.0|
|2024-03-30 16:07:05|2024|    3| 30|      5|        3735|     3735.0|   10.08|          NULL|              NULL|HOKA Clifton 9 Po...|           14.1984|            

In [25]:
run_df.toPandas().to_csv('data/raw/raw_run_data.csv', index=False)

In [26]:
bike_df = activites_df.filter(activites_df['Activity Type'] == 'Ride') \
    .select('Activity Date', 'Year', 'Month', 'day', 'weekday', 'Elapsed Time', 'Moving Time', 'Distance', 'Max Heart Rate', 'Average Heart Rate', 'Activity Gear', 'Max Speed', 'Average Speed', 'Max Cadence', 'Average Cadence', 'Calories')

In [27]:
bike_df.show()

+-------------------+----+-----+---+-------+------------+-----------+--------+--------------+------------------+--------------+------------------+------------------+-----------+---------------+--------+
|      Activity Date|Year|Month|day|weekday|Elapsed Time|Moving Time|Distance|Max Heart Rate|Average Heart Rate| Activity Gear|         Max Speed|     Average Speed|Max Cadence|Average Cadence|Calories|
+-------------------+----+-----+---+-------+------------+-----------+--------+--------------+------------------+--------------+------------------+------------------+-----------+---------------+--------+
|2024-03-25 08:07:25|2024|    3| 25|      0|        8167|     7243.0|   45.19|          NULL|              NULL|Czarna strzała|33.076800000000006|22.464000000000002|       NULL|           NULL|  1113.0|
|2024-03-29 12:16:05|2024|    3| 29|      4|        5749|     4195.0|   18.19|          NULL|              NULL|Czarna strzała|           27.8352|15.613199999999999|       NULL|           

In [28]:
bike_df.toPandas().to_csv('data/raw/raw_bike_data.csv', index=False)

In [29]:
swim_df = activites_df.filter(activites_df['Activity Type'] == 'Swim') \
    .select('Activity Date', 'Year', 'Month', 'day', 'weekday', 'Elapsed Time', 'Moving Time', 'Distance', 'Max Heart Rate', 'Average Heart Rate', 'Max Speed', 'Average Speed', 'Max Cadence', 'Average Cadence', 'Calories')

In [30]:
swim_df.show()

+-------------------+----+-----+---+-------+------------+-----------+--------+--------------+------------------+------------------+------------------+-----------+---------------+--------+
|      Activity Date|Year|Month|day|weekday|Elapsed Time|Moving Time|Distance|Max Heart Rate|Average Heart Rate|         Max Speed|     Average Speed|Max Cadence|Average Cadence|Calories|
+-------------------+----+-----+---+-------+------------+-----------+--------+--------------+------------------+------------------+------------------+-----------+---------------+--------+
|2024-03-25 06:01:54|2024|    3| 25|      0|        2637|     2631.0|   1,250|          NULL|              NULL|            5.2992|              1.71|       NULL|           21.0|   315.0|
|2024-03-07 17:03:21|2024|    3|  7|      3|        2688|     2629.0|     700|          NULL|              NULL| 5.630400000000001|0.9576000000000001|       NULL|           17.0|   282.0|
|2024-04-04 16:01:19|2024|    4|  4|      3|        2670|   

In [31]:
swim_df.toPandas().to_csv('data/raw/raw_swim_data.csv', index=False)