In [1]:
import os

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, lit, sum as spark_sum, when

In [2]:
# Path to the GCP service-account JSON key handed to the GCS connector.
# Read it from the GOOGLE_APPLICATION_CREDENTIALS environment variable when
# that is set, so the notebook is not tied to one machine; otherwise fall back
# to the original local path.
credentials_location = (
    os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    or '/Users/anzelam/ac/google/ac.json'
)

# Spark configuration: local master using all cores, the GCS Hadoop connector
# jar on the classpath, and service-account authentication with the key above.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('GCSFilesRead')
    .set("spark.jars", "../lib/gcs-connector-hadoop3-2.2.5.jar")
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

In [3]:
# Reuse the running SparkContext if one already exists in this kernel, so that
# re-executing this cell does not fail because a context is already active.
# On a fresh kernel this creates the context from `conf` exactly as before.
sc = SparkContext.getOrCreate(conf=conf)

hadoop_conf = sc._jsc.hadoopConfiguration()

# Hadoop-level settings for the `gs` filesystem: the connector classes to use
# and the same service-account key file that was given to SparkConf.
gcs_hadoop_settings = {
    "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
    "fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "fs.gs.auth.service.account.json.keyfile": credentials_location,
    "fs.gs.auth.service.account.enable": "true",
}
for setting_name, setting_value in gcs_hadoop_settings.items():
    hadoop_conf.set(setting_name, setting_value)

24/04/13 21:47:53 WARN Utils: Your hostname, Anzela--MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.7.108 instead (on interface en0)
24/04/13 21:47:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/13 21:47:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Build (or fetch) the SparkSession on top of the context configured above,
# passing along that context's configuration.
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [5]:
# Load the raw race results from GCS; the first CSV line holds the column names.
running_df = (
    spark.read
    .option("header", True)
    .csv('gs://de-running-project-bucket/raw/TWO_CENTURIES_OF_UM_RACES.csv')
)

In [7]:
# Peek at the first record to check the header names and raw value formats.
running_df.first()

                                                                                

Row(Year of event='2018', Event dates='06.01.2018', Event name='Selva Costera (CHI)', Event distance/length='50km', Event number of finishers='22', Athlete performance='4:51:39 h', Athlete club='Tnfrc', Athlete country='CHI', Athlete year of birth='1978.0', Athlete gender='M', Athlete age category='M35', Athlete average speed='10.286', Athlete ID='0')

                                                                                

In [None]:
# Raw CSV header -> snake_case column name.
column_mapping = dict([
    ("Year of event", "year_of_event"),
    ("Event dates", "event_dates"),
    ("Event name", "event_name"),
    ("Event distance/length", "event_distance_length"),
    ("Event number of finishers", "event_num_finishers"),
    ("Athlete performance", "athlete_performance"),
    ("Athlete club", "athlete_club"),
    ("Athlete country", "athlete_country"),
    ("Athlete year of birth", "athlete_year_of_birth"),
    ("Athlete gender", "athlete_gender"),
    ("Athlete age category", "athlete_age_category"),
    ("Athlete average speed", "athlete_average_speed"),
    ("Athlete ID", "athlete_id"),
])

# Apply the renames one at a time, in mapping order.
for raw_header in column_mapping:
    running_df = running_df.withColumnRenamed(raw_header, column_mapping[raw_header])

In [None]:
# Classify each event by the unit found in its distance/length text:
# digits followed by one of k/K/m/M/i -> "Distance", digits followed by d/h ->
# "Time", anything else -> "Unknown". The first matching rule wins.
distance_text = col("event_distance_length")
event_type_expr = (
    when(distance_text.rlike(r"\d+[kKmMi]"), "Distance")
    .when(distance_text.rlike(r"\d+[dh]"), "Time")
    .otherwise("Unknown")
)
running_df = running_df.withColumn("event_type", event_type_expr)

In [None]:
# Total row count; also reused by the missing-values report further down.
total_rows = running_df.count()

# Rows per event type, plus each type's share of all rows as a percentage.
rows_per_event_type = running_df.groupBy("event_type").agg(count("*").alias("count"))
share_of_total = col("count") / lit(total_rows) * 100
filtered_rows_distance = rows_per_event_type.withColumn("percentage", share_of_total)

# Show every group (not just the default first rows) without truncating values.
group_count = filtered_rows_distance.count()
filtered_rows_distance.show(group_count, truncate=False)

In [None]:
# Per-column share of NULL values, as a whole-number percentage of all rows.
# Uses `total_rows` from the event-type distribution cell.

# Boolean frame: True wherever the original value is NULL.
missing_values_per_column = running_df.select(
    [col(c).isNull().alias(c) for c in running_df.columns]
)

# Count the True flags per column. This must be Spark's aggregate sum
# (imported as `spark_sum`): Python's built-in sum() tries to iterate the
# Column and raises "TypeError: Column is not iterable".
missing_values_count = missing_values_per_column.agg(
    *[spark_sum(col(c).cast("int")).alias(c) for c in missing_values_per_column.columns]
)

missing_values_percentage_raw = missing_values_count.select(
    [(col(c) / total_rows * 100).alias(c) for c in running_df.columns]
)

# The int cast drops the fractional part, so a column with less than 1%
# missing values is reported as 0.
missing_values_percentage = missing_values_percentage_raw.select(
    [col(c).cast("int").alias(c) for c in running_df.columns]
)
missing_values_percentage.show()

In [None]:
max_reasonable_speed = 25.0  # km/h; larger values are treated as mis-scaled

# The CSV was read without schema inference, so this column holds strings.
# Cast it to double once so the comparison and both branches share one numeric
# type instead of mixing the string column with a double expression.
speed_value = col("athlete_average_speed").cast("double")

# Values above the threshold are divided by 1000. NOTE(review): this implies
# they are recorded 1000x too large (presumably m/h rather than km/h - confirm
# against the data source). The previous comment said "m/s to km/h", but that
# conversion would be a multiplication by 3.6, not a division by 1000.
# NULL or unparseable speeds stay NULL.
running_df = running_df.withColumn(
    "athlete_average_speed",
    when(speed_value <= max_reasonable_speed, speed_value)
    .otherwise(speed_value / 1000.0),
)

In [8]:
# Stop the Spark session created earlier in the notebook once the analysis is done.
spark.stop()