In [0]:
# Read the storage-account access key from a Databricks secret scope instead of
# embedding it in the notebook. A key stored in source is readable by anyone who
# can open the file, so the previously committed key should be rotated.
# NOTE(review): the scope and key names below are placeholders — create them or
# point at the secret scope your workspace already uses before running.
storage_account_key = dbutils.secrets.get(scope="tokyo-olympic", key="tokyoolympicdata99-account-key")
spark.conf.set("fs.azure.account.key.tokyoolympicdata99.dfs.core.windows.net", storage_account_key)



In [0]:
# Notebook 1: Data Quality Checks

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count, lit, expr

# Obtain the Spark session used by the rest of the notebook
spark = SparkSession.builder.appName("Data Quality Checks").getOrCreate()

# All raw CSV inputs sit under the same folder of the storage container
RAW_DATA_ROOT = "abfss://tokyo-olympic-data@tokyoolympicdata99.dfs.core.windows.net/raw-data"
athletes_path = f"{RAW_DATA_ROOT}/Athletes.csv"
coaches_path = f"{RAW_DATA_ROOT}/Coaches.csv"
entriesgender_path = f"{RAW_DATA_ROOT}/EntriesGender.csv"
medals_path = f"{RAW_DATA_ROOT}/Medals.csv"
teams_path = f"{RAW_DATA_ROOT}/Teams.csv"

# Reader options shared by every dataset: the first row is a header and the
# column types are inferred from the data
CSV_READ_OPTIONS = {"header": True, "inferSchema": True}

# Load datasets
athletes_df = spark.read.csv(athletes_path, **CSV_READ_OPTIONS)
coaches_df = spark.read.csv(coaches_path, **CSV_READ_OPTIONS)
entriesgender_df = spark.read.csv(entriesgender_path, **CSV_READ_OPTIONS)
medals_df = spark.read.csv(medals_path, **CSV_READ_OPTIONS)
teams_df = spark.read.csv(teams_path, **CSV_READ_OPTIONS)


In [0]:
# 1. Null Checks
def perform_null_checks(df, df_name):
    """Count the missing values in every column, display the counts and return them.

    A value is counted as missing when it is NULL or, for float/double columns,
    NaN.

    Args:
        df: DataFrame to inspect.
        df_name: Label of the dataset. Accepted so call sites are
            self-describing; the function itself does not use it.

    Returns:
        A single-row DataFrame with one column per column of ``df``, each
        holding that column's missing-value count.
    """
    # isnan() is only defined for floating-point input. Applying it to every
    # column fails analysis as soon as the schema contains e.g. a date,
    # timestamp or boolean column, so restrict it to float/double columns.
    nan_capable = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

    def missing_count(c):
        is_missing = col(c).isNull()
        if c in nan_capable:
            is_missing = is_missing | isnan(c)
        # when() without otherwise() yields NULL for non-matching rows, and
        # count() skips NULLs, so this counts only the matching rows.
        return count(when(is_missing, c)).alias(c)

    null_counts = df.select([missing_count(c) for c in df.columns])
    null_counts.show()
    return null_counts

# Run the null check on the athletes and coaches datasets and keep each result
null_athletes = perform_null_checks(df=athletes_df, df_name="athletes")
null_coaches = perform_null_checks(df=coaches_df, df_name="coaches")

+----+---+----------+
|Name|NOC|Discipline|
+----+---+----------+
|   0|  0|         0|
+----+---+----------+

+----+---+----------+-----+
|Name|NOC|Discipline|Event|
+----+---+----------+-----+
|   0|  0|         0|  145|
+----+---+----------+-----+



In [0]:
# 2. Duplicate Checks
def perform_duplicate_checks(df, df_name, subset_cols):
    """Display and return the key combinations that occur more than once.

    Args:
        df: DataFrame to inspect.
        df_name: Label of the dataset; not used by the function.
        subset_cols: Column names that together identify a record.

    Returns:
        A DataFrame with the ``subset_cols`` columns plus a ``count`` column,
        limited to the combinations whose count is greater than 1.
    """
    occurrences = df.groupBy(subset_cols).count()
    repeated_keys = occurrences.where(col("count") > 1)
    repeated_keys.show()
    return repeated_keys

# An athlete record is identified by the (Name, NOC, Discipline) triple
duplicate_athletes = perform_duplicate_checks(
    df=athletes_df,
    df_name="athletes",
    subset_cols=["Name", "NOC", "Discipline"],
)

+-----------+-------+----------+-----+
|       Name|    NOC|Discipline|count|
+-----------+-------+----------+-----+
|ALI Mohamed|Bahrain|  Handball|    2|
+-----------+-------+----------+-----+



In [0]:
# 3. Range Checks
def perform_range_checks(df, column, min_val, max_val, df_name):
    """Display and return the rows whose value falls outside ``[min_val, max_val]``.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.
        min_val: Smallest accepted value (inclusive).
        max_val: Largest accepted value (inclusive).
        df_name: Label of the dataset; not used by the function.

    Returns:
        A DataFrame containing the rows of ``df`` where ``column`` is below
        ``min_val`` or above ``max_val``.
    """
    too_small = col(column) < min_val
    too_large = col(column) > max_val
    out_of_range = df.where(too_small | too_large)
    out_of_range.show()
    return out_of_range

# Flag disciplines whose Total entry count lies outside 0..5000
range_entriesgender = perform_range_checks(
    df=entriesgender_df,
    column="Total",
    min_val=0,
    max_val=5000,
    df_name="entriesgender",
)


+----------+------+----+-----+
|Discipline|Female|Male|Total|
+----------+------+----+-----+
+----------+------+----+-----+



In [0]:
def validate_medal_counts(df):
    """Display and return the rows that have a negative medal count.

    Args:
        df: DataFrame with ``Gold``, ``Silver`` and ``Bronze`` columns.

    Returns:
        A DataFrame containing the rows of ``df`` where at least one of the
        three medal columns is below zero.
    """
    # OR together one "is negative" test per medal column
    any_negative = None
    for medal in ("Gold", "Silver", "Bronze"):
        is_negative = col(medal) < 0
        any_negative = is_negative if any_negative is None else any_negative | is_negative

    negative_rows = df.where(any_negative)
    negative_rows.show()
    return negative_rows


# Validate medal counts in the medals dataset
invalid_medals = validate_medal_counts(medals_df)

# Count once and reuse the number: each DataFrame.count() call runs its own
# Spark job, so counting in both the condition and the message evaluated the
# filter twice.
invalid_medal_rows = invalid_medals.count()
if invalid_medal_rows > 0:
    print(f"Found {invalid_medal_rows} rows with invalid medal counts.")
else:
    print("No invalid medal counts found.")


+----+--------+----+------+------+-----+-------------+
|Rank|Team/NOC|Gold|Silver|Bronze|Total|Rank by Total|
+----+--------+----+------+------+-----+-------------+
+----+--------+----+------+------+-----+-------------+

No invalid medal counts found.


In [0]:
# 5. Negative Values Check
def check_negative_values(df, column, df_name):
    """Display and return the rows whose ``column`` value is below zero.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.
        df_name: Label of the dataset; not used by the function.

    Returns:
        A DataFrame containing the rows of ``df`` where ``column < 0``.
    """
    is_negative = col(column) < 0
    negative_rows = df.where(is_negative)
    negative_rows.show()
    return negative_rows

# Look for negative gold-medal counts in the medals dataset
negative_gold = check_negative_values(df=medals_df, column="Gold", df_name="medals")

+----+--------+----+------+------+-----+-------------+
|Rank|Team/NOC|Gold|Silver|Bronze|Total|Rank by Total|
+----+--------+----+------+------+-----+-------------+
+----+--------+----+------+------+-----+-------------+



In [0]:
# 6. Data Type Validation
def perform_data_type_validation(df, column, expected_type, df_name):
    """Display and return the rows whose value cannot be cast to ``expected_type``.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to validate.
        expected_type: Target type passed to ``Column.cast`` (e.g. ``"int"``).
        df_name: Label of the dataset; not used by the function.

    Returns:
        A DataFrame containing the rows of ``df`` where ``column`` holds a
        value, but casting that value to ``expected_type`` produces NULL.
    """
    value = col(column)
    # A cast result of NULL is ambiguous: either the cast failed or the value
    # was NULL to begin with. Only the former is a type problem, so require the
    # original value to be present; missing values belong to the null checks.
    # NOTE(review): this relies on a failed cast yielding NULL; with Spark's
    # ANSI mode enabled an invalid cast raises instead — confirm the cluster setting.
    cast_failed = value.isNotNull() & value.cast(expected_type).isNull()
    invalid_data_type = df.filter(cast_failed)
    invalid_data_type.show()
    return invalid_data_type

# Total is expected to hold integer entry counts
invalid_entriesgender_total = perform_data_type_validation(
    df=entriesgender_df,
    column="Total",
    expected_type="int",
    df_name="entriesgender",
)


+----------+------+----+-----+
|Discipline|Female|Male|Total|
+----------+------+----+-----+
+----------+------+----+-----+



In [0]:
# 7. Referential Integrity
def check_referential_integrity(parent_df, child_df, parent_col, child_col, df_name):
    """Display and return the child rows that have no matching parent row.

    Args:
        parent_df: DataFrame holding the referenced keys.
        child_df: DataFrame holding the referencing keys.
        parent_col: Key column name in ``parent_df``.
        child_col: Key column name in ``child_df``.
        df_name: Label of the check; not used by the function.

    Returns:
        A DataFrame with the columns of ``child_df``, limited to the rows whose
        ``child_col`` value equals no ``parent_col`` value in ``parent_df``.
    """
    keys_match = parent_df[parent_col] == child_df[child_col]
    # A left anti join keeps only the left-side (child) rows without a match
    orphan_rows = child_df.join(parent_df, on=keys_match, how="left_anti")
    orphan_rows.show()
    return orphan_rows

# Athletes whose NOC does not appear in the teams dataset
missing_noc_references = check_referential_integrity(
    parent_df=teams_df,
    child_df=athletes_df,
    parent_col="NOC",
    child_col="NOC",
    df_name="athletes_teams",
)


+--------------------+--------------------+-------------+
|                Name|                 NOC|   Discipline|
+--------------------+--------------------+-------------+
|       ABASS Abobakr|               Sudan|     Swimming|
| ABDALRASOOL Mohamed|               Sudan|         Judo|
|ABDUL RAZZAQ Fath...|            Maldives|    Badminton|
|       ABEBE Mekides|            Ethiopia|    Athletics|
|       ABELA Matthew|               Malta|    Badminton|
|  ABEYSINGHE Matthew|           Sri Lanka|     Swimming|
|     ABIDINE Abidine|          Mauritania|    Athletics|
|         ABOUD Hadel|               Libya|    Athletics|
| ABOUKE Nancy Genzel|               Nauru|Weightlifting|
|       ABRAMS Aliyah|              Guyana|    Athletics|
|      ABRAMS Jasmine|              Guyana|    Athletics|
|      ABRAMYAN Benik|             Georgia|    Athletics|
|      ABU RABEE Asma|              Jordan|     Shooting|
|    ABU RMILAH Wesam|           Palestine|         Judo|
|  ACHILLEOS G

In [0]:
from pyspark.sql.functions import lower

# 8. Consistency Check
def perform_consistency_checks(df, column, df_name):
    """Display and return the rows whose value differs from its lowercase form.

    Args:
        df: DataFrame to inspect.
        column: Name of the string column to check.
        df_name: Label of the dataset; not used by the function.

    Returns:
        A DataFrame containing the rows of ``df`` where ``column`` is not equal
        to ``lower(column)``.
    """
    # NOTE(review): any value containing an uppercase letter is reported, so
    # mixed-case data such as names is flagged almost entirely — confirm that
    # all-lowercase is the intended convention.
    original_value = col(column)
    lowercased_value = lower(original_value)
    not_lowercase = df.where(original_value != lowercased_value)
    not_lowercase.show()
    return not_lowercase

# Example: report athlete names that are not written entirely in lowercase
inconsistent_athlete_names = perform_consistency_checks(
    df=athletes_df,
    column="Name",
    df_name="athletes",
)


+--------------------+--------------------+-------------------+
|                Name|                 NOC|         Discipline|
+--------------------+--------------------+-------------------+
|     AALERUD Katrine|              Norway|       Cycling Road|
|         ABAD Nestor|               Spain|Artistic Gymnastics|
|   ABAGNALE Giovanni|               Italy|             Rowing|
|      ABALDE Alberto|               Spain|         Basketball|
|       ABALDE Tamara|               Spain|         Basketball|
|           ABALO Luc|              France|           Handball|
|        ABAROA Cesar|               Chile|             Rowing|
|       ABASS Abobakr|               Sudan|           Swimming|
|    ABBASALI Hamideh|Islamic Republic ...|             Karate|
|       ABBASOV Islam|          Azerbaijan|          Wrestling|
|        ABBINGH Lois|         Netherlands|           Handball|
|         ABBOT Emily|           Australia|Rhythmic Gymnastics|
|       ABBOTT Monica|United States of .

In [0]:
# 9. Outlier Detection
def detect_outliers(df, column, df_name):
    """Display and return the rows whose value lies outside the 1.5 * IQR fences.

    The fences are ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr``, where ``q1`` and
    ``q3`` are the 25th and 75th percentiles of ``column`` and
    ``iqr = q3 - q1``.

    Args:
        df: DataFrame to inspect.
        column: Name of the numeric column to analyse.
        df_name: Label of the dataset; not used by the function.

    Returns:
        A DataFrame containing the rows of ``df`` outside the fences. It is
        empty when the percentiles cannot be computed (no non-null values).
    """
    # Project the column under a fixed, SQL-safe alias before using it in the
    # SQL expressions. Interpolating the raw name into "percentile(<name>, ...)"
    # fails to parse for names containing spaces or symbols such as
    # "Rank by Total".
    values = df.select(df[column].alias("_outlier_value"))
    stats = values.selectExpr(
        "percentile(_outlier_value, 0.25) as q1",
        "percentile(_outlier_value, 0.75) as q3",
    ).collect()
    q1, q3 = stats[0]["q1"], stats[0]["q3"]

    if q1 is None or q3 is None:
        # The aggregate is NULL when there is nothing to aggregate; without
        # this guard the arithmetic below raises TypeError on None.
        outliers = df.limit(0)
    else:
        iqr = q3 - q1
        lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        outliers = df.filter((df[column] < lower_bound) | (df[column] > upper_bound))

    outliers.show()
    return outliers

# Disciplines whose Total entry count is an IQR outlier
outliers_total = detect_outliers(df=entriesgender_df, column="Total", df_name="entriesgender")

+----------+------+----+-----+
|Discipline|Female|Male|Total|
+----------+------+----+-----+
| Athletics|   969|1072| 2041|
|  Football|   264| 344|  608|
|  Swimming|   361| 418|  779|
+----------+------+----+-----+



In [0]:
# 10. Null Percentage Check
def null_percentage_check(df, column, threshold, df_name):
    """Report whether the share of NULLs in ``column`` exceeds ``threshold``.

    Prints a one-line verdict naming ``column`` and ``df_name``.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.
        threshold: Maximum accepted NULL percentage (0-100).
        df_name: Label of the dataset, used in the printed message.

    Returns:
        The percentage of rows in which ``column`` is NULL, as a float.
        An empty DataFrame yields ``0.0``.
    """
    total_count = df.count()
    null_count = df.filter(df[column].isNull()).count()
    # An empty dataset has no NULLs; report 0% instead of dividing by zero.
    null_percentage = (null_count / total_count) * 100 if total_count else 0.0
    if null_percentage > threshold:
        print(f"Column {column} in {df_name} exceeds {threshold}% null values.")
    else:
        print(f"Column {column} in {df_name} is within acceptable null percentage.")
    return null_percentage

# Accept at most 5% missing athlete names
null_percentage_total = null_percentage_check(
    df=athletes_df,
    column="Name",
    threshold=5,
    df_name="athletes",
)

Column Name in athletes is within acceptable null percentage.


In [0]:
# Consolidating quality metrics into a DataFrame
from pyspark.sql import SparkSession

# Ensure Spark session is active
spark = SparkSession.builder.appName("Data Quality").getOrCreate()

# Compute each metric first (one Spark count per metric), then assemble the
# (table, issue, count) rows from the named results.
athletes_missing_name = athletes_df.where(athletes_df["Name"].isNull()).count()
athletes_duplicate_keys = (
    athletes_df.groupBy(["Name", "NOC", "Discipline"])
    .count()
    .where(col("count") > 1)
    .count()
)
medals_missing_team = medals_df.where(medals_df["Team/NOC"].isNull()).count()
medals_negative_gold = medals_df.where(medals_df["Gold"] < 0).count()

quality_metrics_data = [
    ("Athletes", "Missing Values", athletes_missing_name),
    ("Athletes", "Duplicates", athletes_duplicate_keys),
    ("Medals", "Missing Values", medals_missing_team),
    ("Medals", "Negative Gold", medals_negative_gold),
    # Add more metrics here as needed
]

# Create DataFrame for metrics
metric_columns = ["Table", "Issue", "Count"]
quality_metrics_df = spark.createDataFrame(quality_metrics_data, metric_columns)

# Display for validation
quality_metrics_df.show()


+--------+--------------+-----+
|   Table|         Issue|Count|
+--------+--------------+-----+
|Athletes|Missing Values|    0|
|Athletes|    Duplicates|    1|
|  Medals|Missing Values|    0|
|  Medals| Negative Gold|    0|
+--------+--------------+-----+



In [0]:
# Destination of the consolidated quality metrics
consolidated_metrics_path = (
    "abfss://tokyo-olympic-data@tokyoolympicdata99.dfs.core.windows.net"
    "/quality-metrics/consolidated_metrics.csv"
)

# Write the metrics as CSV with a header row, replacing any previous output
(
    quality_metrics_df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save(consolidated_metrics_path)
)



In [0]:
# Trigger Notebook 2, handing it the location of the metrics written above
notebook_arguments = {
    "metrics_path": consolidated_metrics_path,
    "trigger_flag": "true",
}
# Second argument: 300 is passed as the run's timeout value
result = dbutils.notebook.run("Store_Quality_Metrics", 300, notebook_arguments)
print("Notebook 2 Result: ", result)


Notebook 2 Result:  None
