In [5]:
!pip install pyspark==4.0.0



In [6]:
import os

os.makedirs('data', exist_ok=True)

In [7]:
 #!/bin/bash
!curl -L -o ./data/gdp-countries.zip \
    "https://www.kaggle.com/api/v1/datasets/download/nitishabharathi/gdp-per-capita-all-countries"

!curl -L -o ./data/marine-microplastic.zip \
    "https://www.kaggle.com/api/v1/datasets/download/william2020/marine-microplastics"

!curl -L -o ./data/food-microplastic.zip \
    "https://www.kaggle.com/api/v1/datasets/download/jayeshrmohanani/dataset-for-microplastic-consumption-in-food-items"

!curl -L -o ./data/life-exp-countries.zip \
    "https://www.kaggle.com/api/v1/datasets/download/sahirmaharajj/country-health-trends-dataset"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 44393  100 44393    0     0  38407      0  0:00:01  0:00:01 --:--:-- 38407
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1681k  100 1681k    0     0  1280k      0  0:00:01  0:00:01 --:--:-- 4946k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  115k  100  115k    0     0   137k      0 --:--:-- --:--:-- --:--:--  398k
  % Total    % Received % Xferd  Average Speed   Tim

In [8]:
import zipfile
import os

def unzip_dataset(dataset_name):
    try:
        with zipfile.ZipFile(f'./data/{dataset_name}.zip', 'r') as zip_ref:
            zip_ref.extractall('./data/')
            print("Extracted marine microplastic dataset")
    except zipfile.BadZipFile:
        print(f"{dataset_name} file is not a valid zip")

unzip_dataset('marine-microplastic')
unzip_dataset('life-exp-countries')
unzip_dataset('gdp-countries')
unzip_dataset('food-microplastic')

# List all files in data directory
print("\nAll files in data directory:")
for file in os.listdir('./data/'):
    print(f"- {file}")

Extracted marine microplastic dataset
Extracted marine microplastic dataset
Extracted marine microplastic dataset
Extracted marine microplastic dataset

All files in data directory:
- train.csv
- marine-microplastic.zip
- gdp-countries.zip
- food-microplastic.zip
- GDP.csv
- life-exp-countries.zip
- processed_microplastics.csv
- Marine_Microplastics.csv
- gapminder.csv


In [9]:
from pyspark.sql import SparkSession

# Create Spark session≥≥≥≥
spark = SparkSession.builder \
    .appName("MicroplasticsAnalysis") \
    .getOrCreate()

In [12]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Define explicit schemas for better performance and type safety
ocean_schema = StructType([
    StructField("OBJECTID", IntegerType(), True),
    StructField("Oceans", StringType(), True),
    StructField("Regions", StringType(), True),
    StructField("SubRegions", StringType(), True),
    StructField("Sampling Method", StringType(), True),
    StructField("Measurement", DoubleType(), True),
    StructField("Unit", StringType(), True),
    StructField("Density Range", StringType(), True),
    StructField("Density Class", StringType(), True),
    StructField("Short Reference", StringType(), True),
    StructField("Long Reference", StringType(), True),
    StructField("DOI", StringType(), True),
    StructField("Organization", StringType(), True),
    StructField("Keywords", StringType(), True),
    StructField("Accession Number", StringType(), True),
    StructField("Accession Link", StringType(), True),
    StructField("Latitude", DoubleType(), True),
    StructField("Longitude", DoubleType(), True),
    StructField("Date", StringType(), True),
    StructField("GlobalID", StringType(), True),
    StructField("x", DoubleType(), True),
    StructField("y", DoubleType(), True)
])

food_schema = StructType([
    StructField("Country", StringType(), True),
    StructField("Food_Type", StringType(), True),
    StructField("Microplastic_Density", DoubleType(), True),
    StructField("Unit", StringType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Source", StringType(), True)
])

health_trends_schema = StructType([
    StructField("Country", StringType(), True),
    StructField("LifeExpectancy", DoubleType(), True),
    StructField("FertilityRate", DoubleType(), True),
    StructField("Population", LongType(), True),
    StructField("Region", StringType(), True)
])

gdp_schema = StructType([
           StructField("Country", StringType(), True),
           StructField("Country Code", StringType(), True)
       ] + [StructField(str(year), DoubleType(), True) for year in range(1990, 2020)])

# Create Bronze layer with explicit schemas
df_ocean = spark.read.csv("data/Marine_Microplastics.csv",
                        header=True,
                        schema=ocean_schema)

df_food = spark.read.csv("data/processed_microplastics.csv",
                        header=True,
                        schema=food_schema)

df_health_trends = spark.read.csv("data/gapminder.csv",
                        header=True,
                        schema=health_trends_schema)

df_gdp = spark.read.csv("data/GDP.csv",
                        header=True,
                        schema=gdp_schema)


In [ ]:
# Write Bronze layer to Parquet format
df_ocean.write.mode("overwrite").parquet("data/bronze/ocean_microplastics")
df_food.write.mode("overwrite").parquet("data/bronze/food_microplastics") 
df_gdp.write.mode("overwrite").parquet("data/bronze/gdp_data")

print("Bronze layer saved to Parquet format")

In [28]:
# Create Silver Layer - Clean and transform data for analysis

# 1. Ocean Microplastics Silver Layer
print("Creating Ocean Microplastics Silver Layer...")

# Clean ocean data: remove nulls, standardize units, add derived columns
df_ocean_silver = df_ocean.filter(
    (col("Measurement").isNotNull()) & 
    (col("Latitude").isNotNull()) & 
    (col("Longitude").isNotNull()) &
    (col("Unit") == "pieces/m3")  # Standardize on pieces/m3 unit
).select(
    col("OBJECTID").alias("sample_id"),
    col("Oceans").alias("ocean"),
    col("Measurement").alias("microplastic_density"),
)

print(f"Ocean silver layer rows: {df_ocean_silver.count()}")
df_ocean_silver.show(5)

# 2. Food Microplastics Silver Layer  
print("\nCreating Food Microplastics Silver Layer...")

# The food data appears to be malformed - let's examine it first
df_food.printSchema()
print("Sample food data:")
df_food.show(5)


Creating Ocean Microplastics Silver Layer...
Ocean silver layer rows: 14338
+---------+--------------+--------------------+
|sample_id|         ocean|microplastic_density|
+---------+--------------+--------------------+
|     9676|Atlantic Ocean|               0.018|
|     6427| Pacific Ocean|                 0.0|
|    10672| Pacific Ocean|               0.013|
|    13921|Atlantic Ocean|              1368.0|
|     9344| Pacific Ocean|               0.001|
+---------+--------------+--------------------+
only showing top 5 rows

Creating Food Microplastics Silver Layer...
root
 |-- Country: string (nullable = true)
 |-- Food_Type: string (nullable = true)
 |-- Microplastic_Density: double (nullable = true)
 |-- Unit: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Source: string (nullable = true)

Sample food data:
+-------+--------------------+--------------------+-----------+----+-----------+
|Country|           Food_Type|Microplastic_Density|       Unit|Year|     So

25/06/12 23:44:29 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 21, schema size: 6
CSV file: file:///home/agrodowski/Desktop/MIM/DE/FINAL-PROJ/data/processed_microplastics.csv


In [None]:
# Continue Silver Layer Creation

# 3. Food Microplastics Silver Layer (continued)
print("Creating proper food microplastics silver layer...")

# Read the actual food microplastics file (train.csv)
df_food_raw = spark.read.csv("data/train.csv", header=True, inferSchema=True)
print("Food data schema:")
df_food_raw.printSchema()
print("Food data sample:")
df_food_raw.show(5)

# Clean and transform food data
df_food_silver = df_food_raw.filter(
    col("Microplastic_Count").isNotNull()
).select(
    col("Country").alias("country"),
    col("Food_Type").alias("food_type"),
    col("Microplastic_Count").alias("microplastic_count"),
    col("Sample_Size_g").alias("sample_size_grams"),
    col("Detection_Method").alias("detection_method"),
    col("Study_Year").alias("study_year"),
    col("Source_Type").alias("source_type")
).withColumn(
    "microplastic_density_per_gram", 
    col("microplastic_count") / col("sample_size_grams")
).withColumn("data_source", lit("food"))

# Add pollution categories for food
df_food_silver = df_food_silver.withColumn(
    "contamination_level",
    when(col("microplastic_density_per_gram") < 0.1, "Low")
    .when(col("microplastic_density_per_gram") < 1.0, "Medium")
    .when(col("microplastic_density_per_gram") < 10.0, "High")
    .otherwise("Very High")
)

print(f"Food silver layer rows: {df_food_silver.count()}")
df_food_silver.show(5)

# 4. GDP Silver Layer
print("\nCreating GDP Silver Layer...")

# Transform GDP data from wide to long format
gdp_years = [str(year) for year in range(1990, 2020)]

# Melt GDP data to long format
df_gdp_long = df_gdp.select(
    col("Country").alias("country"),
    col("Country Code").alias("country_code"),
    *[col(year) for year in gdp_years]
)

# Create long format by unpivoting
from pyspark.sql.functions import stack, lit as spark_lit

gdp_expressions = []
for year in gdp_years:
    gdp_expressions.extend([spark_lit(int(year)), col(year)])

df_gdp_silver = df_gdp_long.select(
    col("country"),
    col("country_code"),
    expr(f"stack({len(gdp_years)}, {', '.join([f'cast({year} as int), `{year}`' for year in gdp_years])}) as (year, gdp_per_capita)")
).filter(col("gdp_per_capita").isNotNull())

# Add development level categories based on GDP
df_gdp_silver = df_gdp_silver.withColumn(
    "development_level",
    when(col("gdp_per_capita") < 1000, "Low Income")
    .when(col("gdp_per_capita") < 4000, "Lower Middle Income")
    .when(col("gdp_per_capita") < 12000, "Upper Middle Income")
    .otherwise("High Income")
)

print(f"GDP silver layer rows: {df_gdp_silver.count()}")
df_gdp_silver.show(5)

In [None]:
# Data Quality Checks and Silver Layer Validation

print("=== SILVER LAYER SUMMARY ===\n")

# Ocean data quality checks
print("1. OCEAN MICROPLASTICS SILVER LAYER")
print(f"   Total records: {df_ocean_silver.count()}")
print("   Data quality metrics:")
print(f"   - Records with valid coordinates: {df_ocean_silver.filter((col('latitude').isNotNull()) & (col('longitude').isNotNull())).count()}")
print(f"   - Records with valid dates: {df_ocean_silver.filter(col('sample_date').isNotNull()).count()}")
print(f"   - Unique oceans: {df_ocean_silver.select('ocean').distinct().count()}")

print("\n   Pollution level distribution:")
df_ocean_silver.groupBy("pollution_level").count().orderBy("count", ascending=False).show()

print("\n   Density statistics:")
df_ocean_silver.select("microplastic_density").describe().show()

# Food data quality checks  
print("\n2. FOOD MICROPLASTICS SILVER LAYER")
print(f"   Total records: {df_food_silver.count()}")
print("   Data quality metrics:")
print(f"   - Records with valid microplastic counts: {df_food_silver.filter(col('microplastic_count').isNotNull()).count()}")
print(f"   - Unique countries: {df_food_silver.select('country').distinct().count()}")
print(f"   - Unique food types: {df_food_silver.select('food_type').distinct().count()}")

print("\n   Contamination level distribution:")
df_food_silver.groupBy("contamination_level").count().orderBy("count", ascending=False).show()

print("\n   Top 10 food types by average contamination:")
df_food_silver.groupBy("food_type").agg(
    avg("microplastic_density_per_gram").alias("avg_density"),
    count("*").alias("sample_count")
).filter(col("sample_count") >= 3).orderBy("avg_density", ascending=False).show(10)

# GDP data quality checks
print("\n3. GDP SILVER LAYER") 
print(f"   Total records: {df_gdp_silver.count()}")
print(f"   Year range: {df_gdp_silver.agg(min('year'), max('year')).collect()[0]}")
print(f"   Unique countries: {df_gdp_silver.select('country').distinct().count()}")

print("\n   Development level distribution:")
df_gdp_silver.groupBy("development_level").count().orderBy("count", ascending=False).show()

# Cache silver layer dataframes for better performance
df_ocean_silver.cache()
df_food_silver.cache() 
df_gdp_silver.cache()

print("\n=== SILVER LAYER CREATION COMPLETE ===")
print("All datasets have been cleaned, transformed, and validated.")
print("Ready for Gold layer aggregations and analysis.")