In [144]:
%load_ext autoreload
%autoreload 2
import boto3
import requests
import urllib.request
import httpx
import os
import pandas as pd
import pyspark
import json
import pyarrow
import logging

from GetWeather import Import_Weather_Data
from SetupWeatherData import Setup_Weather_Data
from Upload_Weather_Data import UploadWeatherData
from ExportWeatherData import Export_Weather_Data

from dotenv import load_dotenv

from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql import Row
from pyspark.sql.window import Window
from py4j.protocol import Py4JJavaError
from datetime import datetime, timedelta
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, BooleanType

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [140]:
logging.basicConfig(
    level = logging.INFO,
    format= '%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [207]:
# Hello Github

In [203]:
# Load my .env files for API keys

# Get Current Directory
current_directory = os.getcwd()

# Load my scripts for API keys
base_dir = os.path.abspath(os.path.join(current_directory, "../../sensitive_data"))
print(base_dir)

weather_env_path = os.path.join(base_dir, "weather_api.env")
aws_env_path = os.path.join(base_dir, "aws_info.env")
google_env_path = os.path.join(base_dir, "google_info.env")
azure_env_path = os.path.join(base_dir, "azure_info.env")

load_dotenv(dotenv_path=weather_env_path)
load_dotenv(dotenv_path=aws_env_path)
load_dotenv(dotenv_path=google_env_path)
load_dotenv(dotenv_path=azure_env_path)

/home/cephuez/sensitive_data


True

In [161]:
spark = SparkSession.builder.appName("Weather_Session").getOrCreate()

In [164]:
timestamp = datetime.now().strftime("%Y-%B-%d_%H-%M")
weather_data = Import_Weather_Data(timestamp)
filename = weather_data.get_filename()

In [165]:
setup_weather_data = Setup_Weather_Data(spark, filename, timestamp)
df = setup_weather_data.get_data_frame()

Hi


In [166]:
weather_result_filename = setup_weather_data.get_weather_result_filename()
logger.info(f"Output folder: {weather_result_filename}")
#print(weather_result_filename)

2025-05-27 17:15:30,653 - INFO - Output folder: weather_results/50_City_Results_2025-May-27_17-15


In [167]:
#df.show()

In [168]:
export_data = Export_Weather_Data(weather_result_filename)

hi


In [169]:
# Create location DF
location_df = df.select(
    df["id"].alias("ID"),
    df["name"].alias("City"),
    df["sys.country"].alias("Country"),
    df["coord.lat"].alias("Latitude"),
    df["coord.lon"].alias("Longitude")
).orderBy("ID")

In [170]:
# Create Temperature & Pressure Table
temperature_df = df.select(
    df["id"].alias("City_ID"),
    df["main.temp"].alias("Temp"),
    df["main.temp_max"].alias("Temp_Max"),
    df["main.temp_min"].alias("Temp_Min"),
    df["main.feels_like"].alias("Feels_Like"),
    df["main.humidity"].alias("Humidity"),
    df["main.pressure"].alias("Pressure"),
    df["main.sea_level"].alias("Sea_Level")
).orderBy("City_ID")

In [171]:
# Create Wind & Clouds Table
wind_df = df.select(
    df["id"].alias("City_ID"),
    df["clouds.all"].alias("Cloudiness_Percentage"),
    df["wind.deg"].alias("Wind_Direction_Degree"),
    df["wind.gust"].alias("Gust_Speed"),
    df["wind.speed"].alias("Wind_Speed")
).orderBy("City_ID")

In [172]:
# Create Weather Description
weather_desc_df = df.select(
    df["id"].alias("City_ID"),
    df["weather"][0]["main"].alias("Main_Weather"),
    df["weather"][0]["description"].alias("Description"),
    df["weather"][0]["icon"].alias("Icon")
).orderBy("City_ID")

In [173]:
# Sunrise_Sunset_Table
sunrise_sunset_df = df.select(
    df["id"].alias("City_ID"),
    df["sys.sunrise"].alias("Sunrise"),
    df["sys.sunset"].alias("Sunset"),
    df["timezone"].alias("Timezone")
).orderBy("City_ID")

In [174]:
# Which cities have the longest daylight duration?
# Convert the time into readable time. Order by daylight hour
top_10_cities = sunrise_sunset_df.select(
    col("City_ID"), 
    date_format(from_unixtime(col("Sunrise") + col("Timezone")),"HH:mm:ss").alias("Sunrise"),
    date_format(from_unixtime(col("Sunset") + col("Timezone")),"HH:mm:ss").alias("Sunset"), 
    round(((col("Sunset") - col("Sunrise")) / 3600),2).alias("Daylight_Hours"),
    col("Timezone")).orderBy(col("Daylight_Hours").desc()).limit(10)

final_table = top_10_cities.join(location_df, top_10_cities["City_ID"] == location_df["ID"]
                ).select(location_df["ID"], 
                         location_df["City"], 
                         location_df["Country"], 
                         top_10_cities["Sunrise"], 
                         top_10_cities["Sunset"], 
                         top_10_cities["Daylight_Hours"], 
                         top_10_cities["Timezone"]
                ).orderBy(col("Daylight_Hours").desc())
#final_table.show()

final_table.createOrReplaceTempView("Final_Table")

query = '''
        SELECT ID, CITY, COUNTRY, SUNRISE, SUNSET, DAYLIGHT_HOURS, TIMEZONE, RANK()OVER(ORDER BY DAYLIGHT_HOURS DESC) RANK
        FROM FINAL_TABLE
    '''
result = spark.sql(query)
result.show()


data = result.toPandas()

export_data.to_parquet(data, 'Longest_Daytime.parquet')
export_data.to_csv(data, 'Longest_Daytime.csv')
export_data.to_json(data, 'Longest_Daytime.json')

#path = os.path.join(weather_result_filename, 'Longest_Daytime.parquet')
#data.to_parquet(path, engine='pyarrow') 
#path = os.path.join(weather_result_filename, 'Longest_Daytime.csv')
#data.to_csv(path, index=False)
#path = os.path.join(weather_result_filename, 'Longest_Daytime.json')
#data.to_json(path, orient='records', lines=True)

25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+----------+-------+--------+--------+--------------+--------+----+
|     ID|      CITY|COUNTRY| SUNRISE|  SUNSET|DAYLIGHT_HOURS|TIMEZONE|RANK|
+-------+----------+-------+--------+--------+--------------+--------+----+
|3413829| Reykjavik|     IS|21:35:47|17:14:04|         19.64|       0|   1|
| 658225|  Helsinki|     FI|22:14:19|16:21:06|         18.11|   10800|   2|
|3143244|      Oslo|     NO|22:13:24|16:15:33|         18.04|    7200|   3|
|2673730| Stockholm|     SE|21:49:21|15:41:03|         17.86|    7200|   4|
| 524901|    Moscow|     RU|21:57:56|14:56:02|         16.97|   10800|   5|
|2618425|Copenhagen|     DK|22:38:33|15:35:51|         16.96|    7200|   6|
|2964574|    Dublin|     IE|23:07:38|15:37:26|          16.5|    3600|   7|
|2950159|    Berlin|     DE|22:53:22|15:14:16|         16.35|    7200|   8|
|2759794| Amsterdam|     NL|23:28:12|15:47:36|         16.32|    7200|   9|
| 756135|    Warsaw|     PL|22:24:30|14:42:19|          16.3|    7200|  10|
+-------+---

25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 1

In [175]:
# Which city has the highest difference between actual temperature and feels-like temperature?
# temperature_df
top_10_cities = temperature_df.select(
    col("City_ID"), 
    col("Temp"), 
    col("Feels_Like"), 
    round(abs(col("Temp") - col("Feels_Like")),2).alias("Difference")
        ).orderBy(col("Difference").desc()).limit(10)
#top_10_cities.show()

final_top_10_cities_temperature = top_10_cities.join(location_df, top_10_cities["City_ID"] == location_df["ID"]).select(
    top_10_cities["City_ID"], 
    location_df["City"], 
    location_df["Country"], 
    top_10_cities["Temp"], 
    top_10_cities["Feels_Like"], 
    top_10_cities["Difference"]
        ).orderBy(col("Difference").desc())
#final_top_10_cities_temperature.show()

final_top_10_cities_temperature.createOrReplaceTempView("Final_Table")

query = '''
        SELECT CITY_ID, CITY, COUNTRY, TEMP, FEELS_LIKE, DIFFERENCE, RANK()OVER(ORDER BY DIFFERENCE DESC) RANK
        FROM FINAL_TABLE
    '''

final_result = spark.sql(query)
final_result.show()

data = final_result.toPandas()

export_data.to_parquet(data, 'Temperature_Feel_Like_Temperature_Diff.parquet')
export_data.to_csv(data, 'Temperature_Feel_Like_Temperature_Diff.csv')
export_data.to_json(data, 'Temperature_Feel_Like_Temperature_Diff.json')

25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+------------+-------+-----+----------+----------+----+
|CITY_ID|        CITY|COUNTRY| TEMP|FEELS_LIKE|DIFFERENCE|RANK|
+-------+------------+-------+-----+----------+----------+----+
|1701668|      Manila|     PH|28.37|     33.77|       5.4|   1|
|2332459|       Lagos|     NG|28.14|     32.62|      4.48|   2|
|1735161|Kuala Lumpur|     MY|27.57|     31.85|      4.28|   3|
|1609350|     Bangkok|     TH|26.94|     30.98|      4.04|   4|
|1880252|   Singapore|     SG|26.85|     29.16|      2.31|   5|
| 658225|    Helsinki|     FI| 9.73|      7.66|      2.07|   6|
| 108410|      Riyadh|     SA|31.02|     28.96|      2.06|   7|
|3143244|        Oslo|     NO| 6.98|      5.25|      1.73|   8|
| 292223|       Dubai|     AE|29.96|     31.65|      1.69|   9|
|  98182|     Baghdad|     IQ|28.95|     27.45|       1.5|  10|
+-------+------------+-------+-----+----------+----------+----+



25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 17:15:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/27 1

In [176]:
weather_uploader = UploadWeatherData(weather_result_filename)

Hello


In [179]:
# Upload to AWS
weather_uploader.upload_to_AWS()

us-west-1
project-cloud-saul2-00000002-data-cleaning
File: weather_results/50_City_Results_2025-May-27_17-15/Temperature_Feel_Like_Temperature_Diff.parquet uploaded
File: weather_results/50_City_Results_2025-May-27_17-15/Longest_Daytime.csv uploaded
File: weather_results/50_City_Results_2025-May-27_17-15/Longest_Daytime.parquet uploaded
File: weather_results/50_City_Results_2025-May-27_17-15/Longest_Daytime.json uploaded
File: weather_results/50_City_Results_2025-May-27_17-15/Temperature_Feel_Like_Temperature_Diff.json uploaded
File: weather_results/50_City_Results_2025-May-27_17-15/Temperature_Feel_Like_Temperature_Diff.csv uploaded


In [201]:
weather_uploader.upload_to_Google()

In [206]:
weather_uploader.upload_to_Azure()

2025-05-27 18:02:12,083 - INFO - Request URL: 'https://sspcloudstorage00001.blob.core.windows.net/sspbucket/weather_results/50_City_Results_2025-May-27_17-15/Temperature_Feel_Like_Temperature_Diff.parquet'
Request method: 'PUT'
Request headers:
    'Content-Length': '5272'
    'x-ms-blob-type': 'REDACTED'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.10.17 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.39)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '017092b8-3b57-11f0-bba4-00155d09dafc'
    'Authorization': 'REDACTED'
A body is sent with the request


Azure
loop


2025-05-27 18:02:12,421 - INFO - Response status: 201
Response headers:
    'Content-Length': '0'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Wed, 28 May 2025 00:02:12 GMT'
    'ETag': '"0x8DD9D7AE5BEDDB6"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '8adf9cd2-301e-004e-7c63-cf8735000000'
    'x-ms-client-request-id': '017092b8-3b57-11f0-bba4-00155d09dafc'
    'x-ms-version': 'REDACTED'
    'x-ms-content-crc64': 'REDACTED'
    'x-ms-request-server-encrypted': 'REDACTED'
    'Date': 'Wed, 28 May 2025 00:02:11 GMT'
2025-05-27 18:02:12,423 - INFO - Request URL: 'https://sspcloudstorage00001.blob.core.windows.net/sspbucket/weather_results/50_City_Results_2025-May-27_17-15/Longest_Daytime.csv'
Request method: 'PUT'
Request headers:
    'Content-Length': '5796'
    'x-ms-blob-type': 'REDACTED'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blo

File: weather_results/50_City_Results_2025-May-27_17-15/Temperature_Feel_Like_Temperature_Diff.parquet uploaded
loop
File: weather_results/50_City_Results_2025-May-27_17-15/Longest_Daytime.csv uploaded
loop
File: weather_results/50_City_Results_2025-May-27_17-15/Longest_Daytime.parquet uploaded
loop


2025-05-27 18:02:12,627 - INFO - Response status: 201
Response headers:
    'Content-Length': '0'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Wed, 28 May 2025 00:02:12 GMT'
    'ETag': '"0x8DD9D7AE5DE0280"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '8adf9db9-301e-004e-4d63-cf8735000000'
    'x-ms-client-request-id': '017092bb-3b57-11f0-bba4-00155d09dafc'
    'x-ms-version': 'REDACTED'
    'x-ms-content-crc64': 'REDACTED'
    'x-ms-request-server-encrypted': 'REDACTED'
    'Date': 'Wed, 28 May 2025 00:02:11 GMT'
2025-05-27 18:02:12,628 - INFO - Request URL: 'https://sspcloudstorage00001.blob.core.windows.net/sspbucket/weather_results/50_City_Results_2025-May-27_17-15/Temperature_Feel_Like_Temperature_Diff.json'
Request method: 'PUT'
Request headers:
    'Content-Length': '5272'
    'x-ms-blob-type': 'REDACTED'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'User-Agent': '

File: weather_results/50_City_Results_2025-May-27_17-15/Longest_Daytime.json uploaded
loop
File: weather_results/50_City_Results_2025-May-27_17-15/Temperature_Feel_Like_Temperature_Diff.json uploaded
loop
File: weather_results/50_City_Results_2025-May-27_17-15/Temperature_Feel_Like_Temperature_Diff.csv uploaded
