In [1]:
%load_ext autoreload
%autoreload 2
import boto3
import requests
import urllib.request
import httpx
import os
import pandas as pd
import pyspark
import json
import pyarrow
import logging

from GetWeather import Import_Weather_Data
from SetupWeatherData import Setup_Weather_Data
from Upload_Weather_Data import UploadWeatherData
from ExportWeatherData import Export_Weather_Data

from dotenv import load_dotenv

from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql import Row
from pyspark.sql.window import Window
from py4j.protocol import Py4JJavaError
from datetime import datetime, timedelta
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, BooleanType

In [2]:
logging.basicConfig(
    level = logging.INFO,
    format= '%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [3]:
# Hello Github

In [4]:
# Load my .env files for API keys

# Get Current Directory
current_directory = os.getcwd()

# Load my scripts for API keys
base_dir = os.path.abspath(os.path.join(current_directory, "../../sensitive_data"))
print(base_dir)

weather_env_path = os.path.join(base_dir, "weather_api.env")
aws_env_path = os.path.join(base_dir, "aws_info.env")
google_env_path = os.path.join(base_dir, "google_info.env")
azure_env_path = os.path.join(base_dir, "azure_info.env")

load_dotenv(dotenv_path=weather_env_path)
load_dotenv(dotenv_path=aws_env_path)
load_dotenv(dotenv_path=google_env_path)
load_dotenv(dotenv_path=azure_env_path)

/home/cephuez/sensitive_data


True

In [5]:
spark = SparkSession.builder.appName("Weather_Session").getOrCreate()

25/05/28 19:15:37 WARN Utils: Your hostname, DESKTOP-J91G8VC resolves to a loopback address: 127.0.1.1; using 172.19.120.149 instead (on interface eth0)
25/05/28 19:15:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/28 19:15:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
timestamp = datetime.now().strftime("%Y-%B-%d_%H-%M")
weather_data = Import_Weather_Data(timestamp)
filename = weather_data.get_filename()

In [7]:
setup_weather_data = Setup_Weather_Data(spark, filename, timestamp)
df = setup_weather_data.get_data_frame()

Hi


                                                                                

In [8]:
weather_result_filename = setup_weather_data.get_weather_result_filename()
logger.info(f"Output folder: {weather_result_filename}")
#print(weather_result_filename)

2025-05-28 19:15:52,071 - INFO - Output folder: weather_results/50_City_Results_2025-May-28_19-15


In [9]:
#df.show()

In [10]:
export_data = Export_Weather_Data(weather_result_filename)

hi


In [11]:
# Create location DF
location_df = df.select(
    df["id"].alias("ID"),
    df["name"].alias("City"),
    df["sys.country"].alias("Country"),
    df["coord.lat"].alias("Latitude"),
    df["coord.lon"].alias("Longitude")
).orderBy("ID")

In [12]:
# Create Temperature & Pressure Table
temperature_df = df.select(
    df["id"].alias("City_ID"),
    df["main.temp"].alias("Temp"),
    df["main.temp_max"].alias("Temp_Max"),
    df["main.temp_min"].alias("Temp_Min"),
    df["main.feels_like"].alias("Feels_Like"),
    df["main.humidity"].alias("Humidity"),
    df["main.pressure"].alias("Pressure"),
    df["main.sea_level"].alias("Sea_Level")
).orderBy("City_ID")

In [13]:
# Create Wind & Clouds Table
wind_df = df.select(
    df["id"].alias("City_ID"),
    df["clouds.all"].alias("Cloudiness_Percentage"),
    df["wind.deg"].alias("Wind_Direction_Degree"),
    df["wind.gust"].alias("Gust_Speed"),
    df["wind.speed"].alias("Wind_Speed")
).orderBy("City_ID")

In [14]:
# Create Weather Description
weather_desc_df = df.select(
    df["id"].alias("City_ID"),
    df["weather"][0]["main"].alias("Main_Weather"),
    df["weather"][0]["description"].alias("Description"),
    df["weather"][0]["icon"].alias("Icon")
).orderBy("City_ID")

In [15]:
# Sunrise_Sunset_Table
sunrise_sunset_df = df.select(
    df["id"].alias("City_ID"),
    df["sys.sunrise"].alias("Sunrise"),
    df["sys.sunset"].alias("Sunset"),
    df["timezone"].alias("Timezone")
).orderBy("City_ID")

In [16]:
# Which cities have the longest daylight duration?
# Convert the time into readable time. Order by daylight hour
top_10_cities = sunrise_sunset_df.select(
    col("City_ID"), 
    date_format(from_unixtime(col("Sunrise") + col("Timezone")),"HH:mm:ss").alias("Sunrise"),
    date_format(from_unixtime(col("Sunset") + col("Timezone")),"HH:mm:ss").alias("Sunset"), 
    round(((col("Sunset") - col("Sunrise")) / 3600),2).alias("Daylight_Hours"),
    col("Timezone")).orderBy(col("Daylight_Hours").desc()).limit(10)

final_table = top_10_cities.join(location_df, top_10_cities["City_ID"] == location_df["ID"]
                ).select(location_df["ID"], 
                         location_df["City"], 
                         location_df["Country"], 
                         top_10_cities["Sunrise"], 
                         top_10_cities["Sunset"], 
                         top_10_cities["Daylight_Hours"], 
                         top_10_cities["Timezone"]
                ).orderBy(col("Daylight_Hours").desc())
#final_table.show()

final_table.createOrReplaceTempView("Final_Table")

query = '''
        SELECT ID, CITY, COUNTRY, SUNRISE, SUNSET, DAYLIGHT_HOURS, TIMEZONE, RANK()OVER(ORDER BY DAYLIGHT_HOURS DESC) RANK
        FROM FINAL_TABLE
    '''
result = spark.sql(query)
result.show()


data = result.toPandas()

export_data.to_parquet(data, 'Longest_Daytime.parquet')
export_data.to_csv(data, 'Longest_Daytime.csv')
export_data.to_json(data, 'Longest_Daytime.json')

#path = os.path.join(weather_result_filename, 'Longest_Daytime.parquet')
#data.to_parquet(path, engine='pyarrow') 
#path = os.path.join(weather_result_filename, 'Longest_Daytime.csv')
#data.to_csv(path, index=False)
#path = os.path.join(weather_result_filename, 'Longest_Daytime.json')
#data.to_json(path, orient='records', lines=True)

25/05/28 19:15:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+----------+-------+--------+--------+--------------+--------+----+
|     ID|      CITY|COUNTRY| SUNRISE|  SUNSET|DAYLIGHT_HOURS|TIMEZONE|RANK|
+-------+----------+-------+--------+--------+--------------+--------+----+
|3413829| Reykjavik|     IS|21:30:16|17:20:06|         19.83|       0|   1|
| 658225|  Helsinki|     FI|22:12:36|16:23:04|         18.17|   10800|   2|
|3143244|      Oslo|     NO|22:11:44|16:17:28|          18.1|    7200|   3|
|2673730| Stockholm|     SE|21:47:45|15:42:54|         17.92|    7200|   4|
| 524901|    Moscow|     RU|21:56:43|14:57:31|         17.01|   10800|   5|
|2618425|Copenhagen|     DK|22:37:20|15:37:19|          17.0|    7200|   6|
|2964574|    Dublin|     IE|23:06:35|15:38:44|         16.54|    3600|   7|
|2950159|    Berlin|     DE|22:52:21|15:15:32|         16.39|    7200|   8|
|2759794| Amsterdam|     NL|23:27:12|15:48:51|         16.36|    7200|   9|
| 756135|    Warsaw|     PL|22:23:31|14:43:34|         16.33|    7200|  10|
+-------+---

25/05/28 19:15:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 1

In [17]:
# Which city has the highest difference between actual temperature and feels-like temperature?
# temperature_df
top_10_cities = temperature_df.select(
    col("City_ID"), 
    col("Temp"), 
    col("Feels_Like"), 
    round(abs(col("Temp") - col("Feels_Like")),2).alias("Difference")
        ).orderBy(col("Difference").desc()).limit(10)
#top_10_cities.show()

final_top_10_cities_temperature = top_10_cities.join(location_df, top_10_cities["City_ID"] == location_df["ID"]).select(
    top_10_cities["City_ID"], 
    location_df["City"], 
    location_df["Country"], 
    top_10_cities["Temp"], 
    top_10_cities["Feels_Like"], 
    top_10_cities["Difference"]
        ).orderBy(col("Difference").desc())
#final_top_10_cities_temperature.show()

final_top_10_cities_temperature.createOrReplaceTempView("Final_Table")

query = '''
        SELECT CITY_ID, CITY, COUNTRY, TEMP, FEELS_LIKE, DIFFERENCE, RANK()OVER(ORDER BY DIFFERENCE DESC) RANK
        FROM FINAL_TABLE
    '''

final_result = spark.sql(query)
final_result.show()

data = final_result.toPandas()

export_data.to_parquet(data, 'Temperature_Feel_Like_Temperature_Diff.parquet')
export_data.to_csv(data, 'Temperature_Feel_Like_Temperature_Diff.csv')
export_data.to_json(data, 'Temperature_Feel_Like_Temperature_Diff.json')

25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+------------+-------+-----+----------+----------+----+
|CITY_ID|        CITY|COUNTRY| TEMP|FEELS_LIKE|DIFFERENCE|RANK|
+-------+------------+-------+-----+----------+----------+----+
|1701668|      Manila|     PH|30.51|     37.44|      6.93|   1|
|1609350|     Bangkok|     TH|27.94|      33.9|      5.96|   2|
|1880252|   Singapore|     SG|30.03|     35.61|      5.58|   3|
|1642911|     Jakarta|     ID|28.98|     34.47|      5.49|   4|
| 292223|       Dubai|     AE|29.96|     34.02|      4.06|   5|
|3435910|Buenos Aires|     AR| 8.79|      5.09|       3.7|   6|
|2332459|       Lagos|     NG|27.55|      31.0|      3.45|   7|
|1735161|Kuala Lumpur|     MY| 26.9|     29.43|      2.53|   8|
| 108410|      Riyadh|     SA|28.98|     27.35|      1.63|   9|
|3413829|   Reykjavik|     IS| 7.16|      5.55|      1.61|  10|
+-------+------------+-------+-----+----------+----------+----+



25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 19:15:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/28 1

In [18]:
weather_uploader = UploadWeatherData(weather_result_filename)

In [19]:
# Upload to AWS
weather_uploader.upload_to_AWS()

2025-05-28 19:15:55,096 - INFO - Found credentials in shared credentials file: ~/.aws/credentials


File: weather_results/50_City_Results_2025-May-28_19-15/Temperature_Feel_Like_Temperature_Diff.parquet uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Longest_Daytime.csv uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Longest_Daytime.parquet uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Longest_Daytime.json uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Temperature_Feel_Like_Temperature_Diff.json uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Temperature_Feel_Like_Temperature_Diff.csv uploaded


In [20]:
weather_uploader.upload_to_Google()

In [21]:
weather_uploader.upload_to_Azure()

2025-05-28 19:15:58,479 - INFO - Request URL: 'https://sspcloudstorage00001.blob.core.windows.net/sspbucket/weather_results/50_City_Results_2025-May-28_19-15/Temperature_Feel_Like_Temperature_Diff.parquet'
Request method: 'PUT'
Request headers:
    'Content-Length': '5258'
    'x-ms-blob-type': 'REDACTED'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.1 Python/3.10.17 (Linux-6.6.87.1-microsoft-standard-WSL2-x86_64-with-glibc2.39)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '7a30b164-3c2a-11f0-bba4-00155d09d405'
    'Authorization': 'REDACTED'
A body is sent with the request


Azure


2025-05-28 19:15:58,830 - INFO - Response status: 201
Response headers:
    'Content-Length': '0'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Thu, 29 May 2025 01:15:58 GMT'
    'ETag': '"0x8DD9E4E5EB0B921"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': 'a954cb14-601e-0031-3137-d048ae000000'
    'x-ms-client-request-id': '7a30b164-3c2a-11f0-bba4-00155d09d405'
    'x-ms-version': 'REDACTED'
    'x-ms-content-crc64': 'REDACTED'
    'x-ms-request-server-encrypted': 'REDACTED'
    'Date': 'Thu, 29 May 2025 01:15:58 GMT'
2025-05-28 19:15:58,832 - INFO - Request URL: 'https://sspcloudstorage00001.blob.core.windows.net/sspbucket/weather_results/50_City_Results_2025-May-28_19-15/Longest_Daytime.csv'
Request method: 'PUT'
Request headers:
    'Content-Length': '5795'
    'x-ms-blob-type': 'REDACTED'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blo

File: weather_results/50_City_Results_2025-May-28_19-15/Temperature_Feel_Like_Temperature_Diff.parquet uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Longest_Daytime.csv uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Longest_Daytime.parquet uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Longest_Daytime.json uploaded


2025-05-28 19:15:59,095 - INFO - Response status: 201
Response headers:
    'Content-Length': '0'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Thu, 29 May 2025 01:15:59 GMT'
    'ETag': '"0x8DD9E4E5EDA0BA9"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': 'a954ccb3-601e-0031-2937-d048ae000000'
    'x-ms-client-request-id': '7a30b168-3c2a-11f0-bba4-00155d09d405'
    'x-ms-version': 'REDACTED'
    'x-ms-content-crc64': 'REDACTED'
    'x-ms-request-server-encrypted': 'REDACTED'
    'Date': 'Thu, 29 May 2025 01:15:58 GMT'
2025-05-28 19:15:59,096 - INFO - Request URL: 'https://sspcloudstorage00001.blob.core.windows.net/sspbucket/weather_results/50_City_Results_2025-May-28_19-15/Temperature_Feel_Like_Temperature_Diff.csv'
Request method: 'PUT'
Request headers:
    'Content-Length': '5258'
    'x-ms-blob-type': 'REDACTED'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'User-Agent': 'a

File: weather_results/50_City_Results_2025-May-28_19-15/Temperature_Feel_Like_Temperature_Diff.json uploaded
File: weather_results/50_City_Results_2025-May-28_19-15/Temperature_Feel_Like_Temperature_Diff.csv uploaded
