In [3]:
import os
import sys
from pyspark.sql import SparkSession

# Use the interpreter running this kernel for both the PySpark driver and the
# executors, so the two sides launch the same Python.
for _env_key in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[_env_key] = sys.executable

In [None]:
# Create (or reuse, if one is already running) a Spark session in local mode.
spark = (
    SparkSession.builder
    .appName("WeatherLinearRegression")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
from pyspark.sql.functions import col, to_date, dayofyear
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# HDFS location of the raw weather CSV (stray double slash removed from the path).
original_csv_path = "hdfs://localhost:9000/weather_data/weatherbigdata.csv"

In [None]:
# Load the CSV, treating the first row as the header and letting Spark infer
# the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(original_csv_path)
)

In [None]:
# Step 2.1: keep only rows where both temperature readings are present
clean_df = df.dropna(subset=["TMAX", "TMIN"])

# Step 2.2: divide the raw readings by 10 (presumably tenths of a degree
# Celsius in the source data — TODO confirm) and derive the daily range
transformed_df = (
    clean_df
    .withColumn("TMAX_actual", col("TMAX") / 10.0)
    .withColumn("TMIN_actual", col("TMIN") / 10.0)
    .withColumn("TRANGE", col("TMAX_actual") - col("TMIN_actual"))
)

# Step 2.3: data-quality gate — discard rows where the maximum is below the minimum
validated_df = transformed_df.where(col("TMAX_actual") >= col("TMIN_actual"))

# Keep the date plus the numeric columns (all as doubles), then drop any row
# that still contains a null in one of the selected columns.
numeric_columns = [
    col(name).cast("double")
    for name in ("Latitude", "Longitude", "Elevation", "TMAX_actual")
]
selected_df = validated_df.select(col("DATE"), *numeric_columns).dropna()

In [None]:
# Preview the cleaned, selected columns.
selected_df.show()

In [None]:
# Derive the day-of-year feature from the M/d/yyyy DATE string.
# selected_df was null-filtered before this column existed, so a DATE that does
# not parse would leave a null DayOfYear; VectorAssembler rejects nulls by
# default, so drop those rows here before assembling.
featured_df = (
    selected_df
    .withColumn("DayOfYear", dayofyear(to_date(col("DATE"), "M/d/yyyy")))
    .na.drop(subset=["DayOfYear"])
)

# Pack the predictor columns into the single vector column Spark ML expects.
feature_columns = ["Latitude", "Longitude", "Elevation", "DayOfYear"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
final_df = assembler.transform(featured_df)
final_df.show()

In [None]:
# Keep only what the estimator needs: the feature vector and the target,
# with TMAX_actual renamed to "label".
model_df = final_df.select(
    col("features"),
    col("TMAX_actual").alias("label"),
)
model_df.show(truncate=False)

In [None]:
# 80/20 train/test split; the fixed seed keeps the split the same on every run.
training_data, test_data = model_df.randomSplit([0.8, 0.2], seed=42)

# Fit a linear regression of "label" on the assembled feature vector.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
)
model = lr.fit(training_data)

In [None]:
# Score the held-out split with the fitted model.
predictions = model.transform(test_data)

In [None]:
# Preview ten actual-vs-predicted pairs from the test split.
print("Predictions on test data:")
predictions.select(col("label"), col("prediction")).show(n=10)

In [None]:
# Step 8: score the test-set predictions with RMSE and R-squared.
# Both evaluators compare the "label" and "prediction" columns and differ only
# in the metric they report.
evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName=metric)
    for metric in ("rmse", "r2")
)

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")
print(f"R-squared (R2) on test data: {r2}")

In [None]:
# Gradient-Boosted Tree (GBT)

In [None]:
from pyspark.sql.functions import col, month, dayofyear, to_date
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Create (or reuse) the local Spark session, requesting 8g of driver memory.
# NOTE(review): getOrCreate() hands back an already-running session if there
# is one, and driver memory is presumably fixed once the driver JVM has
# started — so this setting likely only takes effect when this is the first
# session created after a kernel restart. Confirm against the Spark docs.
spark = (
    SparkSession.builder
    .appName("WeatherLinearRegression")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [None]:
# Explicit schema for the weather CSV: two string columns followed by seven
# nullable float columns.
# NOTE(review): the field order presumably has to match the column order in
# the CSV file — confirm against the data.
_weather_fields = [
    ("ID", StringType()),
    ("DATE", StringType()),
    ("TMAX", FloatType()),
    ("TMIN", FloatType()),
    ("EVAP", FloatType()),
    ("PRCP", FloatType()),
    ("Latitude", FloatType()),
    ("Longitude", FloatType()),
    ("Elevation", FloatType()),
]
weather_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _weather_fields]
)

In [None]:
# Read the weather CSV from HDFS with the explicit schema; the first row of
# the file is a header.
original_csv_path = "hdfs://localhost:9000/weather_data/weatherbigdata.csv"
df = spark.read.csv(
    original_csv_path,
    schema=weather_schema,
    header=True,
)

In [None]:
# Columns that must be non-null for a row to be usable by the model.
_required_columns = ["TMAX_actual", "Latitude", "Longitude", "Elevation", "month", "day_of_year"]

# Scale TMAX by 1/10, parse the M/d/yyyy date string, derive the calendar
# features, then drop every row missing a required value.
features_df = (
    df
    .withColumn("TMAX_actual", col("TMAX") / 10.0)
    .withColumn("date_formatted", to_date(col("DATE"), "M/d/yyyy"))
    .withColumn("month", month(col("date_formatted")))
    .withColumn("day_of_year", dayofyear(col("date_formatted")))
    .dropna(subset=_required_columns)
)

In [None]:
# Combine the location and calendar columns into one "features" vector column.
gbt_input_columns = ["Latitude", "Longitude", "Elevation", "month", "day_of_year"]
assembler = VectorAssembler(inputCols=gbt_input_columns, outputCol="features")
assembled_df = assembler.transform(features_df)

In [None]:
# Reduce to the (features, label) pair, with TMAX_actual renamed to "label".
model_df = assembled_df.select(
    "features",
    col("TMAX_actual").alias("label"),
)

In [None]:
# 80/20 train/test split with a fixed seed.
trainingData, testData = model_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Configure a gradient-boosted tree regressor with maxIter=10 and fit it on
# the training split.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)
gbt_model = gbt.fit(trainingData)

In [1]:
# Decision Tree Regressor

In [2]:
from pyspark.sql.functions import col, to_date, dayofyear
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
# Create (or reuse, if one is already running) a Spark session in local mode.
spark = (
    SparkSession.builder
    .appName("WeatherLinearRegression")
    .master("local[*]")
    .getOrCreate()
)

In [5]:
# Path to the weather CSV in HDFS (stray double slash removed from the path).
original_csv_path = "hdfs://localhost:9000/weather_data/weatherbigdata.csv"

# Step 2: load the data, treating the first row as the header and letting
# Spark infer the column types.
df = spark.read.csv(original_csv_path, header=True, inferSchema=True)

In [6]:
# Step 2.1: keep only rows where both temperature readings are present
clean_df = df.dropna(subset=["TMAX", "TMIN"])

# Step 2.2: divide the raw readings by 10 (presumably tenths of a degree
# Celsius in the source data — TODO confirm) and derive the daily range
transformed_df = (
    clean_df
    .withColumn("TMAX_actual", col("TMAX") / 10.0)
    .withColumn("TMIN_actual", col("TMIN") / 10.0)
    .withColumn("TRANGE", col("TMAX_actual") - col("TMIN_actual"))
)

# Step 2.3: data-quality gate — discard rows where the maximum is below the minimum
validated_df = transformed_df.where(col("TMAX_actual") >= col("TMIN_actual"))

In [8]:
# Keep the date plus the numeric columns (all as doubles), then drop any row
# that contains a null in one of the selected columns.
numeric_columns = [
    col(name).cast("double")
    for name in ("Latitude", "Longitude", "Elevation", "TMAX_actual")
]
selected_df = validated_df.select(col("DATE"), *numeric_columns).dropna()

In [12]:
# Add the day-of-year feature parsed from the M/d/yyyy DATE string.
# selected_df was null-filtered before this column existed, so a DATE that does
# not parse would leave a null DayOfYear; VectorAssembler rejects nulls by
# default, so drop those rows here before they reach the assembler.
featured_df = (
    selected_df
    .withColumn("DayOfYear", dayofyear(to_date(col("DATE"), "M/d/yyyy")))
    .na.drop(subset=["DayOfYear"])
)
featured_df.show()

+----------+--------+---------+---------+-----------+---------+
|      DATE|Latitude|Longitude|Elevation|TMAX_actual|DayOfYear|
+----------+--------+---------+---------+-----------+---------+
|  3/7/2011| 64.2381|-145.2669|    463.3|       -2.8|       66|
| 9/26/2012| 41.9622| -84.9925|    299.9|       20.0|      270|
| 12/3/2000|    33.9| -80.5206|     76.2|        3.3|      338|
| 5/10/2007| 37.0539| -93.5756|    399.3|       26.7|      130|
|  8/2/2018| 45.3614| -84.9511|    228.0|       26.7|      214|
| 4/16/2008| 36.5869| -89.5325|     92.0|       15.0|      107|
| 7/11/2012| 42.6514|-111.5833|   1780.6|       35.6|      193|
|  1/5/1997| 38.3683| -78.2503|    175.9|       23.9|        5|
|  2/8/2021| 36.5728|  -79.335|    168.2|       10.6|       39|
| 2/19/1998| 44.4419|-100.4172|    506.9|        3.3|       50|
| 6/21/2012| 45.4539|-121.1303|    405.4|       31.1|      173|
|  7/3/1995| 46.3747|-102.3211|    739.4|       29.4|      184|
| 12/1/2002| 38.3794| -81.5911|    278.0

In [10]:
# Pack the four predictors into a single "features" vector column.
feature_columns = ["Latitude", "Longitude", "Elevation", "DayOfYear"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Assemble, then keep only the feature vector and the target renamed to "label".
assembled = assembler.transform(featured_df)
final_df = assembled.select(col("features"), col("TMAX_actual").alias("label"))

# Show the first five rows without truncating the vector column.
final_df.show(n=5, truncate=False)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[64.2381,-145.2669,463.3,66.0]|-2.8 |
|[41.9622,-84.9925,299.9,270.0]|20.0 |
|[33.9,-80.5206,76.2,338.0]    |3.3  |
|[37.0539,-93.5756,399.3,130.0]|26.7 |
|[45.3614,-84.9511,228.0,214.0]|26.7 |
+------------------------------+-----+
only showing top 5 rows



In [13]:
# 80/20 train/test split with a fixed seed.
training_data, test_data = final_df.randomSplit([0.8, 0.2], seed=42)

In [14]:
# Configure a decision tree regressor with maxDepth=10 and fit it on the
# training split.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="label",
    maxDepth=10,
)
model = dt.fit(training_data)


In [15]:
# Score the held-out split and preview ten actual-vs-predicted pairs.
predictions = model.transform(test_data)
predictions.select(col("label"), col("prediction")).show(n=10)

+-----+------------------+
|label|        prediction|
+-----+------------------+
| 26.1|24.783049748825153|
| 25.0|24.783049748825153|
| 26.1|24.783049748825153|
| 26.7|24.588842315369263|
| 25.0|26.486188913962792|
| 25.6|26.486188913962792|
| 27.8| 29.16070818710386|
| 27.2| 29.16070818710386|
| 26.7|28.079597154023443|
| 26.7|28.079597154023443|
+-----+------------------+
only showing top 10 rows



In [16]:
# Score the test-set predictions with RMSE and R-squared. Both evaluators
# compare the "label" and "prediction" columns and differ only in the metric.
evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName=metric)
    for metric in ("rmse", "r2")
)

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"Decision Tree RMSE: {rmse}")
print(f"Decision Tree R2: {r2}")

Decision Tree RMSE: 7.037154260163144
Decision Tree R2: 0.6867188946792728


In [17]:
# Persist the fitted decision tree to HDFS.
# write().overwrite() replaces any model already stored at this path, so the
# cell can be re-run without picking a fresh path each time; a plain save()
# fails when the target already exists.
model_path = "hdfs://localhost:9000/models/dt_weather_model"
model.write().overwrite().save(model_path)
print(f"Model saved to {model_path}")

Model saved to hdfs://localhost:9000/models/dt_weather_model
