In [0]:
%pyspark
# Names used by later cells of this notebook. They were never imported in the
# notebook itself, so a fresh interpreter failed with NameError at the first
# use of `col`; importing them here keeps Restart -> Run All working.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import (
    col,
    dayofmonth,
    hour,
    month,
    regexp_replace,
    to_timestamp,
    year,
)

# Read every parquet part file of the crypto_data1 warehouse directory.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session injected by the %pyspark interpreter -- confirm.
parquet_df = spark.read.parquet("/user/hive/warehouse/crypto_data1/part-*.parquet")

In [1]:
%pyspark
# Preview the freshly loaded data (first 20 rows, long values truncated).
parquet_df.show(n=20, truncate=True)

In [2]:
%pyspark
# Discard the volume, change and market-cap columns in one call; the later
# cells of this notebook only read Name, Datetime and Price.
unused_columns = ["24H_VOLUME", "24H_CHANGE", "24H_CHANGE_CLEANED", "Market_Cap"]
parquet_df = parquet_df.drop(*unused_columns)

In [3]:
%pyspark
# Preview the frame after the volume/change/market-cap columns were dropped.
parquet_df.show(n=20, truncate=True)

In [4]:
%pyspark
# Keep only the rows whose Name is 'BTC', sort them by Datetime descending,
# and retain the first 50 rows of that ordering.
# NOTE(review): Datetime is only converted with to_timestamp in a later cell;
# if it is stored as a string here the ordering is lexicographic -- confirm.
btc_rows = parquet_df.filter(col("Name") == "BTC")
btc_latest_first = btc_rows.orderBy(col("Datetime").desc())
parquet_df = btc_latest_first.limit(50)

In [5]:
%pyspark
# Preview the BTC-only subset (at most 50 rows exist; 20 are printed).
parquet_df.show(n=20, truncate=True)

In [6]:
%pyspark
# Data cleaning: make Price numeric.
# Every "$" and "," is stripped from the Price text and the remainder is cast
# to float. The pattern is written as a raw string so that "\$" reaches the
# regex engine as-is instead of being an invalid Python escape sequence
# (which triggers a DeprecationWarning/SyntaxWarning on current Pythons).
# NOTE(review): "float" is Spark's 32-bit type; consider "double" if more than
# ~7 significant digits of the price matter -- confirm against model training.
parquet_df = parquet_df.withColumn(
    "Price",
    regexp_replace(col("Price"), r"[\$,]", "").cast("float"),
)
parquet_df.show()

In [7]:
%pyspark
# Parse Datetime into a timestamp in place, then derive the calendar fields
# (Year, Month, Day of month, Hour) from the parsed column.
parquet_df = (
    parquet_df
    .withColumn("Datetime", to_timestamp("Datetime"))
    .withColumn("Year", year("Datetime"))
    .withColumn("Month", month("Datetime"))
    .withColumn("Day", dayofmonth("Datetime"))
    .withColumn("Hour", hour("Datetime"))
)

In [8]:
%pyspark
# Preview the frame with the added Year/Month/Day/Hour columns.
parquet_df.show(n=20, truncate=True)

In [9]:
%pyspark
# Datetime has been expanded into Year/Month/Day/Hour and Name is constant
# after the BTC filter, so both columns are removed in a single call.
parquet_df = parquet_df.drop("Datetime", "Name")

In [10]:
%pyspark
# Preview the frame after Datetime and Name were dropped.
parquet_df.show(n=20, truncate=True)

In [11]:
%pyspark
# Pack the four calendar fields into a single vector column named `features`.
feature_columns = ["Year", "Month", "Day", "Hour"]
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_columns,
)
assembled_data = assembler.transform(parquet_df)

In [12]:
%pyspark
from pyspark.ml import PipelineModel

In [13]:
%pyspark
# Load the previously saved pipeline model from its storage directory.
model_path = "/user/root/Linear Regression_20231218003635"
loaded_model = PipelineModel.load(model_path)

In [14]:
%pyspark
# Run the loaded pipeline over the assembled rows and print the result.
scored_preview = loaded_model.transform(assembled_data)
scored_preview.show()

In [15]:
%pyspark
# Score the assembled rows, then measure the root-mean-squared error between
# the Price column (label) and the model's `prediction` column.
predictions = loaded_model.transform(assembled_data)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Price",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("RMSE:", rmse)

In [16]:
%pyspark
import json
import os
from datetime import datetime

In [17]:
%pyspark
# Persist the evaluation result as a small JSON file inside the Hive table's
# HDFS directory.
import subprocess

# Take the clock reading once so the recorded testing date and the timestamp
# embedded in the file name always describe the same instant.
run_time = datetime.now()

# Record to store: the RMSE computed earlier plus when the test was run.
model_info = {
    "rmse": rmse,
    "testing_date": str(run_time),
}

# Unique file name based on the run timestamp (second resolution).
timestamp = run_time.strftime("%Y%m%d%H%M%S")
file_name = f"test_model_info_{timestamp}.json"

# HDFS directory backing the Hive table.
hive_table_path = "/user/hive/warehouse/models_testing_infos_table"

# Temporary staging location on the local filesystem.
local_file_path = f"/tmp/{file_name}"  # Change to your preferred temporary directory

try:
    # Write the record locally first; `hdfs dfs -put` uploads from a local file.
    with open(local_file_path, "w") as json_file:
        json.dump(model_info, json_file)

    # Upload to HDFS. An argument list avoids shell interpretation of the
    # paths, and check=True raises CalledProcessError when the upload fails
    # instead of silently continuing as the ignored os.system status did.
    subprocess.run(
        ["hdfs", "dfs", "-put", local_file_path, f"{hive_table_path}/{file_name}"],
        check=True,
    )
finally:
    # Always clean up the staging file, whether or not the upload succeeded.
    if os.path.exists(local_file_path):
        os.remove(local_file_path)