# **Linear Regression – Gold price prediction**

## Import Libraries and Start PySpark Session

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, window, to_date, monotonically_increasing_id
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import matplotlib.pyplot as plt
import pandas as pd

# Obtain the Spark session for this job (getOrCreate returns an existing
# session if one is already active, otherwise it builds a new one).
spark = (
    SparkSession.builder
    .appName("GoldPricePrediction")
    .getOrCreate()
)


## 1. Data Preprocessing

In [None]:
# 1. Load Data and Preprocessing
# Read the raw price table; the header row supplies the column names and
# inferSchema=True asks Spark to guess the column types.
try:
    df = spark.read.csv(
        "gold_prices.csv",
        header=True,
        inferSchema=True
    )
except Exception as e:
    print(f"Error reading the CSV file: {e}")
    spark.stop()
    # Abort with a non-zero status and keep the original cause attached.
    # The bare `exit()` helper is injected by the `site` module for
    # interactive use and would report success (status 0) here.
    raise SystemExit(1) from e

# Rename columns to remove spaces
df = (
    df.withColumnRenamed("Buy Price", "BuyPrice")
      .withColumnRenamed("Sell Price", "SellPrice")
)

# Parse 'Date' into a date type using the "yyyy/MM/dd" pattern
df = df.withColumn("Date", to_date(col("Date"), "yyyy/MM/dd"))

# Drop rows whose 'Date' is null, then sort chronologically
df = df.filter(col("Date").isNotNull()).orderBy("Date")

# 'SellPrice' is the target gold price for this task.
# Make sure both price columns are numeric (double).
df = df.withColumn("SellPrice", col("SellPrice").cast("double"))
df = df.withColumn("BuyPrice", col("BuyPrice").cast("double"))

# Drop rows whose 'SellPrice' is null after the cast
# (only the target column is checked; 'BuyPrice' may still be null)
df = df.dropna(subset=["SellPrice"])

# Add an 'index' column; it is used later as a tie-breaker after 'Date'
# when ordering rows for the window functions
df = df.withColumn("index", monotonically_increasing_id())


## 2. Generate Features and Label

In [None]:

# 2. Generate Features and Labels
# Window ordered by 'Date', with 'index' breaking ties between rows that
# share a date. No partitionBy is given, so the window spans the whole
# dataset.
window_spec = Window.orderBy("Date", "index")

# Feature names price_lag_1 .. price_lag_10: the SellPrice of the 1st..10th
# preceding row in window order (rows, not calendar days).
feature_cols = [f"price_lag_{i}" for i in range(1, 11)]

# Add the label (the current row's 'SellPrice') and every lag feature in a
# single select(). Chaining withColumn() in a loop adds one projection per
# call, which the PySpark documentation advises against.
df_features = df.select(
    "*",
    col("SellPrice").alias("label"),
    *[
        lag("SellPrice", i).over(window_spec).alias(name)
        for i, name in enumerate(feature_cols, start=1)
    ],
)

# Remove rows with null values in features (the earliest rows have no
# complete 10-row history, so their lag columns are null)
df_features = df_features.dropna(subset=feature_cols)

# take(1) fetches at most one row, so the emptiness check does not need a
# full count of the dataset.
if not df_features.take(1):
    print("No data available after feature engineering. This might be due to insufficient data points for the 10-day lag or issues with data.")
    df.show(15) # Show some initial data for debugging
    spark.stop()
    # Non-zero exit status: this is a failure path. (`exit()` is an
    # interactive helper from `site` and would report success.)
    raise SystemExit(1)


## 3. Prepare Data for Spark ML

In [None]:
# Pack the lag columns into one vector column named "features".
# handleInvalid="skip" drops rows that contain invalid values instead of
# raising an error.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip",
)
df_assembled = assembler.transform(df_features)

# Keep only the two columns the regression model consumes
model_columns = ("features", "label")
final_df = df_assembled.select(*model_columns)

## 4. Split Data into Training and Test Sets


In [None]:
# Randomly split the assembled data 70% / 30% with a fixed seed so the split
# is reproducible.
# NOTE(review): the features are lagged prices of a time-ordered series, so a
# random split lets neighbouring (overlapping) rows land on both sides and
# test metrics may look optimistic -- consider a chronological split; confirm
# against the task requirements.
train_df, test_df = final_df.randomSplit([0.7, 0.3], seed=42)

# Cache the dataframes for performance
train_df.cache()
test_df.cache()

# Count each split once and reuse the numbers in the diagnostics below
train_count = train_df.count()
test_count = test_df.count()

if train_count == 0 or test_count == 0:
    print("Training or testing dataframe is empty after split. Check data and feature engineering steps.")
    print(f"Total samples before split: {final_df.count()}")
    print(f"Training samples: {train_count}, Test samples: {test_count}")
    spark.stop()
    # Non-zero exit status: this is a failure path. (`exit()` is an
    # interactive helper from `site` and would report success.)
    raise SystemExit(1)


## 5. Train Linear Regression Model


In [None]:
# Linear regression on the "features" vector against "label".
# The "l-bfgs" solver is iterative; the loss chart later in this file reads
# the per-iteration objective from the model summary. maxIter can be adjusted.
lr = LinearRegression(featuresCol="features", labelCol="label", solver="l-bfgs", maxIter=100)

# Train the model
try:
    lr_model = lr.fit(train_df)
except Exception as e:
    print(f"Error during model training: {e}")
    error_text = str(e)
    # Print extra diagnostics for two known failure messages
    if "requirement failed: Column features must be of type equal to one of the following types" in error_text:
        print("This error often means the 'features' column was not created correctly or has an unexpected data type.")
        print("Schema of train_df:")
        train_df.printSchema()
        print("Sample of train_df (first 5 rows):")
        train_df.show(5, truncate=False)
    elif "Nothing has been added to this summarizer." in error_text:
        print("This error often means the training dataframe is empty or has no valid features/labels.")
        print(f"Training data count: {train_df.count()}")
        train_df.show(5,truncate=False)
    spark.stop()
    # Abort with a non-zero status and keep the original cause attached.
    # The bare `exit()` helper is meant for interactive use and would
    # report success (status 0) here.
    raise SystemExit(1) from e

## 6. Evaluate Model

In [None]:


# Score both splits with the fitted model
train_predictions = lr_model.transform(train_df)
test_predictions = lr_model.transform(test_df)

# One evaluator per metric; both compare "label" against "prediction"
evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName=metric)
    for metric in ("rmse", "r2")
)

# Metrics on the training split
train_rmse, train_r2 = (
    evaluator_rmse.evaluate(train_predictions),
    evaluator_r2.evaluate(train_predictions),
)

# Metrics on the held-out test split
test_rmse, test_r2 = (
    evaluator_rmse.evaluate(test_predictions),
    evaluator_r2.evaluate(test_predictions),
)

# Report all four numbers with four decimal places
print("\n--- Model Evaluation ---")
print(f"Training Set RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}")
print(f"Test Set RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}")

# 7. Visualize Results

# a. Line chart of the optimizer's objective value at each training iteration
training_summary = lr_model.summary
objective_history = training_summary.objectiveHistory

# Only plot when the summary actually carries a non-empty history
if objective_history:
    plt.figure(figsize=(10, 6))
    plt.plot(objective_history)
    plt.xlabel("Iteration")
    # objectiveHistory holds the solver's objective (scaled loss plus
    # regularization) per iteration -- it is not RMSE, so the axis and
    # title must not be labelled as RMSE.
    plt.ylabel("Objective Function (scaled loss + regularization)")
    plt.title("Training Objective History")
    plt.grid(True)
    # Save to disk rather than plt.show(), which might block in a script
    plt.savefig("training_loss_chart.png")
    print("\nTraining loss chart (training_loss_chart.png) generated.")
else:
    print("\nObjective history not available or empty. Cannot plot training loss.")


# b. Bar chart contrasting RMSE on the training and test sets
metrics_data_pd = {
    'Set': ['Training', 'Test'],
    'RMSE': [train_rmse, test_rmse]
}
metrics_df_pd_df = pd.DataFrame(metrics_data_pd)

# Leave 20% headroom above the taller bar; fall back to 1 when both are 0
largest_rmse = max(train_rmse, test_rmse)
y_upper = largest_rmse * 1.2 if largest_rmse > 0 else 1

# Vertical gap between the top of a bar and its value annotation
label_offset = 0.01 * max(train_rmse, test_rmse, 1)

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(metrics_df_pd_df['Set'], metrics_df_pd_df['RMSE'], color=['skyblue', 'lightcoral'])
ax.set_xlabel("Data Set")
ax.set_ylabel("Root Mean Squared Error (RMSE)")
ax.set_title("RMSE Comparison: Training vs. Test Set")
ax.set_ylim(0, y_upper)

# Annotate each bar with its RMSE value, centred just above the bar
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2.0, height + label_offset, f'{height:.2f}', ha='center', va='bottom')

# Save to disk instead of showing interactively
fig.savefig("rmse_comparison_chart.png")
print("RMSE comparison bar chart (rmse_comparison_chart.png) generated.")

# Pull at most 100 test predictions into pandas for plotting.
# NOTE(review): no ordering is applied before limit(), so which rows are
# sampled (and their order on the x-axis) is presumably arbitrary rather
# than chronological -- confirm whether that is intended.
sample_test_pd = (
    test_predictions
    .select("label", "prediction")
    .limit(100)
    .toPandas()
)

if sample_test_pd.empty:
    print("Test predictions sample is empty, skipping actual vs. predicted plot.")
else:
    positions = sample_test_pd.index
    fig, ax = plt.subplots(figsize=(12, 6))
    # Actual and predicted prices share the same x positions (row index)
    ax.plot(positions, sample_test_pd['label'], label='Actual Prices', marker='o', linestyle='-')
    ax.plot(positions, sample_test_pd['prediction'], label='Predicted Prices', marker='x', linestyle='--')
    ax.set_xlabel("Sample Index (from Test Set)")
    ax.set_ylabel("Gold Price")
    ax.set_title("Actual vs. Predicted Gold Prices (Sample from Test Set)")
    ax.legend()
    ax.grid(True)
    # Save to disk instead of showing interactively
    fig.savefig("actual_vs_predicted_test_sample.png")
    print("Actual vs. Predicted prices chart (actual_vs_predicted_test_sample.png) generated for a sample of the test set.")

# Shut down the Spark session; no Spark work happens past this point
spark.stop()

# Final status message listing the chart files written above
print("\nProcess completed successfully.")
print(
    "Please check the generated image files: "
    "training_loss_chart.png, rmse_comparison_chart.png, "
    "and actual_vs_predicted_test_sample.png (if data was sufficient)."
)