# 1. Data Loading and Preprocessing



In [32]:
import pandas as pd

# Load the data and keep the untouched frame under its own name
raw_df = pd.read_csv("data/raw/gold_data.csv")

# Columns that must be numeric before they can be used for modelling
numeric_columns = ['Open', 'High', 'Low', 'Volume', 'Close']

# The export may begin with non-numeric metadata rows. Count the leading rows
# whose 'Close' value cannot be parsed and drop exactly those, instead of
# always discarding the first two rows: a file without metadata rows would
# otherwise silently lose its first two observations.
close_is_unparseable = pd.to_numeric(raw_df['Close'], errors='coerce').isna()
n_header_rows = int(close_is_unparseable.astype(int).cumprod().sum())
df = raw_df.iloc[n_header_rows:].reset_index(drop=True)

# Convert the modelling columns to numeric values (invalid parsing will turn to NaN)
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Drop rows with NaN values (if any exist)
df = df.dropna()

# Verify the cleaned data
df.head()


Unnamed: 0,Price,Adj Close,Close,High,Low,Open,Volume
0,2010-01-04,1117.699951171875,1117.699951,1122.300049,1097.099976,1117.699951,184
1,2010-01-05,1118.0999755859375,1118.099976,1126.5,1115.0,1118.099976,53
2,2010-01-06,1135.9000244140625,1135.900024,1139.199951,1120.699951,1135.900024,363
3,2010-01-07,1133.0999755859375,1133.099976,1133.099976,1129.199951,1133.099976,56
4,2010-01-08,1138.199951171875,1138.199951,1138.199951,1122.699951,1138.199951,54


# 2. Feature and Target Definition


In [33]:
# Names of the columns used as model inputs
gold_features = ['Open', 'High', 'Low', 'Volume']

# Feature matrix: one column per entry in gold_features
X = df[gold_features]

# Target vector: the Close price the model should predict
y = df['Close']


# 3. Model Training

In [34]:
# Inspect the column dtypes to confirm the feature and target columns are numeric
df.dtypes


Price         object
Adj Close     object
Close        float64
High         float64
Low          float64
Open         float64
Volume         int64
dtype: object

In [35]:
from sklearn.linear_model import LinearRegression

# Create a linear regression estimator with default settings
gold_model = LinearRegression()

# Fit it on every available row (features X, target y)
gold_model.fit(X=X, y=y)


# 4. Model Evaluation


## 4.1 First 5 Entries Evaluation

In [36]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Slice out the first 5 rows of features and targets once
first_5_X = X.head()
first_5_y = y.head()

# Predict for those 5 rows.
# NOTE: despite "adj_close" in the name, the target y is the 'Close' column.
predicted_adj_close = gold_model.predict(first_5_X)

# Mean absolute error, and root of the mean squared error, on the 5 rows
mae_first_5 = mean_absolute_error(first_5_y, predicted_adj_close)
mse_first_5 = mean_squared_error(first_5_y, predicted_adj_close)
rmse_first_5 = np.sqrt(mse_first_5)

print("MAE for first 5 entries:", mae_first_5)
print("RMSE for first 5 entries:", rmse_first_5)


MAE for first 5 entries: 8.073132837480761
RMSE for first 5 entries: 8.988455035734093


## 4.2 Evaluate on Entire Dataset

In [37]:
# Predict for every row the model has access to.
# NOTE: despite "adj_close" in the name, the target y is the 'Close' column.
predicted_adj_close_all = gold_model.predict(X)

# Mean absolute error over all rows
mae_all = mean_absolute_error(y, predicted_adj_close_all)

# Root mean squared error over all rows
mse_all = mean_squared_error(y, predicted_adj_close_all)
rmse_all = np.sqrt(mse_all)

print("MAE for entire dataset:", mae_all)
print("RMSE for entire dataset:", rmse_all)


MAE for entire dataset: 3.6472133802565736
RMSE for entire dataset: 5.4037224426954955


# 5. Train-Validation Split


In [38]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state fixes the shuffle
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
train_X, val_X, train_y, val_y = split_parts

# Refit the same estimator using only the training portion
gold_model.fit(X=train_X, y=train_y)


# 6. Validation Set Evaluation


In [39]:
# Predict for the held-out validation rows
val_predictions = gold_model.predict(val_X)

# Mean absolute error on the validation rows
mae_val = mean_absolute_error(val_y, val_predictions)

# Root mean squared error on the validation rows
mse_val = mean_squared_error(val_y, val_predictions)
rmse_val = np.sqrt(mse_val)

print("MAE on validation set:", mae_val)
print("RMSE on validation set:", rmse_val)


MAE on validation set: 3.728191898970969
RMSE on validation set: 5.7119824707801925


# 7. Save the Model

In [40]:
import joblib

# Persist the fitted estimator to a file in the working directory
model_path = 'gold_price_model.pkl'
joblib.dump(gold_model, model_path)
print("Model saved successfully.")


Model saved successfully.


# 8. Model Prediction

In [41]:
# Restore the estimator from the file written earlier
model = joblib.load('gold_price_model.pkl')

# Use the first validation row as the sample input, as a plain dict
first_val_row = val_X.iloc[0]
sample_data = first_val_row.to_dict()

# Wrap the dict in a one-row DataFrame so the model receives named columns
sample_df = pd.DataFrame([sample_data])

# Run the restored model on the sample
prediction = model.predict(sample_df)

print(f"Sample data: {sample_data}")
print(f"Predicted gold price: {prediction[0]}")


Sample data: {'Open': 1282.800048828125, 'High': 1284.0, 'Low': 1269.199951171875, 'Volume': 86.0}
Predicted gold price: 1273.3618030914886
