# **Gold Price Prediction (2013-2023)**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,cross_val_score,GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [None]:
df=pd.read_csv('/content/drive/MyDrive/Camerin DSML/Gold Price (2013-2023).csv')
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,12/30/2022,1826.2,1821.8,1832.4,1819.8,107.50K,0.01%
1,12/29/2022,1826.0,1812.3,1827.3,1811.2,105.99K,0.56%
2,12/28/2022,1815.8,1822.4,1822.8,1804.2,118.08K,-0.40%
3,12/27/2022,1823.1,1808.2,1841.9,1808.0,159.62K,0.74%
4,12/26/2022,1809.7,1805.8,1811.95,1805.55,,0.30%


**Preprocessing**

In [None]:
df.shape

(2583, 7)

In [None]:
df.isna().sum()

Unnamed: 0,0
Date,0
Price,0
Open,0
High,0
Low,0
Vol.,5
Change %,0


In [None]:
df=df.dropna()
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,12/30/2022,1826.2,1821.8,1832.4,1819.8,107.50K,0.01%
1,12/29/2022,1826.0,1812.3,1827.3,1811.2,105.99K,0.56%
2,12/28/2022,1815.8,1822.4,1822.8,1804.2,118.08K,-0.40%
3,12/27/2022,1823.1,1808.2,1841.9,1808.0,159.62K,0.74%
5,12/23/2022,1804.2,1801.0,1812.2,1798.9,105.46K,0.50%


In [None]:
df=df.drop(columns='Change %')

In [None]:
df.isna().sum()

Unnamed: 0,0
Date,0
Price,0
Open,0
High,0
Low,0
Vol.,0


In [None]:
df.dtypes

Unnamed: 0,0
Date,object
Price,object
Open,object
High,object
Low,object
Vol.,object


In [None]:
#converting price values from object to float
cols=['Price','Open','High','Low']

for c in cols:
  df[c]=(
      df[c].astype(str).str.replace(',',"").str.replace('$',"")
  )
  df[c]=pd.to_numeric(df[c],errors='coerce')

In [None]:
#converting volumn from object eg: 100k to corresponding numeric value 100000
def parse_volume(x):
    x=str(x).strip().upper()
    if x.endswith('K'):
        return float(x[:-1]) * 1000
    elif x.endswith('M'):
        return float(x[:-1]) * 1_000_000
    elif x.endswith('B'):
        return float(x[:-1]) * 1_000_000_000
    else:
        return float(x)   # if no suffix

In [None]:
df['Vol.']=df['Vol.'].apply(parse_volume)

In [None]:
#converting date to seconds
df['Date']=pd.to_datetime(df['Date'])
df['Date (Seconds)']=df['Date'].astype('int64') // 10**9   # convert to seconds

In [None]:
df=df.drop(columns='Date')

In [None]:
df.head()

Unnamed: 0,Price,Open,High,Low,Vol.,Date (Seconds)
0,1826.2,1821.8,1832.4,1819.8,107500.0,1672358400
1,1826.0,1812.3,1827.3,1811.2,105990.0,1672272000
2,1815.8,1822.4,1822.8,1804.2,118080.0,1672185600
3,1823.1,1808.2,1841.9,1808.0,159620.0,1672099200
5,1804.2,1801.0,1812.2,1798.9,105460.0,1671753600


In [None]:
df.dtypes

Unnamed: 0,0
Price,float64
Open,float64
High,float64
Low,float64
Vol.,float64
Date (Seconds),int64


In [None]:
x=df.drop(columns='Price')
x.head()

Unnamed: 0,Open,High,Low,Vol.,Date (Seconds)
0,1821.8,1832.4,1819.8,107500.0,1672358400
1,1812.3,1827.3,1811.2,105990.0,1672272000
2,1822.4,1822.8,1804.2,118080.0,1672185600
3,1808.2,1841.9,1808.0,159620.0,1672099200
5,1801.0,1812.2,1798.9,105460.0,1671753600


In [None]:
y=df['Price']
y.head()

Unnamed: 0,Price
0,1826.2
1,1826.0
2,1815.8
3,1823.1
5,1804.2


In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=11)

In [None]:
scale=StandardScaler()
x_train=scale.fit_transform(x_train)
x_test=scale.fit_transform(x_test)

**Model Selection**

In [None]:
LR_score=cross_val_score(LinearRegression(),X=x,y=y,cv=5)
DT_score=cross_val_score(DecisionTreeRegressor(),X=x,y=y,cv=5)
RF_score=cross_val_score(RandomForestRegressor(),X=x,y=y,cv=5)
KN_score=cross_val_score(KNeighborsRegressor(),X=x,y=y,cv=5)
AD_score=cross_val_score(AdaBoostRegressor(),X=x,y=y,cv=5)
GB_score=cross_val_score(GradientBoostingRegressor(),X=x,y=y,cv=5)
XG_score=cross_val_score(XGBRegressor(),X=x,y=y,cv=5)
svm_score=cross_val_score(SVR(),X=x,y=y,cv=5)

In [None]:
print(f'Linear Regression : {np.round(np.mean(LR_score)*100,2)}%')
print(f'Decision Tree : {np.round(np.mean(DT_score)*100,2)}%')
print(f'Random Forest : {np.round(np.mean(RF_score)*100,2)}%')
print(f'KNN : {np.round(np.mean(KN_score)*100,2)}%')
print(f'Adaboost : {np.round(np.mean(AD_score)*100,2)}%')
print(f'Gradient Boost : {np.round(np.mean(GB_score)*100,2)}%')
print(f'XGB : {np.round(np.mean(XG_score)*100,2)}%')
print(f'SVM : {np.round(np.mean(svm_score)*100,2)}%')

Linear Regression : 99.61%
Decision Tree : 95.64%
Random Forest : 96.45%
KNN : -115.07%
Adaboost : 90.24%
Gradient Boost : 96.0%
XGB : 95.0%
SVM : -1250.06%


* Random Forest : 96.45%
* Decision Tree : 95.64%
* Gradient Boost : 96.0%
* XGB : 95.0%
* Adaboost : 90.24%

**Model Evaluation**

In [None]:
rf=RandomForestRegressor()
forest_params=[{'max_depth':list(range(10,15)),'max_features':list(range(0,15))}]

In [None]:
clf=GridSearchCV(rf,forest_params,cv=5,scoring='r2')
clf.fit(x_train,y_train)
print(clf.best_params_)
print(clf.best_score_)

25 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skle

{'max_depth': 11, 'max_features': 4}
0.9991481065367436


* {'max_depth': 11, 'max_features': 4}
* 0.9991481065367436

In [None]:
gb=GradientBoostingRegressor()
gb_params=[{'max_depth':list(range(2,7)),'learning_rate':[i/100 for i in range(1,21)]}] #(0.01-0.2)

In [None]:
clf=GridSearchCV(gb,gb_params,cv=5,scoring='r2')
clf.fit(x_train,y_train)
print(clf.best_params_)
print(clf.best_score_)

{'learning_rate': 0.18, 'max_depth': 5}
0.9991704113134734


* {'learning_rate': 0.18, 'max_depth': 5}
* 0.999170411313473

In [None]:
xg=XGBRegressor()
xg_params=[{'colsample_bytree':[i/10 for i in range(4,11)],'gamma':list(range(0,10))}]

In [None]:
clf=GridSearchCV(xg,xg_params,scoring='r2')
clf.fit(x_train,y_train)
print(clf.best_params_)
print(clf.best_score_)

{'colsample_bytree': 1.0, 'gamma': 2}
0.9990303564849403


* {'colsample_bytree': 1.0, 'gamma': 2}
* 0.999030356484940

As most of the model's accuracy is close to the perfect score we choose Adaboost

**Model Fixed**

In [None]:
model=AdaBoostRegressor()
model.fit(x_train,y_train)

pred_y=model.predict(x_test)
pred_y

array([1320.72382671, 1317.70165563, 1284.20724638, 1362.15090909,
       1262.43642173, 1334.49435028, 1674.52055556, 1883.79446809,
       1334.49435028, 1197.42870159, 1308.95888502, 1324.51983471,
       1447.82237443, 1253.52633136, 1367.36149425, 1362.70222841,
       1231.3399635 , 1236.07463127, 1780.95318182, 1362.15090909,
       1201.75957944, 1760.76747967, 1280.3225256 , 1362.15090909,
       1674.52055556, 1284.20724638, 1499.65164835, 1928.42750809,
       1939.65671642, 1766.92610619, 1801.36969697, 1407.89420849,
       1498.72118644, 1284.20724638, 1668.4583691 , 1231.3399635 ,
       1843.40557769, 1231.3399635 , 1608.97391304, 1324.51983471,
       1407.89420849, 1248.87622378, 1129.3997555 , 1324.51983471,
       1231.3399635 , 1317.70165563, 1883.79446809, 1224.03157895,
       1218.68058252, 1711.65074627, 1939.65671642, 1317.70165563,
       1201.75957944, 1231.3399635 , 1329.95498891, 1934.64555556,
       1306.48235294, 1248.87622378, 1363.38516129, 1237.84233

In [None]:
a=model.score(x_test,y_test)
print(f'Accuracy : {np.round((a*100),2)}%')

Accuracy : 99.7%


**Deployment**

In [None]:
from datetime import datetime

Open=float(input("Enter the open price: "))
High=float(input("Enter the highest price: "))
Low=float(input("Enter the lowest price: "))
Vol=float(input("Enter the volume: "))
date_str=input("Enter the date (YYYY-MM-DD): ")

# Parse the string into a datetime object
Date=datetime.strptime(date_str, "%Y-%m-%d")
Date=Date.timestamp()

Enter the open price: 85000
Enter the highest price: 90000
Enter the lowest price: 75000
Enter the volume: 120
Enter the date (YYYY-MM-DD): 2025-11-26


In [None]:
final={
    "Open":[Open],
    "High":[High],
    "Low":[Low],
    "Vol.":[Vol],
    "Date (Seconds)":[Date]
}
data=pd.DataFrame(final,columns=x.columns)
data_scaled=scale.transform(data)

In [None]:
a=model.predict(data_scaled)
print(f'Price will be : {a}Rs')

Price will be : Rs[2004.93534483]
