# 1. Data Loading and Preprocessing



In [49]:
import pandas as pd

# Load the dataset and skip the problematic row
df = pd.read_csv("../data/gold_data.csv", skiprows=1) 
df.head()


Unnamed: 0,Ticker,GC=F,GC=F.1,GC=F.2,GC=F.3,GC=F.4,GC=F.5
0,Date,,,,,,
1,2010-01-04,1117.699951,1117.699951,1122.300049,1097.099976,1117.699951,184.0
2,2010-01-05,1118.099976,1118.099976,1126.5,1115.0,1118.099976,53.0
3,2010-01-06,1135.900024,1135.900024,1139.199951,1120.699951,1135.900024,363.0
4,2010-01-07,1133.099976,1133.099976,1133.099976,1129.199951,1133.099976,56.0


In [56]:
from sklearn.model_selection import train_test_split
 

# 2. Rename the columns
df.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

# 3. Drop rows where 'Date' is invalid or NaN
df = df.dropna(subset=['Date'])

# 4. Convert 'Date' to datetime format explicitly
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')

# 5. Drop any rows where 'Date' could not be parsed
df = df.dropna(subset=['Date'])

# 6. Convert numeric columns to numeric type
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 7. Drop rows with missing numeric values
df = df.dropna()
df.head()



Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1,2010-01-04,1117.699951,1117.699951,1122.300049,1097.099976,1117.699951,184.0
2,2010-01-05,1118.099976,1118.099976,1126.5,1115.0,1118.099976,53.0
3,2010-01-06,1135.900024,1135.900024,1139.199951,1120.699951,1135.900024,363.0
4,2010-01-07,1133.099976,1133.099976,1133.099976,1129.199951,1133.099976,56.0
5,2010-01-08,1138.199951,1138.199951,1138.199951,1122.699951,1138.199951,54.0


In [57]:
# 8. Split the data into features (X) and target (y)
X = df[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]  # Features
y = df['Close']  # Target variable (example: predicting 'Close' prices)

# 9. Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 10. Save the training and test sets
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_path = '../data/gold_data_train.csv'
test_path = '../data/gold_data_test.csv'

train_data.to_csv(train_path, index=False)
test_data.to_csv(test_path, index=False)

# 11. Print confirmation messages
print(f"Les données d'entraînement ont été sauvegardées dans : {train_path}")
print(f"Les données de test ont été sauvegardées dans : {test_path}")



Les données d'entraînement ont été sauvegardées dans : ../data/gold_data_train.csv
Les données de test ont été sauvegardées dans : ../data/gold_data_test.csv
