#  Data Loading and Preprocessing



In [23]:
import pandas as pd

# Load the dataset and skip the problematic row
df = pd.read_csv("../data/gold_data.csv", skiprows=1) 
df.head()


Unnamed: 0,Ticker,GC=F,GC=F.1,GC=F.2,GC=F.3,GC=F.4,GC=F.5
0,Date,,,,,,
1,2010-01-04,1117.699951,1117.699951,1122.300049,1097.099976,1117.699951,184.0
2,2010-01-05,1118.099976,1118.099976,1126.5,1115.0,1118.099976,53.0
3,2010-01-06,1135.900024,1135.900024,1139.199951,1120.699951,1135.900024,363.0
4,2010-01-07,1133.099976,1133.099976,1133.099976,1129.199951,1133.099976,56.0


## 1. Rename the columns

In [24]:
from sklearn.model_selection import train_test_split


df.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,Date,,,,,,
1,2010-01-04,1117.699951,1117.699951,1122.300049,1097.099976,1117.699951,184.0
2,2010-01-05,1118.099976,1118.099976,1126.5,1115.0,1118.099976,53.0
3,2010-01-06,1135.900024,1135.900024,1139.199951,1120.699951,1135.900024,363.0
4,2010-01-07,1133.099976,1133.099976,1133.099976,1129.199951,1133.099976,56.0


## 2. Drop rows where 'Date' is invalid or NaN

In [25]:

df = df.dropna(subset=['Date'])


## 3. Convert 'Date' to datetime format explicitly

In [26]:

df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')

## 4. Drop any rows where 'Date' could not be parsed

In [27]:
df = df.dropna(subset=['Date'])


## 5. Convert numeric columns to numeric type


In [28]:
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')


## 6. Drop rows with missing numeric values

In [29]:

df = df.dropna()


## 7. Save the cleaned dataset

In [30]:
clean_data_path = '../data/gold_data_clean.csv'
df.to_csv(clean_data_path, index=False)

## 8. Print confirmation message


In [31]:
print(f"Les données nettoyées ont été sauvegardées dans : {clean_data_path}")


Les données nettoyées ont été sauvegardées dans : ../data/gold_data_clean.csv
