In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt

In [30]:
df = pd.read_csv('data/forex_predictions_data.csv')

In [31]:
# inspect
print(df.head())
print(df.info())
print(df.describe())
print(df.shape)

         Date     Open     High      Low    Close Volume  Predicted_Close  \
0  2024-01-01  1.18727  1.92461  0.85312  1.18154   2201          1.22984   
1  2024-01-02  1.47536  1.82881  0.54067  1.32296  error          1.03797   
2  2024-01-03  1.36600  1.78415  0.54242  1.28539   4420          1.03888   
3  2024-01-04  1.29933  1.54684  0.99332  1.17805   4079          1.00117   
4  2024-01-05  1.07801  1.68386  0.68714      NaN   1832          1.48385   

  Currency_Pair Signal  Confidence  
0       EUR/USD   Hold        0.90  
1       EUR/USD   Sell         NaN  
2       EUR/USD   Sell         NaN  
3       EUR/USD   Sell        0.64  
4       EUR/USD   Sell        0.68  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             226 non-null    object 
 1   Open             224 non-null    float64
 2   High             220 

In [32]:
#show nulls
print(df.isnull().sum())

Date                3
Open                5
High                9
Low                 4
Close              17
Volume              3
Predicted_Close     7
Currency_Pair       0
Signal              2
Confidence         11
dtype: int64


In [33]:
# turn numeric data to NaN
cols = df.select_dtypes(include=['float64', 'int64']).columns
cols = list(cols) + ['Volume']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')


for col in cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)
        
#show nulls again
print(df.isnull().sum())

Date               3
Open               0
High               0
Low                0
Close              0
Volume             0
Predicted_Close    0
Currency_Pair      0
Signal             2
Confidence         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [34]:
# handle categorical data
cat_cols = ['Signal','Currency_Pair']
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        # fill NaN with mode
        df[col] = df[col].fillna(df[col].mode()[0])  # fill NaN with mode
    
#show nulls again
print(df.isnull().sum())

Date               3
Open               0
High               0
Low                0
Close              0
Volume             0
Predicted_Close    0
Currency_Pair      0
Signal             0
Confidence         0
dtype: int64


In [35]:
# coerce date
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
print(df.isnull().sum())

Date               3
Open               0
High               0
Low                0
Close              0
Volume             0
Predicted_Close    0
Currency_Pair      0
Signal             0
Confidence         0
dtype: int64


In [37]:
print (df.duplicated(subset=['Date']).sum())

12
