In [2]:
import pandas as pd
import numpy as np
import pandasql as psql

# Load the dataset
file_path = '../data/raw/indexData.csv'
df = pd.read_csv(file_path)



query = """
SELECT * 
FROM df
WHERE Open IS NOT NULL
AND Close IS NOT NULL
AND Volume IS NOT NULL
"""

# Execute the query
df_cleaned = psql.sqldf(query, locals())


# 1. Drop rows with any missing (NaN) values
#df_cleaned = df.dropna()

# Define thresholds for outliers based on the 1st and 99th percentiles for each feature

volume_threshold = (df_cleaned['Volume'].quantile(0.01), df_cleaned['Volume'].quantile(0.99))
open_threshold = (df_cleaned['Open'].quantile(0.01), df_cleaned['Open'].quantile(0.99))
high_threshold = (df_cleaned['High'].quantile(0.01), df_cleaned['High'].quantile(0.99))
low_threshold = (df_cleaned['Low'].quantile(0.01), df_cleaned['Low'].quantile(0.99))
close_threshold = (df_cleaned['Close'].quantile(0.01), df_cleaned['Close'].quantile(0.99))
adj_close_threshold = (df_cleaned['Adj Close'].quantile(0.01), df_cleaned['Adj Close'].quantile(0.99))

# Remove rows where 'Volume', 'Open', 'High', 'Low', 'Close', or 'Adj Close' fall outside of these thresholds
df_cleaned = df_cleaned[
    (df_cleaned['Volume'] >= volume_threshold[0]) & (df_cleaned['Volume'] <= volume_threshold[1]) &
    (df_cleaned['Open'] >= open_threshold[0]) & (df_cleaned['Open'] <= open_threshold[1]) &
    (df_cleaned['High'] >= high_threshold[0]) & (df_cleaned['High'] <= high_threshold[1]) &
    (df_cleaned['Low'] >= low_threshold[0]) & (df_cleaned['Low'] <= low_threshold[1]) &
    (df_cleaned['Close'] >= close_threshold[0]) & (df_cleaned['Close'] <= close_threshold[1]) &
    (df_cleaned['Adj Close'] >= adj_close_threshold[0]) & (df_cleaned['Adj Close'] <= adj_close_threshold[1])
]

# 3. Remove Duplicates (if any)
df_cleaned = df_cleaned.drop_duplicates()

# 4. (Optional) Filter rows based on a specific date range (assuming there's a 'Date' column)
# Convert 'Date' column to datetime if needed
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'])

# 5. Example of creating a new feature (similar to `CloseUSD`)
# Let's assume we are converting 'Close' price to USD using a fixed exchange rate
usd_conversion_rate = 0.85  # Example conversion rate
df_cleaned['CloseUSD'] = df_cleaned['Close'] * usd_conversion_rate



# Create daily price difference
df_cleaned['price_diff'] = df_cleaned['Close'] - df_cleaned['Open']

# Calculate moving averages for closing prices (5-day, 10-day, 30-day)
df_cleaned['ma_5'] = df_cleaned['Close'].rolling(window=5).mean()
df_cleaned['ma_10'] = df_cleaned['Close'].rolling(window=10).mean()
df_cleaned['ma_30'] = df_cleaned['Close'].rolling(window=30).mean()

# Calculate daily volatility using high and low prices
df_cleaned['volatility'] = df_cleaned['High'] - df_cleaned['Low']

# Calculate percentage returns
df_cleaned['pct_return'] = df_cleaned['Close'].pct_change()

# Drop the first row due to NaN values created by pct_change
data = df_cleaned.dropna()

# Create a target variable: 1 if next day's closing price increases, 0 otherwise
data['price_direction'] = (data['Close'].shift(-1) > data['Close']).astype(int)

# Drop the last row since it won't have a target value
#data = data[:-1]

# Display the new features
print(data[['price_diff', 'ma_5', 'ma_10', 'ma_30', 'volatility', 'pct_return', 'price_direction']].head())

# Show the result
print(df_cleaned.head())
num_rows = len(df_cleaned)
print(f"Number of rows: {num_rows}")
# Save the cleaned dataset if needed
df_cleaned.to_csv('../data/processed/indexData_processed.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price_direction'] = (data['Close'].shift(-1) > data['Close']).astype(int)


    price_diff        ma_5       ma_10       ma_30  volatility  pct_return  \
29         0.0  537.566003  534.946002  534.558667         0.0   -0.002741   
30         0.0  538.138001  535.210004  534.879333         0.0   -0.000186   
31         0.0  538.244006  535.654004  535.231667         0.0   -0.000985   
32         0.0  537.968005  536.288007  535.485333         0.0   -0.004333   
33         0.0  537.059998  536.679004  535.626333         0.0   -0.000187   

    price_direction  
29                0  
30                0  
31                0  
32                0  
33                0  
  Index       Date        Open        High         Low       Close  \
0   NYA 1965-12-31  528.690002  528.690002  528.690002  528.690002   
1   NYA 1966-01-03  527.210022  527.210022  527.210022  527.210022   
2   NYA 1966-01-04  527.840027  527.840027  527.840027  527.840027   
3   NYA 1966-01-05  531.119995  531.119995  531.119995  531.119995   
4   NYA 1966-01-06  532.070007  532.070007  532.0