In [7]:
import pandas as pd
import numpy as np
import pandasql as psql

# Load the dataset
file_path = '../data/raw/indexData.csv'
df = pd.read_csv(file_path)



query = """
SELECT * 
FROM df
WHERE Open IS NOT NULL
AND Close IS NOT NULL
AND Volume IS NOT NULL
"""

# Execute the query
df_cleaned = psql.sqldf(query, locals())


# 1. Drop rows with any missing (NaN) values
#df_cleaned = df.dropna()

# Define thresholds for outliers based on the 1st and 99th percentiles for each feature

volume_threshold = (df_cleaned['Volume'].quantile(0.01), df_cleaned['Volume'].quantile(0.99))
open_threshold = (df_cleaned['Open'].quantile(0.01), df_cleaned['Open'].quantile(0.99))
high_threshold = (df_cleaned['High'].quantile(0.01), df_cleaned['High'].quantile(0.99))
low_threshold = (df_cleaned['Low'].quantile(0.01), df_cleaned['Low'].quantile(0.99))
close_threshold = (df_cleaned['Close'].quantile(0.01), df_cleaned['Close'].quantile(0.99))
adj_close_threshold = (df_cleaned['Adj Close'].quantile(0.01), df_cleaned['Adj Close'].quantile(0.99))

# Remove rows where 'Volume', 'Open', 'High', 'Low', 'Close', or 'Adj Close' fall outside of these thresholds
df_cleaned = df_cleaned[
    (df_cleaned['Volume'] >= volume_threshold[0]) & (df_cleaned['Volume'] <= volume_threshold[1]) &
    (df_cleaned['Open'] >= open_threshold[0]) & (df_cleaned['Open'] <= open_threshold[1]) &
    (df_cleaned['High'] >= high_threshold[0]) & (df_cleaned['High'] <= high_threshold[1]) &
    (df_cleaned['Low'] >= low_threshold[0]) & (df_cleaned['Low'] <= low_threshold[1]) &
    (df_cleaned['Close'] >= close_threshold[0]) & (df_cleaned['Close'] <= close_threshold[1]) &
    (df_cleaned['Adj Close'] >= adj_close_threshold[0]) & (df_cleaned['Adj Close'] <= adj_close_threshold[1])
]

# 3. Remove Duplicates (if any)
df_cleaned = df_cleaned.drop_duplicates()

# 4. (Optional) Filter rows based on a specific date range (assuming there's a 'Date' column)
# Convert 'Date' column to datetime if needed
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'])

# 5. Example of creating a new feature (similar to `CloseUSD`)
# Let's assume we are converting 'Close' price to USD using a fixed exchange rate
usd_conversion_rate = 0.85  # Example conversion rate
df_cleaned['CloseUSD'] = df_cleaned['Close'] * usd_conversion_rate

# Show the result
print(df_cleaned.head())
num_rows = len(df_cleaned)
print(f"Number of rows: {num_rows}")
# Save the cleaned dataset if needed
df_cleaned.to_csv('../data/processed/indexData_processed.csv', index=False)


  Index       Date        Open        High         Low       Close  \
0   NYA 1965-12-31  528.690002  528.690002  528.690002  528.690002   
1   NYA 1966-01-03  527.210022  527.210022  527.210022  527.210022   
2   NYA 1966-01-04  527.840027  527.840027  527.840027  527.840027   
3   NYA 1966-01-05  531.119995  531.119995  531.119995  531.119995   
4   NYA 1966-01-06  532.070007  532.070007  532.070007  532.070007   

    Adj Close  Volume    CloseUSD  
0  528.690002     0.0  449.386502  
1  527.210022     0.0  448.128519  
2  527.840027     0.0  448.664023  
3  531.119995     0.0  451.451996  
4  532.070007     0.0  452.259506  
Number of rows: 106899


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Load cleaned data
file_path = '../data/processed/indexData_processed.csv'
df_cleaned = pd.read_csv(file_path)

# Feature Engineering: Create target variable (1 for increase, 0 for decrease)
df_cleaned['Next_Close'] = df_cleaned['Close'].shift(-1)  # Shift the 'Close' column to compare with next day
df_cleaned['Target'] = (df_cleaned['Next_Close'] > df_cleaned['Close']).astype(int)

# Drop the last row as it will have a NaN 'Next_Close'
df_cleaned = df_cleaned.dropna()

# Features (X) and target (y)
features = ['Open', 'High', 'Low', 'Adj Close', 'Volume']
X = df_cleaned[features]
y = df_cleaned['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.47      0.45      0.46     10042
           1       0.53      0.56      0.54     11338

    accuracy                           0.50     21380
   macro avg       0.50      0.50      0.50     21380
weighted avg       0.50      0.50      0.50     21380



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load and clean your data (assuming it's already cleaned)
file_path = '../data/processed/indexData_processed.csv'

# Feature Selection: Choose features that are relevant for prediction
X = df_cleaned[['Open', 'High', 'Low', 'Adj Close', 'Volume']]
y = df_cleaned['Target']  # This should be the column indicating increase or decrease

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression Classifier
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.48      0.01      0.01     10042
           1       0.53      0.99      0.69     11338

    accuracy                           0.53     21380
   macro avg       0.51      0.50      0.35     21380
weighted avg       0.51      0.53      0.37     21380

