In [7]:
import pandas as pd
import numpy as np
import pandasql as psql

# Load the dataset
file_path = '../data/raw/indexData.csv'
df = pd.read_csv(file_path)



query = """
SELECT * 
FROM df
WHERE Open IS NOT NULL
AND Close IS NOT NULL
AND Volume IS NOT NULL
"""

# Execute the query
df_cleaned = psql.sqldf(query, locals())


# 1. Drop rows with any missing (NaN) values
#df_cleaned = df.dropna()

# Define thresholds for outliers based on the 1st and 99th percentiles for each feature

volume_threshold = (df_cleaned['Volume'].quantile(0.01), df_cleaned['Volume'].quantile(0.99))
open_threshold = (df_cleaned['Open'].quantile(0.01), df_cleaned['Open'].quantile(0.99))
high_threshold = (df_cleaned['High'].quantile(0.01), df_cleaned['High'].quantile(0.99))
low_threshold = (df_cleaned['Low'].quantile(0.01), df_cleaned['Low'].quantile(0.99))
close_threshold = (df_cleaned['Close'].quantile(0.01), df_cleaned['Close'].quantile(0.99))
adj_close_threshold = (df_cleaned['Adj Close'].quantile(0.01), df_cleaned['Adj Close'].quantile(0.99))

# Remove rows where 'Volume', 'Open', 'High', 'Low', 'Close', or 'Adj Close' fall outside of these thresholds
df_cleaned = df_cleaned[
    (df_cleaned['Volume'] >= volume_threshold[0]) & (df_cleaned['Volume'] <= volume_threshold[1]) &
    (df_cleaned['Open'] >= open_threshold[0]) & (df_cleaned['Open'] <= open_threshold[1]) &
    (df_cleaned['High'] >= high_threshold[0]) & (df_cleaned['High'] <= high_threshold[1]) &
    (df_cleaned['Low'] >= low_threshold[0]) & (df_cleaned['Low'] <= low_threshold[1]) &
    (df_cleaned['Close'] >= close_threshold[0]) & (df_cleaned['Close'] <= close_threshold[1]) &
    (df_cleaned['Adj Close'] >= adj_close_threshold[0]) & (df_cleaned['Adj Close'] <= adj_close_threshold[1])
]

# 3. Remove Duplicates (if any)
df_cleaned = df_cleaned.drop_duplicates()

# 4. (Optional) Filter rows based on a specific date range (assuming there's a 'Date' column)
# Convert 'Date' column to datetime if needed
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'])

# 5. Example of creating a new feature (similar to `CloseUSD`)
# Let's assume we are converting 'Close' price to USD using a fixed exchange rate
usd_conversion_rate = 0.85  # Example conversion rate
df_cleaned['CloseUSD'] = df_cleaned['Close'] * usd_conversion_rate

# Show the result
print(df_cleaned.head())
num_rows = len(df_cleaned)
print(f"Number of rows: {num_rows}")
# Save the cleaned dataset if needed
df_cleaned.to_csv('../data/processed/indexData_processed.csv', index=False)


  Index       Date        Open        High         Low       Close  \
0   NYA 1965-12-31  528.690002  528.690002  528.690002  528.690002   
1   NYA 1966-01-03  527.210022  527.210022  527.210022  527.210022   
2   NYA 1966-01-04  527.840027  527.840027  527.840027  527.840027   
3   NYA 1966-01-05  531.119995  531.119995  531.119995  531.119995   
4   NYA 1966-01-06  532.070007  532.070007  532.070007  532.070007   

    Adj Close  Volume    CloseUSD  
0  528.690002     0.0  449.386502  
1  527.210022     0.0  448.128519  
2  527.840027     0.0  448.664023  
3  531.119995     0.0  451.451996  
4  532.070007     0.0  452.259506  
Number of rows: 106899


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Load cleaned data
file_path = '../data/processed/indexData_processed.csv'
df_cleaned = pd.read_csv(file_path)

# Feature Engineering: Create target variable (1 for increase, 0 for decrease)
df_cleaned['Next_Close'] = df_cleaned['Close'].shift(-1)  # Shift the 'Close' column to compare with next day
df_cleaned['Target'] = (df_cleaned['Next_Close'] > df_cleaned['Close']).astype(int)

# Drop the last row as it will have a NaN 'Next_Close'
df_cleaned = df_cleaned.dropna()

# Features (X) and target (y)
features = ['Open', 'High', 'Low', 'Adj Close', 'Volume']
X = df_cleaned[features]
y = df_cleaned['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.47      0.45      0.46     10042
           1       0.53      0.56      0.54     11338

    accuracy                           0.50     21380
   macro avg       0.50      0.50      0.50     21380
weighted avg       0.50      0.50      0.50     21380



In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load data
data = pd.read_csv('../data/processed/indexData_processed.csv')
#data = pd.read_csv('../data/raw/indexProcessed.csv')

# Create a target variable: Price_Change (1 = Increase, 0 = Decrease)
data['Price_Change'] = (data['Close'].shift(-1) > data['Close']).astype(int)

# Drop the last row since it won't have a target value
data = data[:-1]

# Features for the model (without CloseUSD)
features = ['Open', 'High', 'Low', 'Adj Close', 'Volume']
X = data[features]
y = data['Price_Change']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

print("\nClassification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_log))

print("\nClassification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))

Logistic Regression Accuracy: 0.5293732460243218
Random Forest Accuracy: 0.5078110383536015

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     10062
           1       0.53      1.00      0.69     11318

    accuracy                           0.53     21380
   macro avg       0.26      0.50      0.35     21380
weighted avg       0.28      0.53      0.37     21380


Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.48      0.45      0.46     10062
           1       0.53      0.56      0.55     11318

    accuracy                           0.51     21380
   macro avg       0.50      0.50      0.50     21380
weighted avg       0.51      0.51      0.51     21380



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Load the CSV data into a DataFrame
df_cleaned = pd.read_csv('../data/processed/indexData_processed.csv')

# 1. Feature Engineering - Create new features
df_cleaned['Price_Return'] = df_cleaned['Adj Close'].pct_change()
df_cleaned['5_day_MA'] = df_cleaned['Adj Close'].rolling(window=5).mean()
df_cleaned['10_day_MA'] = df_cleaned['Adj Close'].rolling(window=10).mean()

# Drop rows with missing values created by pct_change and rolling mean
df_cleaned = df_cleaned.dropna()

# Target: Whether 'Adj Close' will increase or decrease
df_cleaned['Target'] = (df_cleaned['Adj Close'].shift(-1) > df_cleaned['Adj Close']).astype(int)

# 2. Prepare data for modeling
X = df_cleaned[['Open', 'High', 'Low', 'Adj Close', 'Volume', 'Price_Return', '5_day_MA', '10_day_MA']]
y = df_cleaned['Target']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Logistic Regression with Class Weight
log_reg = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)
print("Logistic Regression:")
print(classification_report(y_test, y_pred_log_reg))

# 4. Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_scaled, y_train)
y_pred_gb = gb_model.predict(X_test_scaled)
print("Gradient Boosting Classifier:")
print(classification_report(y_test, y_pred_gb))

# 5. Support Vector Classifier
svc = SVC(kernel='linear', class_weight='balanced', random_state=42)
svc.fit(X_train_scaled, y_train)
y_pred_svc = svc.predict(X_test_scaled)
print("Support Vector Classifier:")
print(classification_report(y_test, y_pred_svc))

# 6. Hyperparameter Tuning using GridSearchCV for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'saga'],
    'max_iter': [1000]
}

grid_search = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)
best_log_reg = grid_search.best_estimator_
y_pred_best_log_reg = best_log_reg.predict(X_test_scaled)
print("Best Logistic Regression from Grid Search:")
print(classification_report(y_test, y_pred_best_log_reg))
print("Best Parameters: ", grid_search.best_params_)


TypeError: string indices must be integers

In [14]:
import pandas as pd

# Load the data
data = pd.read_csv('../data/processed/indexData_processed.csv')

# Create daily price difference
data['price_diff'] = data['Close'] - data['Open']

# Calculate moving averages for closing prices (5-day, 10-day, 30-day)
data['ma_5'] = data['Close'].rolling(window=5).mean()
data['ma_10'] = data['Close'].rolling(window=10).mean()
data['ma_30'] = data['Close'].rolling(window=30).mean()

# Calculate daily volatility using high and low prices
data['volatility'] = data['High'] - data['Low']

# Calculate percentage returns
data['pct_return'] = data['Close'].pct_change()

# Drop the first row due to NaN values created by pct_change
data = data.dropna()

# Create a target variable: 1 if next day's closing price increases, 0 otherwise
data['price_direction'] = (data['Close'].shift(-1) > data['Close']).astype(int)

# Drop the last row since it won't have a target value
data = data[:-1]

# Display the new features
print(data[['price_diff', 'ma_5', 'ma_10', 'ma_30', 'volatility', 'pct_return', 'price_direction']].head())


output_file_path = '../data/processed/indexData_processed2.csv'
data.to_csv(output_file_path, index=False)







####################################################
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the modified data with new features
data = pd.read_csv('../data/processed/indexData_processed2.csv')

# Prepare the features and target variable
features = ['price_diff', 'ma_5', 'ma_10', 'ma_30', 'volatility', 'pct_return']
X = data[features]
y = data['price_direction']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation
log_reg_accuracy = accuracy_score(y_test, y_pred_log)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

log_reg_report = classification_report(y_test, y_pred_log)
rf_report = classification_report(y_test, y_pred_rf)

# Output the results
print("Logistic Regression Accuracy:", log_reg_accuracy)
print("\nRandom Forest Accuracy:", rf_accuracy)
print("\nClassification Report for Logistic Regression:\n", log_reg_report)
print("\nClassification Report for Random Forest:\n", rf_report)








    price_diff        ma_5       ma_10       ma_30  volatility  pct_return  \
29         0.0  537.566003  534.946002  534.558667         0.0   -0.002741   
30         0.0  538.138001  535.210004  534.879333         0.0   -0.000186   
31         0.0  538.244006  535.654004  535.231667         0.0   -0.000985   
32         0.0  537.968005  536.288007  535.485333         0.0   -0.004333   
33         0.0  537.059998  536.679004  535.626333         0.0   -0.000187   

    price_direction  
29                0  
30                0  
31                0  
32                0  
33                0  
Logistic Regression Accuracy: 0.5289604192008983

Random Forest Accuracy: 0.5140357443623094

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0       0.53      0.01      0.01     10076
           1       0.53      1.00      0.69     11298

    accuracy                           0.53     21374
   macro avg       0.53      0.50      