In [30]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)

import matplotlib.pyplot as plt
import seaborn as sns

# Seed value for the notebook's stochastic steps; 42 is the same value that the
# train/test split, the estimators and np.random.seed use in the cells below.
RANDOM_STATE = 42


In [31]:
# Load the labelled dataset (path is relative to the notebook's working directory).
df = pd.read_csv("master_dataset_ml_ready_labelled.csv")

# Report the shape on its own: returning a `(shape, DataFrame)` tuple forces the
# frame's plain-text repr, which wraps and truncates the wide table.
print("Dataset shape:", df.shape)

# Bare last expression -> rendered as a rich table of the first rows.
df.head()


((5202, 21),
                   date  CDD_TX  CDD_PA  CDD_IL  CDD_NY  HDD_TX  HDD_PA  \
 0  2010-01-01 00:00:00       0       0       0       0      22      33   
 1  2010-01-02 00:00:00       0       0       0       0      24      43   
 2  2010-01-03 00:00:00       0       0       0       0      24      48   
 3  2010-01-04 00:00:00       0       0       0       0      26      45   
 4  2010-01-05 00:00:00       0       0       0       0      28      42   
 
    HDD_IL  HDD_NY  contract_1_price  ...  spot_price  storage_bcf  \
 0      56      33              5.88  ...        6.09         3117   
 1      61      39              5.88  ...        6.09         3117   
 2      60      49              5.88  ...        6.09         3117   
 3      58      45              5.88  ...        6.09         3117   
 4      54      44              5.64  ...        6.19         3117   
 
    us_gas_rigs  year  month  day_of_year  day_of_week  quarter  \
 0          759  2010      1            1     

In [32]:
# Column dtypes and non-null counts (printed by pandas directly).
df.info()

# A bare expression in the middle of a cell is evaluated and then thrown away,
# so the missing-value counts have to be printed explicitly to be seen.
print("\nMissing values per column:")
print(df.isnull().sum())

# Number of fully duplicated rows; as the last expression it becomes the cell output.
df.duplicated().sum()


<class 'pandas.DataFrame'>
RangeIndex: 5202 entries, 0 to 5201
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   5202 non-null   str    
 1   CDD_TX                 5202 non-null   int64  
 2   CDD_PA                 5202 non-null   int64  
 3   CDD_IL                 5202 non-null   int64  
 4   CDD_NY                 5202 non-null   int64  
 5   HDD_TX                 5202 non-null   int64  
 6   HDD_PA                 5202 non-null   int64  
 7   HDD_IL                 5202 non-null   int64  
 8   HDD_NY                 5202 non-null   int64  
 9   contract_1_price       5202 non-null   float64
 10  contract_2_price       5202 non-null   float64
 11  spot_price             5202 non-null   float64
 12  storage_bcf            5202 non-null   int64  
 13  us_gas_rigs            5202 non-null   int64  
 14  year                   5202 non-null   int64  
 15  month          

np.int64(0)

In [33]:
# Target variable: the price-movement class label we want to predict.
y = df["price_movement_scaled"]

# Feature matrix. Columns that must never reach the model:
#   price_movement_scaled -> the target itself (data leakage)
#   price_movement_raw    -> continuous form of the target (label leakage)
#   date                  -> non-numeric and redundant (time features already exist)
#
# `errors="ignore"` is intentionally NOT used: if one of these columns were
# renamed or missing, silently skipping it could leave a leaking column in X.
# A KeyError is the safer failure mode for a leakage guard.
X = df.drop(columns=["price_movement_scaled", "price_movement_raw", "date"])

print("X shape:", X.shape)
print("y shape:", y.shape)
X.head()

X shape: (5202, 18)
y shape: (5202,)


Unnamed: 0,CDD_TX,CDD_PA,CDD_IL,CDD_NY,HDD_TX,HDD_PA,HDD_IL,HDD_NY,contract_1_price,contract_2_price,spot_price,storage_bcf,us_gas_rigs,year,month,day_of_year,day_of_week,quarter
0,0,0,0,0,22,33,56,33,5.88,5.84,6.09,3117,759,2010,1,1,4,1
1,0,0,0,0,24,43,61,39,5.88,5.84,6.09,3117,759,2010,1,2,5,1
2,0,0,0,0,24,48,60,49,5.88,5.84,6.09,3117,759,2010,1,3,6,1
3,0,0,0,0,26,45,58,45,5.88,5.84,6.09,3117,759,2010,1,4,0,1
4,0,0,0,0,28,42,54,44,5.64,5.59,6.19,3117,759,2010,1,5,1,1


In [34]:
# Class balance of the target, expressed as a fraction of all rows.
y.value_counts(normalize=True)

price_movement_scaled
 0    0.393310
-1    0.306613
 1    0.300077
Name: proportion, dtype: float64

In [35]:
from sklearn.model_selection import train_test_split

# NOTE(review): the rows appear to be daily observations in date order, and this
# is a shuffled split, so test days are interleaved with training days. If the
# goal is forecasting, a chronological split (or TimeSeriesSplit for CV) would
# give a more honest estimate — confirm before changing, because every score
# reported below depends on this split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,              # 80% train, 20% test
    stratify=y,                 # preserve class proportions in both splits
    random_state=RANDOM_STATE,  # shared seed from the config cell
)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

# Sanity check that stratification kept the class mix the same in both splits.
print("\nTrain class distribution:")
print(y_train.value_counts(normalize=True))

print("\nTest class distribution:")
print(y_test.value_counts(normalize=True))


Train shape: (4161, 18) (4161,)
Test shape: (1041, 18) (1041,)

Train class distribution:
price_movement_scaled
 0    0.393175
-1    0.306657
 1    0.300168
Name: proportion, dtype: float64

Test class distribution:
price_movement_scaled
 0    0.393852
-1    0.306436
 1    0.299712
Name: proportion, dtype: float64


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def _scaled(estimator):
    """Return a pipeline that standardises the features and then fits `estimator`.

    Keeping the scaler inside the pipeline means it is fitted only on the data
    passed to `fit`, never on the evaluation data.
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("model", estimator),
    ])


# Default-hyperparameter baselines. The linear and SVM models are scaled; the
# tree-based models are used on the raw features. Every estimator takes its
# seed from RANDOM_STATE instead of a repeated literal.
baseline_models = {
    "Logistic Regression": _scaled(
        LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "SGD": _scaled(SGDClassifier(random_state=RANDOM_STATE)),
    "SVM": _scaled(SVC(random_state=RANDOM_STATE)),
}


In [37]:
from sklearn.metrics import accuracy_score, f1_score

# Fit every baseline on the training split and score it on the held-out test
# split; one result row per model, in the order the models were declared.
baseline_results = []

for name, model in baseline_models.items():
    # fit() returns the fitted estimator, so the call can be chained.
    preds = model.fit(X_train, y_train).predict(X_test)

    baseline_results.append(
        dict(
            Model=name,
            Accuracy=accuracy_score(y_test, preds),
            F1_macro=f1_score(y_test, preds, average="macro"),
        )
    )

baseline_results_df = pd.DataFrame(baseline_results)
baseline_results_df


Unnamed: 0,Model,Accuracy,F1_macro
0,Logistic Regression,0.56292,0.526316
1,Decision Tree,0.580211,0.560572
2,Random Forest,0.603266,0.592001
3,SGD,0.540826,0.499043
4,SVM,0.588857,0.575513


In [38]:
from sklearn.model_selection import GridSearchCV

# Scaling lives inside the pipeline so that, during cross-validation, the
# scaler is refitted on each fold's training portion only.
lr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        max_iter=3000,
        random_state=RANDOM_STATE,  # shared seed from the config cell
    )),
])

# Only the regularisation strength C is searched; penalty and solver are pinned
# to a single value each.
lr_param_grid = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__penalty": ["l2"],
    "model__solver": ["lbfgs"],
}

# 5-fold CV on the training split, selecting by macro F1 (same metric as the
# test-set comparison).
lr_grid = GridSearchCV(
    lr_pipeline,
    lr_param_grid,
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
)

lr_grid.fit(X_train, y_train)

print("Best parameters:", lr_grid.best_params_)


Best parameters: {'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}




In [39]:
# Score the best pipeline found by the grid search on the held-out test split.
best_lr = lr_grid.best_estimator_
lr_preds = best_lr.predict(X_test)

lr_test_accuracy = accuracy_score(y_test, lr_preds)
lr_test_f1_macro = f1_score(y_test, lr_preds, average="macro")

print("Tuned Logistic Regression")
print("Accuracy:", lr_test_accuracy)
print("F1_macro:", lr_test_f1_macro)


Tuned Logistic Regression
Accuracy: 0.5619596541786743
F1_macro: 0.5240384681400494


### Logistic Regression: Baseline vs Tuned Comparison

In this step, Logistic Regression was evaluated before and after hyperparameter tuning using GridSearchCV.

The baseline Logistic Regression model achieved an accuracy of approximately 56% and a macro F1-score of approximately 0.53. After tuning the regularization parameter using GridSearchCV, the performance remained nearly unchanged.

This indicates that Logistic Regression did not benefit significantly from hyperparameter tuning for this dataset. This suggests that the model's linear decision boundary limits its ability to capture the underlying patterns in the data, rather than the choice of hyperparameters.

Therefore, further performance improvements for this dataset are more likely to come from non-linear models rather than additional tuning of Logistic Regression.



In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Base tree for the search; seeded from the shared config constant.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Search space: depth and leaf/split size limits control tree complexity,
# `criterion` selects the impurity measure.
dt_param_grid = {
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 10, 50],
    "min_samples_leaf": [1, 5, 10],
    "criterion": ["gini", "entropy"],
}


In [41]:
# Exhaustive 5-fold search over the tree grid, ranked by macro F1.
dt_grid = GridSearchCV(
    estimator=dt,
    param_grid=dt_param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

print("Best Decision Tree parameters:", dt_grid.best_params_, sep="\n")


Best Decision Tree parameters:
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}


In [42]:
from sklearn.metrics import accuracy_score, f1_score

# Score the best tree found by the grid search on the held-out test split.
best_dt = dt_grid.best_estimator_
dt_preds = best_dt.predict(X_test)

dt_test_accuracy = accuracy_score(y_test, dt_preds)
dt_test_f1_macro = f1_score(y_test, dt_preds, average="macro")

print("Tuned Decision Tree Results")
print("Accuracy:", dt_test_accuracy)
print("F1_macro:", dt_test_f1_macro)


Tuned Decision Tree Results
Accuracy: 0.633045148895293
F1_macro: 0.6299798209052861


### Decision Tree: Baseline vs Tuned Comparison
The Decision Tree classifier showed a significant improvement after hyperparameter tuning using GridSearchCV.

The baseline Decision Tree achieved an accuracy of approximately 58% and a macro F1-score of approximately 0.56. After tuning parameters such as maximum depth, minimum samples per split, and minimum samples per leaf, the tuned model achieved an accuracy of approximately 63% and a macro F1-score of approximately 0.63.

This improvement indicates that controlling the complexity of the Decision Tree helps reduce overfitting and allows the model to generalize better on unseen data. Unlike Logistic Regression, the Decision Tree benefits substantially from hyperparameter tuning.


In [43]:
from sklearn.ensemble import RandomForestClassifier

# Base forest for the search; seeded from the shared config constant.
# NOTE(review): n_jobs=-1 is set here and again on the grid search that wraps
# this estimator — confirm the nested parallelism is intended.
rf = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Search space: forest size, per-tree complexity limits, and how many features
# each split may consider.
rf_param_grid = {
    "n_estimators": [200, 500],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 5],
    "max_features": ["sqrt", "log2"],
}


In [44]:
# Exhaustive 5-fold search over the forest grid, ranked by macro F1.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

print("Best Random Forest parameters:", rf_grid.best_params_, sep="\n")


Best Random Forest parameters:
{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [45]:
# Score the best forest found by the grid search on the held-out test split.
best_rf = rf_grid.best_estimator_
rf_preds = best_rf.predict(X_test)

rf_test_accuracy = accuracy_score(y_test, rf_preds)
rf_test_f1_macro = f1_score(y_test, rf_preds, average="macro")

print("Tuned Random Forest Results")
print("Accuracy:", rf_test_accuracy)
print("F1_macro:", rf_test_f1_macro)


Tuned Random Forest Results
Accuracy: 0.6205571565802114
F1_macro: 0.6173504342049523


### Random Forest: Baseline vs Tuned Comparison
The Random Forest classifier showed a moderate improvement after hyperparameter tuning using GridSearchCV.

The baseline Random Forest achieved an accuracy of approximately 60% and a macro F1-score of approximately 0.59. After tuning the number of trees, maximum depth, minimum samples per split, and feature selection strategy, the tuned model achieved an accuracy of approximately 62% and a macro F1-score of approximately 0.62.

While the tuned Random Forest improved upon its baseline performance, the improvement was less pronounced compared to the Decision Tree. This suggests that Random Forest was already relatively well-configured by default, and hyperparameter tuning provided incremental gains rather than dramatic improvements.


In [46]:
from sklearn.linear_model import SGDClassifier

# Scaler + SGD classifier; the scaler sits inside the pipeline so it is
# refitted on each CV training fold. Seeded from the shared config constant.
sgd_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SGDClassifier(random_state=RANDOM_STATE)),
])

# Search space: loss function, regularisation strength and penalty type.
sgd_param_grid = {
    "model__loss": ["hinge", "log_loss"],
    "model__alpha": [1e-4, 1e-3, 1e-2],
    "model__penalty": ["l2", "l1", "elasticnet"],
}


In [47]:
# Exhaustive 5-fold search over the SGD grid, ranked by macro F1.
sgd_grid = GridSearchCV(
    estimator=sgd_pipeline,
    param_grid=sgd_param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
)
sgd_grid.fit(X_train, y_train)

print("Best SGD parameters:", sgd_grid.best_params_, sep="\n")


Best SGD parameters:
{'model__alpha': 0.0001, 'model__loss': 'log_loss', 'model__penalty': 'elasticnet'}


In [48]:
# Score the best SGD pipeline found by the grid search on the held-out test split.
best_sgd = sgd_grid.best_estimator_
sgd_preds = best_sgd.predict(X_test)

sgd_test_accuracy = accuracy_score(y_test, sgd_preds)
sgd_test_f1_macro = f1_score(y_test, sgd_preds, average="macro")

print("Tuned SGD Results")
print("Accuracy:", sgd_test_accuracy)
print("F1_macro:", sgd_test_f1_macro)


Tuned SGD Results
Accuracy: 0.5667627281460135
F1_macro: 0.5402883583600904


### SGD Classifier: Baseline vs Tuned Comparison
The SGD classifier showed an improvement in performance after hyperparameter tuning using GridSearchCV.

The baseline SGD model achieved an accuracy of approximately 54% and a macro F1-score of approximately 0.50. After tuning the loss function, regularization strength, and penalty type, the tuned model achieved an accuracy of approximately 57% and a macro F1-score of approximately 0.54.

Although hyperparameter tuning improved the performance of the SGD classifier, its overall performance remained lower than tree-based models. This suggests that linear models optimized via stochastic gradient descent may be insufficient to fully capture the non-linear relationships present in the dataset.


In [49]:
from sklearn.svm import SVC

# Scaler + SVC; the scaler sits inside the pipeline so it is refitted on each
# CV training fold. Seeded from the shared config constant.
svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVC(random_state=RANDOM_STATE)),
])

# Two sub-grids instead of one full cross product: `gamma` only affects the
# RBF kernel, so crossing it with the linear kernel would fit every linear
# candidate twice with identical results.
svm_param_grid = [
    {
        "model__kernel": ["rbf"],
        "model__C": [0.1, 1, 10],
        "model__gamma": ["scale", "auto"],
    },
    {
        "model__kernel": ["linear"],
        "model__C": [0.1, 1, 10],
    },
]


In [50]:
# Exhaustive 5-fold search over the SVM grid, ranked by macro F1.
svm_grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=svm_param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
)
svm_grid.fit(X_train, y_train)

print("Best SVM parameters:", svm_grid.best_params_, sep="\n")


Best SVM parameters:
{'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'rbf'}


In [51]:
# Score the best SVM pipeline found by the grid search on the held-out test split.
best_svm = svm_grid.best_estimator_
svm_preds = best_svm.predict(X_test)

svm_test_accuracy = accuracy_score(y_test, svm_preds)
svm_test_f1_macro = f1_score(y_test, svm_preds, average="macro")

print("Tuned SVM Results")
print("Accuracy:", svm_test_accuracy)
print("F1_macro:", svm_test_f1_macro)


Tuned SVM Results
Accuracy: 0.6013448607108549
F1_macro: 0.5922613559530395


### Support Vector Machine: Baseline vs Tuned Comparison
The Support Vector Machine (SVM) classifier showed a moderate improvement in performance after hyperparameter tuning using GridSearchCV.

The baseline SVM achieved an accuracy of approximately 59% and a macro F1-score of approximately 0.58. After tuning the regularization parameter and kernel type, the tuned SVM achieved an accuracy of approximately 60% and a macro F1-score of approximately 0.59.

This improvement indicates that SVM benefits from hyperparameter tuning; however, its performance remains slightly lower than tree-based models. This suggests that while SVM can model complex decision boundaries, ensemble and tree-based approaches are more effective for this dataset.


### Summary of GridSearchCV Results
GridSearchCV was applied to all five classifiers to evaluate whether hyperparameter tuning could improve model performance.

Among the models tested, Decision Tree benefited the most from tuning, achieving the highest macro F1-score. SGD, Random Forest, and SVM showed modest improvements after tuning (gains of roughly 0.02–0.04 in macro F1), although the tuned SGD classifier still trailed SVM and the tree-based models. Logistic Regression did not significantly benefit from hyperparameter tuning, indicating that model complexity rather than hyperparameter choice limited its performance.

Overall, tree-based models demonstrated superior performance compared to linear models for this classification task.


# Feature Selection

### Random Feature Selection

In [52]:
# Seed NumPy's global generator from the shared config constant. The global
# seed (rather than a local generator) is kept on purpose: it reproduces the
# same feature draw as before and leaves the global random state unchanged for
# any later code that relies on it.
np.random.seed(RANDOM_STATE)

# Fraction of the feature columns to discard in this experiment.
REMOVAL_FRACTION = 0.3

# Number of features to remove (truncated toward zero).
num_features_to_remove = int(REMOVAL_FRACTION * X_train.shape[1])

# Randomly choose distinct columns to drop.
random_features_to_drop = np.random.choice(
    X_train.columns,
    size=num_features_to_remove,
    replace=False,
)

print("Randomly removed features:")
print(list(random_features_to_drop))

# Apply the same column removal to both splits.
X_train_rand = X_train.drop(columns=random_features_to_drop)
X_test_rand = X_test.drop(columns=random_features_to_drop)

print("New feature count:", X_train_rand.shape[1])


Randomly removed features:
['CDD_TX', 'CDD_PA', 'contract_1_price', 'HDD_PA', 'CDD_NY']
New feature count: 13


In [53]:
random_removal_results = []

# Unfitted copies of the tuned models. `clone` copies hyperparameters only, so
# refitting on a reduced feature set no longer overwrites best_dt / best_rf /
# best_svm: those stay fitted on the full feature set and the earlier
# evaluation cells remain re-runnable.
models_reduced = {
    "Decision Tree": clone(best_dt),
    "Random Forest": clone(best_rf),
    "SVM": clone(best_svm),
}

# Refit each tuned configuration on the randomly reduced feature set and score
# it on the matching reduced test split.
for name, model in models_reduced.items():
    model.fit(X_train_rand, y_train)
    preds = model.predict(X_test_rand)

    random_removal_results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "F1_macro": f1_score(y_test, preds, average="macro")
    })

pd.DataFrame(random_removal_results)


Unnamed: 0,Model,Accuracy,F1_macro
0,Decision Tree,0.634966,0.631746
1,Random Forest,0.614793,0.611068
2,SVM,0.60903,0.601504


### Random Feature Removal Experiment
In this experiment, approximately 30% of the input features were randomly removed to evaluate model robustness and sensitivity to feature reduction.

The results show that random feature removal did not significantly degrade model performance. The Decision Tree and SVM models showed slight improvements in macro F1-score, while Random Forest experienced a small decrease in performance.

These results suggest that the dataset contains redundant or weakly informative features, and that some models can benefit from a reduced feature space. Random Forest, however, relies more on feature diversity and was slightly more sensitive to random feature removal.


### Hypothesis-Based Feature Removal

**Hypothesis:** Weather-related features (CDD/HDD) play a significant role in predicting price movement.
If this is true, removing them should noticeably reduce model performance.

In [54]:
# The CDD_* and HDD_* columns make up the weather feature group under test.
weather_features = [
    "CDD_TX", "CDD_PA", "CDD_IL", "CDD_NY",
    "HDD_TX", "HDD_PA", "HDD_IL", "HDD_NY",
]

# Drop the whole group from both splits in one pass.
X_train_hyp, X_test_hyp = (
    split.drop(columns=weather_features) for split in (X_train, X_test)
)

print("Remaining feature count:", X_train_hyp.shape[1])


Remaining feature count: 10


In [55]:
hypothesis_results = []

for name, template in models_reduced.items():
    # Fit a fresh copy (same hyperparameters, no fitted state) so this
    # experiment does not overwrite the shared estimator objects held in
    # `models_reduced`.
    model = clone(template).fit(X_train_hyp, y_train)
    preds = model.predict(X_test_hyp)

    hypothesis_results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "F1_macro": f1_score(y_test, preds, average="macro")
    })

pd.DataFrame(hypothesis_results)


Unnamed: 0,Model,Accuracy,F1_macro
0,Decision Tree,0.637848,0.633057
1,Random Forest,0.619597,0.616159
2,SVM,0.617675,0.614479


In this experiment, weather-related features (CDD and HDD variables) were removed based on the hypothesis that they significantly influence price movement.

The results show that removing weather features did not reduce model performance. In fact, Decision Tree and SVM models showed slight improvements in macro F1-score, while Random Forest experienced only a marginal decrease.

These findings suggest that weather-related variables are not dominant predictors in this dataset and that price movement can be effectively modeled using price, storage, supply, and time-based features. This highlights the presence of redundant or weakly informative features in the weather group.


In [56]:
from utils.auto_feature_selector import autoFeatureSelector

# Ask the project's selector for 8 features; only the training split is passed
# in, so the test split plays no part in the choice.
# NOTE(review): how the listed methods are combined is defined in
# utils.auto_feature_selector and is not visible in this notebook.
selected_features = autoFeatureSelector(
    X_train,
    y_train,
    num_feats=8,
    methods=["pearson", "chi-square", "rfe", "log-reg", "rf"],
)

print("Selected features:", selected_features, sep="\n")





Selected features:
['spot_price', 'day_of_week', 'HDD_PA', 'contract_2_price', 'HDD_NY', 'contract_1_price', 'us_gas_rigs', 'CDD_TX']


In [57]:
# Restrict both splits to the columns chosen by the automatic selector.
X_train_auto = X_train[selected_features]
X_test_auto = X_test[selected_features]

auto_fs_results = []

for name, template in models_reduced.items():
    # Fit a fresh copy (same hyperparameters, no fitted state) so this
    # experiment does not overwrite the shared estimator objects held in
    # `models_reduced`.
    model = clone(template).fit(X_train_auto, y_train)
    preds = model.predict(X_test_auto)

    auto_fs_results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "F1_macro": f1_score(y_test, preds, average="macro")
    })

pd.DataFrame(auto_fs_results)


Unnamed: 0,Model,Accuracy,F1_macro
0,Decision Tree,0.634006,0.631781
1,Random Forest,0.613833,0.610991
2,SVM,0.621518,0.618624


### Final Conclusion

This project evaluated the impact of feature selection on multiclass price movement prediction using multiple classification models. Experiments included baseline modeling with all features, random feature removal, hypothesis-driven feature removal, and automated feature selection.

The results demonstrate that reducing the feature set does not degrade predictive performance and, in several cases, improves model generalization. Decision Tree models showed consistent performance across all experiments, while Support Vector Machines benefited the most from automated feature selection. Random Forest models exhibited slight sensitivity to feature reduction due to their reliance on feature diversity.

Overall, the findings indicate that the dataset contains redundant or weakly informative features and that intelligent feature selection can simplify models while maintaining or improving predictive performance. The Decision Tree classifier emerged as the most robust and interpretable model for this task.
