In [27]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from sklearn.model_selection import train_test_split


In [28]:
# Folder (relative to the working directory) that is scanned for input CSV files.
data_folder = "data"
# os.listdir returns entries in arbitrary order; sorting makes the files load in
# the same order on every run and every machine.
csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith(".csv"))


In [29]:

# Fail early when the data folder contained no CSV files at all.
if len(csv_files) == 0:
    raise FileNotFoundError("No CSV files found in the data folder")

# Read every CSV into its own frame.
dfs = [pd.read_csv(os.path.join(data_folder, name)) for name in csv_files]

# Stack the frames, drop rows lacking the feature or the label, and order the
# rows by the "timestamp" column with a fresh 0..n-1 index.
df = (
    pd.concat(dfs, ignore_index=True)
    .dropna(subset=["value", "classification"])
    .sort_values("timestamp")
    .reset_index(drop=True)
)


In [30]:

# Nothing to model if every row was dropped during cleaning.
if df.empty:
    raise ValueError("Dataset is empty after dropping missing values in 'value' or 'classification'.")

# X: single-column feature frame (kept 2-D); y: label series.
feature_columns = ["value"]
X, y = df[feature_columns], df["classification"]


In [31]:

# Hold out the last 25% of rows without shuffling, so the existing row order
# (df was sorted by "timestamp") is preserved across the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

# Fit the scaler on the training rows only, then apply it to both splits.
# From here on X_train / X_test hold the standardized arrays.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Class-balanced logistic regression wrapped in 3-fold sigmoid calibration.
base_model = LogisticRegression(max_iter=1000, class_weight="balanced")
model = CalibratedClassifierCV(base_model, method="sigmoid", cv=3).fit(X_train, y_train)

# Calibrated class probabilities for the held-out rows (one column per class).
y_prob_all = model.predict_proba(X_test)


In [32]:

# Evaluate the calibrated model on the held-out split.
#
# The binary/multiclass decision is taken from the classes the model was fitted
# on, because those define the columns of y_prob_all.
trained_classes = model.classes_

if len(trained_classes) == 2:
    # Column 1 of predict_proba corresponds to trained_classes[1].
    negative_label, positive_label = trained_classes[0], trained_classes[1]
    y_prob = y_prob_all[:, 1]
    roc = roc_auc_score(y_test, y_prob)
    # pos_label is passed explicitly so labels other than {0, 1} / {-1, 1}
    # (e.g. strings) are accepted.
    precision, recall, thresholds = precision_recall_curve(
        y_test, y_prob, pos_label=positive_label
    )
    f1 = (2 * precision * recall) / (precision + recall + 1e-9)
    # precision/recall carry one trailing entry with no matching threshold;
    # exclude it so the argmax is always a valid index into thresholds.
    best_threshold = thresholds[np.argmax(f1[:-1])]
    # Predict in the original label space so y_pred is comparable with y_test
    # (for 0/1 labels this is identical to the previous integer cast).
    y_pred = np.where(y_prob >= best_threshold, positive_label, negative_label)
else:
    y_prob = y_prob_all
    # labels= states which class each probability column belongs to instead of
    # relying on the label order inferred from y_test.
    roc = roc_auc_score(y_test, y_prob, multi_class="ovr", labels=trained_classes)
    best_threshold = "N/A"
    y_pred = model.predict(X_test)


In [33]:

# Report hold-out metrics, then persist one row per test sample.
print("ROC-AUC:", roc)
print("Optimal Threshold:", best_threshold)
print(classification_report(y_test, y_pred))

# In the binary case y_prob is already the 1-D positive-class probability, so
# indexing it with [:, 1] again would raise IndexError. A 2-D (multiclass)
# probability matrix has no single column to export, so the field stays empty.
positive_probability = y_prob if np.ndim(y_prob) == 1 else None

out = pd.DataFrame({
    # NOTE: X_test was overwritten with the StandardScaler output in the training
    # cell, so this column holds standardized values, not the raw "value".
    "value": X_test.flatten(),
    "actual": y_test.values,
    "probability": positive_probability,
    "prediction": y_pred
})

out.to_csv("final_predictions.csv", index=False)


ROC-AUC: 0.9711765915442335
Optimal Threshold: N/A
               precision    recall  f1-score   support

 Extreme Fear       0.65      1.00      0.79        15
Extreme Greed       1.00      0.81      0.90       116
         Fear       0.45      0.94      0.61       127
        Greed       0.92      0.97      0.95       268
      Neutral       0.00      0.00      0.00       135

     accuracy                           0.74       661
    macro avg       0.61      0.74      0.65       661
 weighted avg       0.65      0.74      0.68       661



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
