# TSMC Model Comparison and Evaluation

In [2]:
# TSMC Closing Price Predictor and Model Evaluator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
#import streamlit as st
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
#from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix

# Load dataset
df = pd.read_csv('tsmc2330-2_Robust.csv')

# Feature engineering
df['Lag1_Close'] = df['Close'].shift(1)
df['Lag1_Return'] = df['Close'].pct_change().shift(1)
df['Lag1_RSI'] = df['RSI'].shift(1)
df['Lag1_MACD'] = df['MACD'].shift(1)
df['Lag1_MACD_Signal'] = df['MACD_Signal'].shift(1)
df['Volume_Change'] = df['Volume'].pct_change().shift(1)
df['MA5_diff'] = (df['MA5'].shift(1) - df['MA10'].shift(1)) / df['MA10'].shift(1)
df['BB_Pressure'] = df['Close'].shift(1) / df['BB_Upper'].shift(1)
df['Target'] = (df['Close'] > df['Close'].shift(1)).astype(int)

features = [
    "Lag1_Close", "Lag1_Return", "Lag1_RSI", "Lag1_MACD", "Lag1_MACD_Signal",
    "Volume_Change", "MA5_diff", "BB_Pressure"
]

# Drop NA and define X, y
df_model = df[features + ['Target']].dropna()
X = df_model[features]
y = df_model['Target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

X_train = X_train.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
X_test  = X_test[X_train.columns]

# Scale features
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = X_train.values
#X_test_scaled = scaler.transform(X_test)
X_test_scaled = X_test.values

# Define models
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Evaluate models
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"\n{name}:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC AUC: {auc:.4f}")

    # Plot feature importance if available
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        importances = np.abs(model.coef_[0])
    else:
        continue

    plt.figure(figsize=(8, 5))
    sns.barplot(x=importances, y=features)
    plt.title(f'{name} Feature Importance')
    plt.tight_layout()
    plt.show()

# ROC comparison
plt.figure(figsize=(8, 6))
for name, model in models.items():
    y_prob = model.predict_proba(X_test_scaled)[:, 1]
    #y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")

plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



Logistic Regression:
Accuracy: 0.5078
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000
ROC AUC: 0.6003


ValueError: All arrays must be of the same length

<Figure size 800x500 with 0 Axes>