In [1]:
import os
import pandas as pd
import sklearn
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
import glob
import pandas as pd
import os

csv_files = glob.glob('stock_features/*.csv')
df_list = []

for file in csv_files:
    df = pd.read_csv(file)
    
    # Extract stock symbol from file name (e.g., 'AAPL.csv' â†’ 'AAPL')
    symbol = os.path.basename(file).replace('.csv', '')
    
    # Add symbol column
    df['symbol'] = symbol
    
    df_list.append(df)

# Combine all DataFrames
df = pd.concat(df_list, ignore_index=True)


In [3]:
df['Future_Close'] = df.groupby('symbol')['Close'].shift(-1)
df['Target'] = (df['Future_Close'] > df['Close']).astype(int)

In [4]:
essential_cols = [
    'Future_Close',  # Needed to compute Target
    'return_1d', 'return_5d', 'log_return',
    'sma_20', 
    'ema_9', 'ema_26',
    'RSI_14', 'RSI_30',
    'roc_10', 'slope_10',
    'rolling_std_10', 'rolling_std_20',
    'atr_14',
    'bollinger_high', 'bollinger_low', 'bb_width',
    'volume_change_1d', 'volume_spike',
    'obv',
    'candle_body', 'upper_wick', 'lower_wick', 'candle_range', 'body_ratio', 'doji_flag',
    'close_lag1', 'close_lag2', 'close_lag3','close_lag4', 'close_lag5',
    'return_lag1', 'return_lag2', 'return_lag3','return_lag4', 'return_lag5',
]

# Drop rows where any essential value is missing
df.dropna(subset=essential_cols, inplace=True)


In [5]:

features = ['return_1d', 'return_5d', 'return_10d', 'log_return',
            'sma_20', 'sma_50','sma_200',
            'ema_9', 'ema_26', 'ema_50',
            'macd_9', 'macd_signal_9', 'macd_hist_9',
            'roc_10', 'slope_10', 'rolling_std_10','rolling_std_20','rolling_std_50',
            'atr_14', 'RSI_14', 'RSI_30',
            'bollinger_high', 'bollinger_low', 'bb_width',
            'zscore_20', 'volume_change_1d', 'volume_avg_20','volume_spike',
            'obv', 'OBV_100', 'OBV_50',
            'candle_body','upper_wick', 'lower_wick' ,'candle_range','body_ratio','doji_flag',
            'close_lag1', 'close_lag2', 'close_lag3','close_lag4', 'close_lag5',
            'return_lag1', 'return_lag2', 'return_lag3','return_lag4', 'return_lag5',
            'Stoch_Osc_14', 'Stoch_Signal_14',
            'fib_382', 'fib_500', 'fib_618' ]


In [6]:
train_list = []
test_list = []

for symbol, group in df.groupby('symbol'):
    group = group.sort_values('Date')  # ensure time order
    split_idx = int(len(group) * 0.8)
    
    train_list.append(group.iloc[:split_idx])
    test_list.append(group.iloc[split_idx:])

# Combine all stocks' splits
train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

# Final features/labels
X_train = train_df[features]
y_train = train_df['Target']

X_test = test_df[features]
y_test = test_df['Target']


In [7]:
print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

Training data shape: (65604, 52), (65604,)
Testing data shape: (16434, 52), (16434,)


In [8]:
model = XGBClassifier(objective='binary:logistic',
                      eval_metric='logloss',
                      use_label_encoder=False, # Suppress the warning
                      n_estimators=100,      # Number of boosting rounds (trees)
                      learning_rate=0.1,     # Step size shrinkage to prevent overfitting
                      max_depth=3,           # Maximum depth of a tree
                      subsample=0.8,         # Subsample ratio of the training instance
                      colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
                      random_state=42)       # For reproducibility

# Train the model


In [9]:
model.fit(X_train, y_train)
print("XGBoost model trained successfully!")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost model trained successfully!


In [12]:
# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1] # Probabilities for the positive class (Bullish)
# print (f"Predictions made on the test set: {len(y_pred)} samples")
# print (f"Predicted probabilities for the positive class: {y_pred_proba[:5]}...")  # Display first 5 probabilities
print("\n--- Model Evaluation ---")
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


--- Model Evaluation ---
Accuracy: 0.5120

Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.24      0.32      7651
           1       0.53      0.75      0.62      8783

    accuracy                           0.51     16434
   macro avg       0.49      0.49      0.47     16434
weighted avg       0.50      0.51      0.48     16434



In [11]:
import pandas as pd
import os
import glob

predictions = []

# Loop through all stock CSVs
for file_path in glob.glob('stock_features/*.csv'):
    stock_name = os.path.basename(file_path).replace('.csv', '')
    
    try:
        df = pd.read_csv(file_path)
        df = df.dropna()

        # Get the same features used in training
        X_stock = df[model.feature_names_in_]
        
        # Select the latest row (for next-day prediction)
        latest_row = X_stock.iloc[-1:]
        
        # Predict
        pred_class = model.predict(latest_row)[0]
        pred_proba = model.predict_proba(latest_row)[0][1]  # Probability of 'Up'

        # Store result
        predictions.append({
            'Stock': stock_name,
            'Prediction': int(pred_class),
            'Probability_Up': round(pred_proba, 4)
        })
    
    except Exception as e:
        print(f"Error processing {stock_name}: {e}")

# Convert to DataFrame and sort by confidence
results_df = pd.DataFrame(predictions).sort_values(by='Probability_Up', ascending=False)

# Show top N stocks likely to go up
print(results_df.head(33))


    Stock  Prediction  Probability_Up
27     VZ           1          0.5364
9    SBUX           1          0.5227
15    RTX           1          0.5224
12   AMZN           1          0.5204
25  GOOGL           1          0.5199
13     KO           1          0.5192
5     BAC           1          0.5190
0    CSCO           1          0.5185
8     LOW           1          0.5182
31    WMT           1          0.5174
26    PFE           1          0.5150
21     GE           1          0.5147
11    MCD           1          0.5127
17    CVX           1          0.5114
23    JPM           1          0.5083
2    GILD           1          0.5080
19    WFC           1          0.5063
4       T           1          0.5032
6     PEP           1          0.5003
16    XOM           0          0.4991
20    DIS           0          0.4987
30   AAPL           0          0.4983
32    JNJ           0          0.4974
18    IBM           0          0.4971
28   AMGN           0          0.4965
3       V   