In [1]:
import pandas as pd
import numpy as np
import os

In [3]:

# Define the folder path
folder_path = 'nifty_50'

# Initialize an empty dictionary to store the dataframes
dataframes = {}

# Loop through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        # Extract company name (removing '.csv' and converting to lowercase)
        company_name = file_name.replace('.csv', '').lower()
        # Read the CSV file into a DataFrame
        df = pd.read_csv(os.path.join(folder_path, file_name))
        # Assign the DataFrame to the dictionary with the formatted name
        dataframes[f"df_{company_name}"] = df

# Access individual dataframes using dataframes['df_companyname']

In [1]:
dataframes

NameError: name 'dataframes' is not defined

In [5]:
# Initialize a list to store processed data
all_data = []

In [22]:
import warnings
warnings.filterwarnings("ignore")

In [23]:
import talib
# Loop through each company's DataFrame
for key, df in dataframes.items():
    # Ensure DataFrame is sorted by Date
    df = df.sort_values('Date')
    
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    # Extract necessary columns and rename them if needed
    df = df[['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume']]
    
    # Handle missing values
    df.dropna(subset=['Open', 'High', 'Low', 'Close', 'Volume'], inplace=True)
    
    # Convert 'Date' to datetime
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Calculate technical indicators
    # 1. MACD
    df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = talib.MACD(
        df['Close'], fastperiod=12, slowperiod=26, signalperiod=9
    )
    
    # 2. RSI
    df['RSI'] = talib.RSI(df['Close'], timeperiod=14)
    
    # 3. Bollinger Bands
    df['Upper_BB'], df['Middle_BB'], df['Lower_BB'] = talib.BBANDS(
        df['Close'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0
    )
    
    # 4. Stochastic Oscillator
    df['SlowK'], df['SlowD'] = talib.STOCH(
        df['High'], df['Low'], df['Close'],
        fastk_period=14, slowk_period=3, slowk_matype=0,
        slowd_period=3, slowd_matype=0
    )
    
    # Fill NaN values resulting from indicator calculations
    df.fillna(method='bfill', inplace=True)
    df.fillna(method='ffill', inplace=True)
    
    # Calculate Future Price Change Percentage (e.g., 5 days ahead)
    prediction_window = 5
    df['Future_Close'] = df['Close'].shift(-prediction_window)
    df['Price_Change_Percent'] = ((df['Future_Close'] - df['Close']) / df['Close']) * 100
    
    # Assign labels based on Price Change and MACD
    def assign_label(row):
        macd_bullish = row['MACD'] > row['MACD_Signal']
        macd_bearish = row['MACD'] < row['MACD_Signal']
        if (row['Price_Change_Percent'] > 2.0) and macd_bullish:
            return 'Buy'
        elif (row['Price_Change_Percent'] < -2.0) and macd_bearish:
            return 'Sell'
        else:
            return 'Hold'
    
    df['Target'] = df.apply(assign_label, axis=1)
    
    # Drop rows with NaN in 'Target' or 'Future_Close'
    df.dropna(subset=['Target', 'Future_Close'], inplace=True)
    
    # Append processed DataFrame to the list
    all_data.append(df)

In [24]:
# 2. Combine Data from All Companies
combined_df = pd.concat(all_data, ignore_index=True)


In [25]:
# 3. Sort Combined DataFrame by Date
combined_df = combined_df.sort_values('Date').reset_index(drop=True)

In [26]:
# 4. Define Features and Target
feature_columns = [
    'Open', 'High', 'Low', 'Close', 'Volume', 'MACD', 'MACD_Signal',
    'MACD_Hist', 'RSI', 'Upper_BB', 'Middle_BB', 'Lower_BB', 'SlowK', 'SlowD'
]

X = combined_df[feature_columns]
y = combined_df['Target']

In [30]:
from sklearn.preprocessing import LabelEncoder
import joblib
# 5. Encode Target Labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # 'Buy'=0, 'Hold'=1, 'Sell'=2

# Save label encoder for future use
if not os.path.exists('models'):
    os.makedirs('models')
joblib.dump(label_encoder, 'models/label_encoder.pkl')


['models/label_encoder.pkl']

In [31]:
# 6. Handle Missing Values in Features
X.fillna(method='bfill', inplace=True)
X.fillna(method='ffill', inplace=True)
X.dropna(inplace=True)

# Adjust y_encoded accordingly
y_encoded = y_encoded[X.index]


In [33]:
# 7. Initialize TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)


In [34]:
# 8. Initialize a Dictionary to Store Results for Each Model
model_metrics = {
    'Random Forest': [],
    'XGBoost': [],
    'LightGBM': [],
    'CatBoost': []
}

In [38]:
!pip3 install Counter

Collecting Counter
  Downloading Counter-1.0.0.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: Counter
  Building wheel for Counter (setup.py) ... [?25ldone
[?25h  Created wheel for Counter: filename=Counter-1.0.0-py3-none-any.whl size=5394 sha256=75b2829a11df38e922061860779a9f1e0aa5e1899ad5a19e3b09184acd6bcc91
  Stored in directory: /Users/arjunraizada/Library/Caches/pip/wheels/16/ff/7a/6e8bf2fdadb47c50a03bb4b9a59bd2b1da1b876faf8e3815d9
Successfully built Counter
Installing collected packages: Counter
Successfully installed Counter-1.0.0


In [48]:
# 9. Iterate Over Each Fold
from sklearn.preprocessing import  StandardScaler
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    print(f"Fold {fold}")
    # Split the data
    X_train_cv = X.iloc[train_index]
    X_test_cv = X.iloc[test_index]
    y_train_cv = y_encoded[train_index]
    y_test_cv = y_encoded[test_index]
    
    # Feature Scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_cv)
    X_test_scaled = scaler.transform(X_test_cv)
    
    # Save scaler for each fold if needed
    # joblib.dump(scaler, f'models/scaler_fold_{fold}.pkl')
    
    # Handle Class Weights
    counter = Counter(y_train_cv)
    majority = max(counter.values())
    class_weights = {cls: float(majority)/count for cls, count in counter.items()}
    
    # Convert class_weights to a list for CatBoost
    class_weights_list = [class_weights[i] for i in sorted(class_weights.keys())]
    
    # ---------------------
    # Train Random Forest
    rf_model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight=class_weights
    )
    rf_model.fit(X_train_scaled, y_train_cv)
    y_pred_rf = rf_model.predict(X_test_scaled)
    acc_rf = accuracy_score(y_test_cv, y_pred_rf)
    f1_rf = f1_score(y_test_cv, y_pred_rf, average='macro')
    model_metrics['Random Forest'].append({'accuracy': acc_rf, 'f1_macro': f1_rf})
    
    # ---------------------
    # Train XGBoost
    xgb_model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        objective='multi:softprob',
        num_class=3,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
        scale_pos_weight=1  # Adjusted if necessary
    )
    xgb_model.fit(X_train_scaled, y_train_cv)
    y_pred_xgb = xgb_model.predict(X_test_scaled)
    acc_xgb = accuracy_score(y_test_cv, y_pred_xgb)
    f1_xgb = f1_score(y_test_cv, y_pred_xgb, average='macro')
    model_metrics['XGBoost'].append({'accuracy': acc_xgb, 'f1_macro': f1_xgb})
    
    # ---------------------
    # Train LightGBM
    lgb_model = lgb.LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        objective='multiclass',
        class_weight=class_weights,
        random_state=42
    )
    lgb_model.fit(X_train_scaled, y_train_cv)
    y_pred_lgb = lgb_model.predict(X_test_scaled)
    acc_lgb = accuracy_score(y_test_cv, y_pred_lgb)
    f1_lgb = f1_score(y_test_cv, y_pred_lgb, average='macro')
    model_metrics['LightGBM'].append({'accuracy': acc_lgb, 'f1_macro': f1_lgb})
    
    # ---------------------
    # Train CatBoost
    cat_model = CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        depth=6,
        loss_function='MultiClass',
        random_seed=42,
        verbose=0,
        class_weights=class_weights_list
    )
    cat_model.fit(X_train_scaled, y_train_cv)
    y_pred_cat = cat_model.predict(X_test_scaled)
    acc_cat = accuracy_score(y_test_cv, y_pred_cat)
    f1_cat = f1_score(y_test_cv, y_pred_cat, average='macro')
    model_metrics['CatBoost'].append({'accuracy': acc_cat, 'f1_macro': f1_cat})
    
    # Optionally, print classification reports for each model
    # print("Random Forest Classification Report:")
    # print(classification_report(y_test_cv, y_pred_rf, target_names=label_encoder.classes_))
    # Similarly for other models


Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 78319, number of used features: 14
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Fold 2
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001929 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 156634, number of used features: 14
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Fold 3
[LightGBM] [

In [49]:
# 10. Aggregate and Display Results
def aggregate_metrics(metrics_list, model_name):
    accuracies = [m['accuracy'] for m in metrics_list]
    f1_scores = [m['f1_macro'] for m in metrics_list]
    print(f"=== {model_name} ===")
    print(f"Average Accuracy: {np.mean(accuracies):.4f}")
    print(f"Average Macro F1-Score: {np.mean(f1_scores):.4f}")
    print("\n")

for model_name, metrics in model_metrics.items():
    aggregate_metrics(metrics, model_name)

=== Random Forest ===
Average Accuracy: 0.6635
Average Macro F1-Score: 0.3604


=== XGBoost ===
Average Accuracy: 0.6849
Average Macro F1-Score: 0.3023


=== LightGBM ===
Average Accuracy: 0.3388
Average Macro F1-Score: 0.3537


=== CatBoost ===
Average Accuracy: 0.3064
Average Macro F1-Score: 0.3137




In [50]:
# 12. Train Final Models on Entire Training Data (excluding final test set)

# Get the indices for all folds
folds = list(tscv.split(X))
train_indices = []
for i in range(len(folds) - 1):  # Exclude the last fold for final testing
    train_indices.extend(folds[i][0])
final_test_index = folds[-1][1]

X_train_final = X.iloc[train_indices]
y_train_final = y_encoded[train_indices]
X_test_final = X.iloc[final_test_index]
y_test_final = y_encoded[final_test_index]

# Feature Scaling
scaler_final = StandardScaler()
X_train_final_scaled = scaler_final.fit_transform(X_train_final)
X_test_final_scaled = scaler_final.transform(X_test_final)

# Save the final scaler
joblib.dump(scaler_final, 'models/scaler_final.pkl')

# Handle Class Weights
counter_final = Counter(y_train_final)
majority_final = max(counter_final.values())
class_weights_final = {cls: float(majority_final)/count for cls, count in counter_final.items()}

# Convert class_weights to a list for CatBoost (if needed)
class_weights_list_final = [class_weights_final[i] for i in sorted(class_weights_final.keys())]


In [51]:
# Train the final Random Forest model
final_rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight=class_weights_final
)
final_rf_model.fit(X_train_final_scaled, y_train_final)

# Save the final Random Forest model
joblib.dump(final_rf_model, 'models/random_forest_final.pkl')

['models/random_forest_final.pkl']

In [54]:
from sklearn.metrics import classification_report, confusion_matrix
# Evaluate the final Random Forest model on the test set
y_pred_rf_final = final_rf_model.predict(X_test_final_scaled)
acc_rf_final = accuracy_score(y_test_final, y_pred_rf_final)
f1_rf_final = f1_score(y_test_final, y_pred_rf_final, average='macro')

print("=== Final Random Forest Model Evaluation ===")
print(f"Accuracy: {acc_rf_final:.4f}")
print(f"Macro F1-Score: {f1_rf_final:.4f}")
print("Classification Report:")
print(classification_report(y_test_final, y_pred_rf_final, target_names=label_encoder.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test_final, y_pred_rf_final))

=== Final Random Forest Model Evaluation ===
Accuracy: 0.6698
Macro F1-Score: 0.3479
Classification Report:
              precision    recall  f1-score   support

         Buy       0.36      0.10      0.16     12523
        Hold       0.70      0.93      0.80     54370
        Sell       0.29      0.05      0.09     11422

    accuracy                           0.67     78315
   macro avg       0.45      0.36      0.35     78315
weighted avg       0.58      0.67      0.59     78315

Confusion Matrix:
[[ 1250 11273     0]
 [ 2270 50586  1514]
 [    0 10802   620]]


In [55]:
# Train the final XGBoost model
final_xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    objective='multi:softprob',
    num_class=3,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
    scale_pos_weight=1  # Adjusted if necessary
)
final_xgb_model.fit(X_train_final_scaled, y_train_final)

# Save the final XGBoost model
joblib.dump(final_xgb_model, 'models/xgboost_final.pkl')

['models/xgboost_final.pkl']

In [56]:
# Evaluate the final XGBoost model on the test set
y_pred_xgb_final = final_xgb_model.predict(X_test_final_scaled)
acc_xgb_final = accuracy_score(y_test_final, y_pred_xgb_final)
f1_xgb_final = f1_score(y_test_final, y_pred_xgb_final, average='macro')

print("=== Final XGBoost Model Evaluation ===")
print(f"Accuracy: {acc_xgb_final:.4f}")
print(f"Macro F1-Score: {f1_xgb_final:.4f}")
print("Classification Report:")
print(classification_report(y_test_final, y_pred_xgb_final, target_names=label_encoder.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test_final, y_pred_xgb_final))

=== Final XGBoost Model Evaluation ===
Accuracy: 0.6833
Macro F1-Score: 0.3144
Classification Report:
              precision    recall  f1-score   support

         Buy       0.46      0.03      0.05     12523
        Hold       0.70      0.97      0.81     54370
        Sell       0.29      0.05      0.09     11422

    accuracy                           0.68     78315
   macro avg       0.48      0.35      0.31     78315
weighted avg       0.60      0.68      0.58     78315

Confusion Matrix:
[[  320 12203     0]
 [  372 52622  1376]
 [    0 10850   572]]


In [57]:
from sklearn.ensemble import VotingClassifier

# Initialize the ensemble with the two models
final_voting_clf = VotingClassifier(
    estimators=[
        ('rf', final_rf_model),
        ('xgb', final_xgb_model)
    ],
    voting='soft'  # Use soft voting to average predicted probabilities
)

# Train the ensemble (since models are already trained, fit may not be necessary)
# But for safety, we can retrain on the same data
final_voting_clf.fit(X_train_final_scaled, y_train_final)

# Save the ensemble model
joblib.dump(final_voting_clf, 'models/voting_classifier_final.pkl')

# Evaluate the ensemble model on the test set
y_pred_voting_final = final_voting_clf.predict(X_test_final_scaled)
acc_voting_final = accuracy_score(y_test_final, y_pred_voting_final)
f1_voting_final = f1_score(y_test_final, y_pred_voting_final, average='macro')

print("=== Final Voting Classifier Model Evaluation ===")
print(f"Accuracy: {acc_voting_final:.4f}")
print(f"Macro F1-Score: {f1_voting_final:.4f}")
print("Classification Report:")
print(classification_report(y_test_final, y_pred_voting_final, target_names=label_encoder.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test_final, y_pred_voting_final))

=== Final Voting Classifier Model Evaluation ===
Accuracy: 0.6876
Macro F1-Score: 0.3136
Classification Report:
              precision    recall  f1-score   support

         Buy       0.43      0.04      0.07     12523
        Hold       0.70      0.97      0.81     54370
        Sell       0.33      0.03      0.06     11422

    accuracy                           0.69     78315
   macro avg       0.49      0.35      0.31     78315
weighted avg       0.60      0.69      0.58     78315

Confusion Matrix:
[[  482 12041     0]
 [  630 53008   732]
 [    0 11062   360]]


In [58]:
df.head()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,MACD,MACD_Signal,MACD_Hist,RSI,Upper_BB,Middle_BB,Lower_BB,SlowK,SlowD,Future_Close,Price_Change_Percent,Target
0,2000-01-03,BAJAUTOFIN,49.45,50.75,46.5,50.75,7600,-0.241281,-0.674671,0.43339,30.788177,48.473312,41.515,34.556688,15.668777,21.354488,42.9,-15.46798,Hold
1,2000-01-04,BAJAUTOFIN,53.2,53.2,47.9,48.1,5000,-0.241281,-0.674671,0.43339,30.788177,48.473312,41.515,34.556688,15.668777,21.354488,40.1,-16.632017,Hold
2,2000-01-05,BAJAUTOFIN,46.55,47.4,44.6,44.6,3500,-0.241281,-0.674671,0.43339,30.788177,48.473312,41.515,34.556688,15.668777,21.354488,39.0,-12.556054,Hold
3,2000-01-06,BAJAUTOFIN,43.5,46.0,42.1,45.25,6200,-0.241281,-0.674671,0.43339,30.788177,48.473312,41.515,34.556688,15.668777,21.354488,39.5,-12.707182,Hold
4,2000-01-07,BAJAUTOFIN,48.0,48.0,42.0,42.9,3500,-0.241281,-0.674671,0.43339,30.788177,48.473312,41.515,34.556688,15.668777,21.354488,39.9,-6.993007,Hold
