In [2]:
import pandas as pd 
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Raw string: keeps the Windows backslashes literal so sequences such as "\M",
# "\D" or "\s" are not treated as (invalid) escape sequences by the parser.
data = pd.read_csv(r'D:\Master_Folder\Data Science Course\Projects\StockMarket\stock_data\SUZLON.NS_2023-01-01_to_2024-11-21_ML.csv')

In [4]:
# Parse the 'Date' column into datetimes and promote it to the index in one
# chained expression, rebinding `data` instead of mutating it in place.
# NOTE(review): the recorded output of this cell shows a warning raised on the
# to_datetime call; it may stem from `dayfirst=True` not matching the file's
# date layout -- confirm against the CSV before changing the argument.
data = (
    data
    .assign(Date=pd.to_datetime(data['Date'], dayfirst=True))
    .set_index('Date')
)


  data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)


In [5]:
# Regression target: the 'Close' value of the following row (shift by -1).
# The shift leaves the final row without a target; dropna() then removes every
# row that contains a missing value in any column.
data = data.assign(Next_1_day_close=data['Close'].shift(-1)).dropna()

In [6]:
# Preview the first five rows of the prepared frame.
data.head(n=5)

Unnamed: 0_level_0,Close,Upward_Downward_Probability,Temporal_Features,Cluster,Anomaly,Next_1_day_close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-03-14,7.95,-1.011601,1.455739,2,1,7.95
2023-03-15,7.95,-1.011601,1.510488,2,1,7.9
2023-03-16,7.9,-2.639057,1.291494,2,1,8.0
2023-03-17,8.0,-2.639057,1.346242,2,1,7.75
2023-03-20,7.75,-2.639057,1.400991,2,1,7.8


In [7]:
def evaluate_feature_combinations(data, target_col, test_ratio=0.3):
    """Fit a linear regression on every non-empty feature subset and rank them.

    The rows of ``data`` are split positionally, without shuffling: the leading
    rows form the training set and the trailing ``test_ratio`` share forms the
    test set. For each subset of the non-target columns the features are
    standardised with a ``StandardScaler`` fitted on the training rows only, a
    ``LinearRegression`` is fitted, and the held-out rows are scored.

    Note that the number of fitted models is ``2 ** n_features - 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the candidate features and the target. Every column
        other than ``target_col`` is treated as a candidate feature.
    target_col : str
        Name of the column to predict.
    test_ratio : float, default 0.3
        Share of trailing rows held out for evaluation; must satisfy
        ``0 < test_ratio < 1``.

    Returns
    -------
    list of dict
        One entry per feature subset, sorted by descending R2. Each entry has
        the keys ``'Features'`` (list of column names), ``'mse'``, ``'R2'``,
        ``'MAPE'`` (in percent), ``'Coefficient'`` (coefficients on the scaled
        features) and ``'Intercept'``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``data``.
    ValueError
        If ``test_ratio`` is outside the open interval (0, 1), or if the split
        leaves the training or the test set without rows.
    """
    if target_col not in data.columns:
        raise KeyError(target_col)
    if not 0 < test_ratio < 1:
        raise ValueError(
            f"test_ratio must be strictly between 0 and 1, got {test_ratio!r}"
        )

    features = [col for col in data.columns if col != target_col]

    # Positional (order-preserving) split: no shuffling, the test set is the tail.
    train_size = int(len(data) * (1 - test_ratio))
    train_data = data.iloc[:train_size]
    test_data = data.iloc[train_size:]
    if len(train_data) == 0 or len(test_data) == 0:
        raise ValueError(
            f"Splitting {len(data)} rows with test_ratio={test_ratio!r} leaves "
            f"{len(train_data)} training and {len(test_data)} test rows; "
            "both sets need at least one row."
        )

    x_train_full = train_data[features]
    y_train = train_data[target_col]
    x_test_full = test_data[features]
    y_test = test_data[target_col]

    result = []
    for r in range(1, len(features) + 1):
        for feature_subset in combinations(features, r):
            feature_subset = list(feature_subset)

            # Fit the scaler on the training rows only, then reuse it for the
            # test rows so no test-set statistics leak into the model.
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train_full[feature_subset])
            x_test = scaler.transform(x_test_full[feature_subset])

            model = LinearRegression()
            model.fit(x_train, y_train)

            y_pred = model.predict(x_test)

            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            # Mean absolute percentage error in percent. A zero in y_test
            # yields inf/nan here, as no epsilon guard is applied.
            mape = (abs((y_test - y_pred) / y_test).mean()) * 100

            result.append({
                'Features': feature_subset,
                'mse': mse,
                'R2': r2,
                'MAPE': mape,
                'Coefficient': model.coef_,
                'Intercept': model.intercept_,
            })

    # Best-fitting subsets first; the sort is stable, so ties keep their
    # enumeration order.
    sorted_results = sorted(result, key=lambda x: x['R2'], reverse=True)

    return sorted_results

In [8]:
# Score every feature subset against the next-day closing price, using the
# function's default test_ratio.
results = evaluate_feature_combinations(
    data,
    target_col='Next_1_day_close',
)

In [11]:
# Report the ten highest-ranked feature subsets (results arrive sorted by R2).
top_entries = results[:10]
for rank, entry in enumerate(top_entries, start=1):
    print(f"Rank {rank}:")
    print(f"Features: {entry['Features']}")
    print(f"R²: {entry['R2']:.4f}, MAPE: {entry['MAPE']:.2f}%, MSE: {entry['mse']:.4f}")
    print(f"Coefficient: {entry['Coefficient']}, Intercept: {entry['Intercept']}\n")

Rank 1:
Features: ['Close', 'Upward_Downward_Probability', 'Cluster', 'Anomaly']
R²: 0.9739, MAPE: 2.40%, MSE: 3.7252
Coefficient: [ 1.33587637e+01  2.48163321e-03 -4.00906832e-02 -8.50735096e-02], Intercept: 28.15103806228374

Rank 2:
Features: ['Close', 'Cluster', 'Anomaly']
R²: 0.9739, MAPE: 2.40%, MSE: 3.7258
Coefficient: [13.35876274 -0.0417342  -0.08503489], Intercept: 28.15103806228374

Rank 3:
Features: ['Close', 'Upward_Downward_Probability', 'Temporal_Features', 'Cluster', 'Anomaly']
R²: 0.9739, MAPE: 2.41%, MSE: 3.7342
Coefficient: [ 1.33571786e+01  1.33150851e-02 -3.65546634e-02 -4.60922835e-03
 -8.46135841e-02], Intercept: 28.15103806228374

Rank 4:
Features: ['Close', 'Upward_Downward_Probability', 'Temporal_Features', 'Anomaly']
R²: 0.9738, MAPE: 2.41%, MSE: 3.7355
Coefficient: [13.35707616  0.01535674 -0.03934329 -0.08430575], Intercept: 28.15103806228374

Rank 5:
Features: ['Close', 'Temporal_Features', 'Cluster', 'Anomaly']
R²: 0.9738, MAPE: 2.41%, MSE: 3.7362
Coeffic