In [5]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import pandas as pd

# Load your data
X = pd.read_csv("data/train_features.csv", index_col=0)
y = pd.read_csv("data/train_target.csv", index_col=0)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train your SVR model with the RBF kernel
model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
model.fit(X_train, y_train)

# Calculate and print the R-squared score
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Calculate feature importances using permutation importance
perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

# Get feature importances
importances = perm_importance.importances_mean

# Pair feature names with their importances
feature_names = X.columns
feature_importances = list(zip(feature_names, importances))

# Sort feature importances in descending order
sorted_feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)

# Print the most important features
for feature, importance in sorted_feature_importances:
    print(f"Feature: {feature}, Importance: {importance}")


  y = column_or_1d(y, warn=True)


Mean Squared Error: 14.500409172440818
R-squared: 0.001036444647256718
Feature: CL=F, Importance: 0.008524010146332462
Feature: 000001.SS, Importance: 0.004995977996405195
Feature: 000001.SS_ch, Importance: 0.0036955582990245224
Feature: ^N225_ch, Importance: 0.0034735870214927596
Feature: GC=F, Importance: 0.0031737626273991458
Feature: volume_ch, Importance: 0.00313858730134845
Feature: ^N225, Importance: 0.0023746946720944093
Feature: ^N100, Importance: 0.0020009681023409388
Feature: GC=F_ch, Importance: 0.0009988611986898302
Feature: open_ch, Importance: 0.0005357030536394802
Feature: ^DJI, Importance: 0.00038253958034100886
Feature: volume, Importance: 0.00035850468199169667
Feature: HG=F_ch, Importance: -3.7204663828149796e-05
Feature: AAPL_ch, Importance: -0.0010938202194113419
Feature: NVDA_ch, Importance: -0.0020599321131733314


In [3]:
import pandas as pd

# Create a DataFrame with the feature importances
importance_df = pd.DataFrame(sorted_feature_importances, columns=['Feature', 'Importance'])

# Set a threshold for importance
threshold = 0.0001  # You can adjust this threshold based on your preference

# Filter features with importance above the threshold
selected_features = importance_df[importance_df['Importance'] > threshold]

# Extract the feature names that are selected
selected_feature_names = selected_features['Feature']

# Drop the least important features from your original DataFrame
X_selected = X[selected_feature_names]

# Now, X_selected contains only the selected features
X_selected

Unnamed: 0,CL=F,^N100,000001.SS_ch,000001.SS,GC=F,volume_ch,HG=F_ch,GC=F_ch,^N225,open_ch,volume,^DJI,AAPL_ch,NVDA_ch,^N225_ch
2014-09-19,1.727323,-1.324818,0.522846,-2.378455,-0.952315,0.126741,-0.122044,-1.121153,-1.461648,-1.984711,-0.853183,-1.362974,-0.582350,-0.804450,1.499617
2014-09-20,1.727323,-1.324818,-0.013193,-2.378455,-0.952315,-0.217110,-0.009643,-0.022263,-1.461648,-1.922540,-0.853238,-1.362974,-0.046092,-0.066117,-0.025283
2014-09-21,1.727323,-1.324818,-0.013193,-2.378455,-0.952315,-0.894279,-0.009643,-0.022263,-1.461648,0.863609,-0.853772,-1.362974,-0.046092,-0.066117,-0.025283
2014-09-22,1.678999,-1.348030,-1.573203,-2.487532,-0.947202,-0.390825,-1.430568,0.137589,-1.487321,-0.645680,-0.853899,-1.380079,0.019047,-0.463145,-0.707631
2014-09-23,1.681171,-1.426269,0.782683,-2.432829,-0.932886,2.201991,-0.109766,0.424764,-1.487321,0.149778,-0.852810,-1.398742,0.982109,-0.234968,-0.025283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-16,0.804830,1.943364,-0.013193,0.124238,1.586411,0.027942,-0.009643,-0.022263,2.117498,-0.082762,-0.412722,1.389752,-0.046092,-0.066117,-0.025283
2023-07-17,0.735874,1.874161,-0.809091,0.046890,1.560165,1.353683,-2.003815,-0.531030,2.117498,-0.094837,-0.167334,1.401945,1.092023,0.803726,-0.025283
2023-07-18,0.822748,1.902077,-0.350977,0.014348,1.644698,-0.162629,-0.372490,1.622831,2.140356,-0.143650,-0.172595,1.460514,-0.134239,0.820340,0.280838
2023-07-19,0.801029,1.920276,0.016089,0.017159,1.645721,-0.349363,-0.385397,-0.002609,2.229922,-0.307087,-0.225079,1.477974,0.418987,-0.416178,1.170409


In [4]:
X_selected.to_csv(f"data/train_features.csv", index=True)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Define the parameter grid to search
param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1]
}

# Create the SVR model
svr = SVR(kernel='rbf')

# Perform grid search
grid_search = GridSearchCV(svr, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Fit the best model to your data
best_model.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu