In [None]:
# Emit a fixed name string as the notebook's first output
# (presumably the author's name -- confirm).
print('Avishkar Dwivedi')

Avishkar Dwivedi


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
import warnings

# Silence every warning category for the rest of the session so cell outputs
# stay compact. NOTE: this also hides numerical warnings (e.g. divide-by-zero).
warnings.simplefilter('ignore')

print("Libraries imported successfully.")

Libraries imported successfully.


# Mount Google Drive and set the data paths

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV paths defined in the next
# cell resolve. Colab asks for authorization when this runs.
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Define the paths (converted from 'G:\' to Colab's Linux path format).
# The dataset directory is declared once so that relocating the data only
# requires editing a single line.
DATA_DIR = '/content/drive/My Drive/hull-tactical-market-prediction'

train_path = f"{DATA_DIR}/train.csv"
test_path = f"{DATA_DIR}/test.csv"

print(f"Train path set to: {train_path}")
print(f"Test path set to: {test_path}")

Train path set to: /content/drive/My Drive/hull-tactical-market-prediction/train.csv
Test path set to: /content/drive/My Drive/hull-tactical-market-prediction/test.csv


# 1. Load and Clean the data and handle missing values

In [None]:
def _load_sorted_filled(csv_path):
    """Read a CSV, order it by ``date_id`` and fill its missing values.

    The frame is sorted chronologically *before* forward-filling because
    ``ffill`` propagates values in row order: filling an unsorted frame would
    carry values between unrelated dates.

    Parameters
    ----------
    csv_path : str or path-like
        Location of a CSV file that contains a ``date_id`` column.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``date_id`` with a fresh 0..n-1 index and no NaNs.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.sort_values('date_id').reset_index(drop=True)
    # Financial data often has gaps: carry the last valid observation forward,
    # then set any leading NaNs (nothing earlier to carry) to 0.
    # NOTE(review): this fills every column, including the target columns in
    # the training file -- confirm those columns have no gaps.
    return frame.ffill().fillna(0)


# Both files go through the same steps so train and test are treated alike.
train_df = _load_sorted_filled(train_path)
test_df = _load_sorted_filled(test_path)

print(f"Train shape:{train_df.shape}")
print(f"Test shape :{test_df.shape}")
print('data loaded and clean')

Train shape:(9021, 98)
Test shape :(10, 99)
data loaded and clean


# 2. Feature engineering

## Select the input columns, excluding the target and ID columns.

In [None]:
# All provided feature families are used as inputs:
# E (Economics), I (Interest), M (Market), P (Price), V (Volatility).
# The regression target is the market's forward excess return.
target_col = 'market_forward_excess_returns'

# Columns that are targets, identifiers, or otherwise not predictive inputs.
exclude_cols = [
    'date_id',
    'forward_returns',
    'risk_free_rate',
    'market_forward_excess_returns',
    'is_scored',
    'lagged_forward_returns',
    'lagged_risk_free_rate',
    'lagged_market_forward_excess_returns'
]

# Keep the remaining columns in their original order.
feature_cols = list(
    filter(lambda name: name not in exclude_cols, train_df.columns)
)

print(f"slected {len(feature_cols)} features for training")


slected 94 features for training


# 3. Time Series split

## Split the data into training (past) and validation (most recent period), strictly avoiding look-ahead bias.

In [None]:
# Chronological hold-out: the earliest 85% of rows are used for fitting and
# the most recent 15% for validation (train_df is already sorted by date_id).
split_idx = int(0.85 * len(train_df))

train_part = train_df.iloc[:split_idx]
valid_part = train_df.iloc[split_idx:]

X_train, Y_train = train_part[feature_cols], train_part[target_col]
X_valid, Y_valid = valid_part[feature_cols], valid_part[target_col]

# Returns and risk-free rate of the validation rows, kept for the scoring
# calculation performed later.
valid_meta = valid_part[['date_id','forward_returns','risk_free_rate']]

print(f"Training sample:{len(X_train)}")
print(f"validation sample:{len(X_valid)}")

Training sample:7667
validation sample:1354


# 4. Model training.

## Build the gradient boosting model. This algorithm is robust and handles non-linear market patterns.

In [None]:
print("Initializing HistGradientBoostingRegressor...")

# Histogram-based gradient boosting, chosen for speed and accuracy on
# tabular data. Hyperparameters are gathered in one place for easy tuning.
gbm_params = {
    'max_iter': 300,           # number of boosting iterations (trees)
    'learning_rate': 0.01,     # small step size to limit overfitting
    'max_depth': 6,            # cap on tree depth to control complexity
    'l2_regularization': 1.0,  # shrinkage to reduce sensitivity to noise
    'random_state': 42,        # fixed seed for reproducibility
}
model = HistGradientBoostingRegressor(**gbm_params)

print("training model....")
model.fit(X_train, Y_train)

print("training complete")

Initializing HistGradientBoostingRegressor...
training model....
training complete


# 5. Strategy optimization

## Use the custom scoring logic to find the leverage multiplier (k) that maximizes the Sharpe ratio without triggering the volatility penalty.

In [None]:
# optimizing betting strategy (finding 'k')

# 1. Generate the raw predictions (alpha) for the validation rows. These are
#    the fitted model's estimates of the target column; the grid search below
#    rescales them into positions.
valid_preds = model.predict(X_valid)

# 2. Define the scoring function
def calculate_sharpe_score(predictions, meta_df):
    """Score a set of positions with a penalty-adjusted, annualized Sharpe ratio.

    For each row the strategy return is
    ``risk_free_rate * (1 - position) + position * forward_returns``.
    The annualized Sharpe ratio of the strategy's excess return is divided by
    two penalties:

    * volatility penalty: ``1 + max(0, strat_std / market_std - 1.2)``
    * return penalty: ``1 + gap**2 / 100`` when the annualized mean excess
      return of the market exceeds the strategy's by ``gap`` percent, else 1.

    Parameters
    ----------
    predictions : array-like or scalar
        Position for each row of ``meta_df``, in row order. A pandas Series is
        applied positionally (its index is ignored), so a Series whose index
        differs from ``meta_df.index`` is no longer silently turned into NaNs
        by index alignment.
    meta_df : pandas.DataFrame
        Must contain ``forward_returns`` and ``risk_free_rate`` columns.
        The frame is copied and never modified.

    Returns
    -------
    float
        The adjusted Sharpe ratio, or ``0.0`` when it is undefined: the
        strategy or market standard deviation is zero or not finite (for
        example with fewer than two rows).

    Raises
    ------
    ValueError
        Raised by pandas if ``predictions`` has a length different from
        ``meta_df``.
    """
    # Drop the Series index so values are matched to rows by position.
    if isinstance(predictions, pd.Series):
        predictions = predictions.to_numpy()

    solution = meta_df.copy()
    solution['position'] = predictions

    # Blend of the risk-free asset and the market according to the position.
    solution['strategy_returns'] = (
        solution['risk_free_rate'] * (1 - solution['position'])
        + solution['position'] * solution['forward_returns']
    )

    # Excess returns over the risk-free rate.
    strat_excess = solution['strategy_returns'] - solution['risk_free_rate']
    market_excess = solution['forward_returns'] - solution['risk_free_rate']

    strat_mean = strat_excess.mean()
    strat_std = strat_excess.std()
    market_std = market_excess.std()

    # Safety checks: the ratios below are undefined for NaN or zero
    # standard deviations, so report a neutral score instead of NaN/inf.
    if not (np.isfinite(strat_std) and np.isfinite(market_std)):
        return 0.0
    if strat_std == 0 or market_std == 0:
        return 0.0

    # Annualized Sharpe ratio (252 trading days per year).
    sharpe = (strat_mean / strat_std) * np.sqrt(252)

    # Volatility penalty: applies once the strategy is more than 1.2x as
    # volatile as the market.
    vol_ratio = strat_std / market_std
    vol_penalty = 1 + max(0, vol_ratio - 1.2)

    # Return penalty: applies only when the strategy underperforms the market.
    diff = (market_excess.mean() - strat_mean) * 100 * 252
    return_penalty = 1 + (diff ** 2) / 100 if diff > 0 else 1

    return float(sharpe / (vol_penalty * return_penalty))


# 3. Grid search for the best scaling factor 'k'
# Strategy: position = 1.0 + (prediction * k), limited to the range [0, 2]

candidate_scales = (0, 10, 30, 50, 80, 100, 150, 200)
divider = "-" * 25

# Running best; the first candidate wins ties because the comparison is strict.
best_score, best_k = -999, 0

print(f"{'Scale(k)':<10} | {'Score':<10}")
print(divider)

for k in candidate_scales:
    # Positions implied by the current scale, scored on the validation rows.
    current_pos = np.clip(1.0 + (valid_preds * k), 0, 2)
    score = calculate_sharpe_score(current_pos, valid_meta)

    print(f"{k:<10} | {score: .4f}")

    if not score > best_score:
        continue
    best_score, best_k = score, k

print(divider)

print(f"Optimal scalar (k) found: {best_k}")





Scale(k)   | Score     
-------------------------
0          |  0.8158
10         |  0.8102
30         |  0.7984
50         |  0.7859
80         |  0.7668
100        |  0.7547
150        |  0.7445
200        |  0.6922
-------------------------
Optimal scalar (k) found: 0


In [None]:
# 1. Score the test rows with the fitted model
test_preds = model.predict(test_df[feature_cols])

# 2-3. Apply the chosen strategy and keep it inside the valid range:
#      position = 1.0 (market) + alpha_prediction * best_k, clipped to [0, 2]
final_positions = np.clip(1.0 + (best_k * test_preds), 0, 2)

# 4. Assemble the submission table: one position per test date
submission = test_df[['date_id']].assign(prediction=final_positions)

# 5. Write it to disk
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)

print("Submission file created successfully!")
print(submission.head())

# Optional (Colab only): download the file straight away
# from google.colab import files
# files.download(submission_path)

Submission file created successfully!
   date_id  prediction
0     8980         1.0
1     8981         1.0
2     8982         1.0
3     8983         1.0
4     8984         1.0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>