In [1]:
# !pip install pandas numpy matplotlib seaborn scikit-learn lightgbm shap


In [2]:
# 1) Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# 2) Import CSV data

train = pd.read_csv('C:\\Users\\Dinesh\\Documents\\Python Scripts\\burnout-datathon-ieeecsmuj\\train.csv')
test = pd.read_csv('C:\\Users\\Dinesh\\Documents\\Python Scripts\\burnout-datathon-ieeecsmuj\\test.csv')
val = pd.read_csv('C:\\Users\\Dinesh\\Documents\\Python Scripts\\burnout-datathon-ieeecsmuj\\val.csv')
sample_submission = pd.read_csv('C:\\Users\\Dinesh\\Documents\\Python Scripts\\burnout-datathon-ieeecsmuj\\sample_submission.csv')



In [4]:
'''
# 3) Briefly visualize the data
print(train.head())
print(train.describe())
sns.histplot(train['Lap_Time_Seconds'], kde=True)
plt.title("Distribution of Lap Time Seconds")
plt.show()
'''


'\n# 3) Briefly visualize the data\nprint(train.head())\nprint(train.describe())\nsns.histplot(train[\'Lap_Time_Seconds\'], kde=True)\nplt.title("Distribution of Lap Time Seconds")\nplt.show()\n'

In [5]:

# 4) Add features
def add_features(df):
    df['LapTime_Estimate'] = df['Circuit_Length_km'] / df['Avg_Speed_kmh'] * 3600
    df['Points_per_Year'] = df['Championship_Points'] / (df['years_active'] + 1)
    df['Finish_Rate'] = df['finishes'] / (df['starts'] + 1)
    df['Podium_Rate'] = df['podiums'] / (df['starts'] + 1)
    df['Win_Rate'] = df['wins'] / (df['starts'] + 1)
    df['Avg_Temp'] = (df['Ambient_Temperature_Celsius'] + df['Track_Temperature_Celsius']) / 2
    return df

train = add_features(train)
test = add_features(test)

TARGET = 'Lap_Time_Seconds'
DROP_COLS = [
    'Unique ID', 'rider_name', 'team_name', 'bike_name',
    'circuit_name', 'points', 'position'
]

X = train.drop(DROP_COLS + [TARGET], axis=1)
y = train[TARGET]
X_test = test.drop(DROP_COLS, axis=1)

# Handle missing values and categorical encoding
all_data = pd.concat([X, X_test], axis=0)
all_data.fillna(-1, inplace=True)
cat_cols = all_data.select_dtypes(include='object').columns
for col in cat_cols:
    all_data[col] = all_data[col].astype('category').cat.codes

X = all_data.iloc[:len(train)]
X_test = all_data.iloc[len(train):]


In [6]:

# 5) Train-test split and model training with early stopping
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

params = {
    'bagging_fraction': 0.6307598531013435,
    'feature_fraction': 0.9100021508065013,
    'lambda_l1': 9.794332222263648,
    'lambda_l2': 1.5125116822949636,
    'min_child_samples': 76,
    'min_child_weight': 6.517089323249935,
    'num_leaves': 92,
    'metric': 'rmse',
    'objective': 'regression',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.25457289118658527,
    'bagging_freq': 5
}

model = lgb.train(params, lgb_train, num_boost_round=20000,
                  valid_sets=[lgb_train, lgb_val],
                  callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=25)
    ])


Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 10.824	valid_1's rmse: 10.8598
[50]	training's rmse: 10.2332	valid_1's rmse: 10.2862
[75]	training's rmse: 9.72054	valid_1's rmse: 9.78304
[100]	training's rmse: 9.2067	valid_1's rmse: 9.28111
[125]	training's rmse: 8.73666	valid_1's rmse: 8.81529
[150]	training's rmse: 8.32399	valid_1's rmse: 8.4122
[175]	training's rmse: 7.95797	valid_1's rmse: 8.04966
[200]	training's rmse: 7.61613	valid_1's rmse: 7.70812
[225]	training's rmse: 7.2798	valid_1's rmse: 7.37421
[250]	training's rmse: 6.94777	valid_1's rmse: 7.05142
[275]	training's rmse: 6.62318	valid_1's rmse: 6.73175
[300]	training's rmse: 6.34098	valid_1's rmse: 6.45071
[325]	training's rmse: 6.06698	valid_1's rmse: 6.17816
[350]	training's rmse: 5.79612	valid_1's rmse: 5.90997
[375]	training's rmse: 5.55355	valid_1's rmse: 5.67072
[400]	training's rmse: 5.31546	valid_1's rmse: 5.43462
[425]	training's rmse: 5.09525	valid_1's rmse: 5.21722
[450]	train

In [7]:

# 6) Predict and evaluate
val_preds = model.predict(X_val)
rmse = mean_squared_error(y_val, val_preds)
print("Validation RMSE:", rmse)


Validation RMSE: 0.047584439894809344


In [8]:
'''
# 7) SHAP explainability
explainer = shap.Explainer(model, X)
shap_values = explainer(X)
shap.summary_plot(shap_values, X)
'''

'\n# 7) SHAP explainability\nexplainer = shap.Explainer(model, X)\nshap_values = explainer(X)\nshap.summary_plot(shap_values, X)\n'

In [9]:

# 8) Predict on test set and create output
preds = model.predict(X_test)
out = pd.DataFrame({
    'Unique ID': test['Unique ID'],
    'Lap_Time_Seconds': preds
})

out.to_csv("LGBMsolution.csv", index=False)
print(out.head())

   Unique ID  Lap_Time_Seconds
0     288307         90.378858
1     704288        104.147832
2     951491         86.339750
3    2591721        109.712509
4    1202653         99.216205
