In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries
# (a single entry here). The embedded URL is a signed link carrying
# X-Goog-Date=20240628T082656Z and X-Goog-Expires=259200, so it is time-limited;
# once it lapses the download loop below reports the source as "likely expired".
DATA_SOURCE_MAPPING = 'f1-analyze-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5298232%2F8808953%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240628%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240628T082656Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3cc91bdda13e67dd6b0392b05482d161da6d977ee714eee500a0ed5d3f36e2d3e1c2229c95a522e7f7cc70863f591533fde8983da6f82a18600d679ee9d3c3427d9612e93dbc044ec195c2a567f001ce780ba9f0aa86309744cb65fe8bd86ea4065c0f3700b52b5da722fd698b6c371cc7b4aa1b8552a28f362b035fe1b9c8b5fb05584409fcc72627ac53271c043dc3ef4317183c30a9dc01e444e921f8e2d5b4396d17163c246f548e1f788b89057bec36680233a7b5417fe8311e90b9eb5c027e14e2ee895295115312bc3654686335442d45b7ec427e6b860b161089fc7b1fead24562d1dbf9bc91baba58c59054a82669aa523f4981cad6ec21f26378fb'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<encoded url>' entry of DATA_SOURCE_MAPPING and
# unpack it into KAGGLE_INPUT_PATH/<directory>. A source that fails to download
# or unpack is reported and skipped so the remaining sources are still tried.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an unexpected extra colon in the URL part
    # cannot break the unpacking into exactly two fields.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header lookup yields None when the server sends no
            # Content-Length; in that case only the byte counter is shown and
            # the progress bar stays empty instead of raising on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would replace the module and break the next tar source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading f1-analyze-dataset, 57574882 bytes compressed
Downloaded and uncompressed: f1-analyze-dataset
Data source import complete.


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
import warnings

# Suppress every Python warning from here on.
warnings.filterwarnings("ignore")

# Read the three CSV files in the order train, test, validation:
#   X -> train.csv, p -> test.csv, v -> validation.csv
X, p, v = map(pd.read_csv, (
    "/kaggle/input/f1-analyze-dataset/train.csv",
    "/kaggle/input/f1-analyze-dataset/test.csv",
    "/kaggle/input/f1-analyze-dataset/validation.csv",
))

# Columns removed from the train and validation frames before modelling.
# 'position' is in this list because it is the prediction target; it is
# captured separately (t1 / t2) before the drop.
columns_to_drop = [
    'fp1_date', 'position', 'number', 'driverRef', 'driver_num', 'driver_code',
    'forename', 'surname', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date',
    'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time',
    'url_x', 'url_y', 'url', 'positionText_y', 'positionText_x', 'position_x',
    'grand_prix', 'status', 'nationality_y', 'constructorRef', 'company',
    'dob', 'nationality', 'result_driver_standing', 'resultId', 'raceId_y',
    'timetaken_in_millisec', 'fastestLapTime', 'time_x', 'fastestLap',
    'max_speed', 'date', 'time_y',
]

# Targets first, while the 'position' column is still present.
t1, t2 = X['position'], v['position']

# Keep only the feature columns in each frame.
X = X.drop(columns=columns_to_drop)
v = v.drop(columns=columns_to_drop)

def _coerce_rank(frame):
    """Turn the 'rank' column of `frame` into a nullable integer column.

    The '\\N' placeholder, and any other value that cannot be parsed as a
    number, becomes a missing value; the column is then stored as pandas'
    nullable 'Int64' dtype.

    Args:
        frame: DataFrame with a 'rank' column; the column is replaced in place.

    Returns:
        The same DataFrame object, to allow `df = _coerce_rank(df)`.
    """
    cleaned = frame['rank'].replace('\\N', np.nan)
    frame['rank'] = pd.to_numeric(cleaned, errors='coerce').astype('Int64')
    return frame


# Apply the identical cleaning to the training and the validation features:
# both are passed through the same fitted preprocessor later, so 'rank' must
# have the same representation in each of them.
X = _coerce_rank(X)
v = _coerce_rank(v)

# Per-column preprocessing: fill missing values with an IterativeImputer
# (at most 100 iterations, tolerance 0.001, fixed seed), then standardize.
pipe = Pipeline(
    steps=[
        ('imputer', IterativeImputer(max_iter=100, tol=0.001, random_state=42)),
        ('scaler', StandardScaler()),
    ]
)

# Run that pipeline over every column currently present in X.
preprocessor = ColumnTransformer(transformers=[('pi', pipe, X.columns)])


In [None]:
# Hyperopt search space; each entry is passed as a keyword argument to
# XGBRegressor by objective().
# NOTE(review): the recorded fmin output lists n_estimators=5 and max_depth=9,
# which are not values from the lists below -- hp.choice parameters appear to
# be reported as the position of the chosen option, not the option itself.
space = {
        'n_estimators' : hp.choice('n_estimators', [500, 550 ,600 ,650 ,700,750,800]),
        'max_depth': hp.choice('max_depth', range(2, 13)),
        'learning_rate': hp.uniform('learning_rate', 0.0005, 0.002),
        'min_child_weight': hp.uniform('min_child_weight', 1, 18),
        'gamma': hp.loguniform('gamma', low=np.log(0.001), high=np.log(5)),  # gamma between 0.001 and 5 (log-uniform)
        'reg_alpha': hp.loguniform('reg_alpha', low=np.log(0.0001), high=np.log(0.8)),  # alpha between 0.0001 and 0.8 (log-uniform)
        'reg_lambda': hp.loguniform('reg_lambda', low=np.log(1), high=np.log(5)),  # lambda between 1 and 5 (log-uniform)
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
        'colsample_bynode': hp.uniform('colsample_bynode', 0.6, 1.0),
        'colsample_bylevel': hp.uniform('colsample_bylevel', 0.6, 1.0)
    }

In [None]:
# Fit the preprocessor on the training features only, then reuse the fitted
# preprocessor (no refit) for the validation features.
X_pre = preprocessor.fit_transform(X)
V_pre = preprocessor.transform(v)

In [None]:
# Show the (rows, columns) shape of the preprocessed training matrix.
X_pre.shape

(2830101, 15)

In [None]:
def objective(space):
    """Hyperopt objective: train an XGBRegressor and score it on validation data.

    Args:
        space: dict of sampled hyperparameters; every entry is forwarded as a
            keyword argument to XGBRegressor.

    Returns:
        dict with 'loss' set to the RMSE between the validation targets (t2)
        and the predictions for V_pre, and 'status' set to STATUS_OK.
    """
    regressor = XGBRegressor(
        device='cuda',
        random_state=42,
        **space,
        objective='reg:squarederror',
        n_jobs=-1,
    )

    # Train on the preprocessed training matrix with the flattened target.
    regressor.fit(X_pre, t1.values.ravel())

    # Lower is better: hyperopt minimizes the validation RMSE.
    validation_truth = t2.values.ravel()
    validation_pred = regressor.predict(V_pre)
    validation_rmse = np.sqrt(mean_squared_error(validation_truth, validation_pred))

    return {'loss': validation_rmse, 'status': STATUS_OK}

# Run hyperparameter optimization using Hyperopt (TPE, 100 evaluations, seeded).
# space_eval is imported here so this cell works without touching the import cell.
from hyperopt import space_eval

trials = Trials()
best_raw = fmin(objective, space, rstate=np.random.default_rng(42), algo=tpe.suggest,
                max_evals=100, trials=trials)

# fmin reports hp.choice parameters (n_estimators, max_depth) as the index of
# the selected option rather than the option itself. Map the result back
# through the search space so best_params holds values that can be passed
# straight to XGBRegressor.
best_params = space_eval(space, best_raw)

# Print best hyperparameters found
print("Best hyperparameters:", best_params)


100%|██████████| 100/100 [1:01:47<00:00, 37.07s/trial, best loss: 2.7162468485240243]
Best hyperparameters: {'colsample_bylevel': 0.7829573243380741, 'colsample_bynode': 0.8409512818863321, 'colsample_bytree': 0.9380717847017378, 'gamma': 1.6608611376082092, 'learning_rate': 0.0018821333068856992, 'max_depth': 9, 'min_child_weight': 17.888457276697828, 'n_estimators': 5, 'reg_alpha': 0.010806799409259934, 'reg_lambda': 4.552090852516414}


In [None]:
# Columns removed from the test frame. This is the train/validation drop list
# without 'position' and 'positionText_y'.
columns_to_drop_test = [
    'fp1_date','number','driverRef','driver_num','driver_code','forename','surname',
    'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
    'quali_date', 'quali_time', 'sprint_date', 'sprint_time', 'url_x', 'url_y',
    'url', 'positionText_x', 'position_x', 'grand_prix',
    'status', 'nationality_y', 'constructorRef', 'company', 'dob', 'nationality',
    'result_driver_standing', 'resultId', 'raceId_y', 'timetaken_in_millisec',
    'fastestLapTime', 'time_x', 'fastestLap', 'max_speed', 'date', 'time_y'
]

# Identifier column written to the submission file next to the predictions.
ID = p['result_driver_standing']

# Drop into a new frame instead of mutating `p`: `p` is loaded in another cell,
# so an in-place drop would make this cell raise KeyError when it is re-run.
p_features = p.drop(columns=columns_to_drop_test)

# Give 'rank' the same treatment as the training frame ('\N' and other
# unparseable values -> missing, nullable integer dtype) before it goes
# through the preprocessor fitted on the training data.
p_features['rank'] = pd.to_numeric(
    p_features['rank'].replace('\\N', np.nan), errors='coerce'
).astype('Int64')

final = preprocessor.transform(p_features)

In [None]:
# Final hyperparameters, hard-coded from the recorded search output.
# That output lists max_depth=9 and n_estimators=5; the values used here are
# the options at those positions in the search space:
# range(2, 13)[9] == 11 and [500, 550, 600, 650, 700, 750, 800][5] == 750.
best_params = {
    'colsample_bylevel': 0.7829573243380741,
    'colsample_bynode': 0.8409512818863321,
    'colsample_bytree': 0.9380717847017378,
    'gamma': 1.6608611376082092,
    'learning_rate': 0.0018821333068856992,
    'max_depth': 11,
    'min_child_weight': 17.888457276697828,
    'n_estimators': 750,
    'reg_alpha': 0.010806799409259934,
    'reg_lambda': 4.552090852516414,
}

model = XGBRegressor(
    device='cuda',
    random_state=42,
    **best_params,
    objective='reg:squarederror',
    n_jobs=-1,
)

# Train on the preprocessed training matrix, then predict the test rows.
model.fit(X_pre, t1.values.ravel())
test_pred = model.predict(final)

In [None]:
# Pair each predicted position with its identifier from the test set.
submission_columns = {
    'position' : test_pred,
    'result_driver_standing': ID,
}
predictions_df = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index.
predictions_df.to_csv("submission_1.csv", index=False)