In [1]:
import pandas as pd


# --- 1. Load Data ---

In [2]:
try:
    race_df = pd.read_csv('csv/f1_race_data_raw.csv')
    standings_df = pd.read_csv('csv/f1_standings_data_raw.csv')
    print("Data loaded successfully.")
    print(f"Race DF shape: {race_df.shape}")
    print(f"Standings DF shape: {standings_df.shape}")
except FileNotFoundError:
    print("Error: CSV files not found. Make sure you ran the data collection script.")
except Exception as e:
    print(f"Error loading data: {e}")

Data loaded successfully.
Race DF shape: (880, 12)
Standings DF shape: (66, 18)


In [3]:
print("\nInitial Race DF Info:")
race_df.info()
print("\nSample Race DF Head:")
print(race_df.head())
print("\nSample Standings DF Head:")
print(standings_df[['Season', 'position', 'points', 'wins', 'DriverId', 'constructorNames']].head())


Initial Race DF Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880 entries, 0 to 879
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Season           880 non-null    int64  
 1   Round            880 non-null    int64  
 2   EventName        880 non-null    object 
 3   Driver           880 non-null    object 
 4   DriverId         880 non-null    object 
 5   TeamId           880 non-null    object 
 6   GridPosition     878 non-null    float64
 7   FinishPosition   878 non-null    float64
 8   PositionsGained  874 non-null    float64
 9   PointsGained     880 non-null    float64
 10  AvgLapTimeSec    860 non-null    float64
 11  Status           880 non-null    object 
dtypes: float64(5), int64(2), object(5)
memory usage: 82.6+ KB

Sample Race DF Head:
   Season  Round           EventName Driver        DriverId        TeamId  \
0    2023      1  Bahrain Grand Prix    VER  max_verstappen      r

# --- 2. Data Cleaning & Preparation ---

In [4]:
cols_to_check = ['FinishPosition', 'GridPosition', 'PointsGained']
original_rows = len(race_df)
race_df.dropna(subset=cols_to_check, how='any', inplace=True)
print(f"Dropped {original_rows - len(race_df)} rows with NaNs in essential columns (e.g., FinishPosition).")

Dropped 2 rows with NaNs in essential columns (e.g., FinishPosition).


In [5]:
numeric_cols = ['GridPosition', 'FinishPosition', 'PositionsGained', 'PointsGained', 'AvgLapTimeSec']
for col in numeric_cols:
    race_df[col] = pd.to_numeric(race_df[col], errors='coerce')
print("Converted numeric columns, coercing errors.")

Converted numeric columns, coercing errors.


In [6]:
max_grid_per_round = race_df.groupby(['Season', 'Round'])['GridPosition'].transform('max')
race_df['PitLaneStart'] = (race_df['GridPosition'] == 0).astype(int)

race_df['GridPositionCalc'] = race_df['GridPosition']

race_df.loc[race_df['GridPosition'] == 0, 'GridPositionCalc'] = race_df.loc[race_df['GridPosition'] == 0, 'GridPosition'].index.map(lambda idx: max_grid_per_round[idx] + 1)

print("Handled GridPosition=0 (PitLaneStart), created GridPositionCalc.")

Handled GridPosition=0 (PitLaneStart), created GridPositionCalc.


In [7]:
race_df['PositionsGained'] = race_df['GridPositionCalc'] - race_df['FinishPosition']
print("Recalculated PositionsGained.")

Recalculated PositionsGained.


In [8]:
print("Imputing NaNs in AvgLapTimeSec with race median...")
race_df['AvgLapTimeSec'] = race_df.groupby(['Season', 'Round'])['AvgLapTimeSec'].transform(lambda x: x.fillna(x.median()))
global_median_laptime = race_df['AvgLapTimeSec'].median()
race_df['AvgLapTimeSec'].fillna(global_median_laptime, inplace=True)
print(f"Remaining AvgLapTimeSec NaNs filled with global median: {global_median_laptime:.3f}")

Imputing NaNs in AvgLapTimeSec with race median...
Remaining AvgLapTimeSec NaNs filled with global median: 92.340


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  race_df['AvgLapTimeSec'].fillna(global_median_laptime, inplace=True)


In [9]:
median_pos_gained = 0 # Or calculate race_df['PositionsGained'].median() if preferred
race_df['PositionsGained'].fillna(median_pos_gained, inplace=True)
print(f"Imputed PositionsGained NaNs with {median_pos_gained}.")

Imputed PositionsGained NaNs with 0.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  race_df['PositionsGained'].fillna(median_pos_gained, inplace=True)


In [10]:
median_grid_pos = race_df['GridPositionCalc'].median()
race_df['GridPositionCalc'].fillna(median_grid_pos, inplace=True)
print(f"Imputed GridPositionCalc NaNs with median: {median_grid_pos:.1f}")

Imputed GridPositionCalc NaNs with median: 10.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  race_df['GridPositionCalc'].fillna(median_grid_pos, inplace=True)


In [11]:
median_finish_pos = race_df['FinishPosition'].median()
race_df['FinishPosition'].fillna(median_finish_pos, inplace=True)
print(f"Imputed FinishPosition NaNs with median: {median_finish_pos:.1f}")

Imputed FinishPosition NaNs with median: 10.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  race_df['FinishPosition'].fillna(median_finish_pos, inplace=True)


In [12]:
finished_statuses = ['Finished', '+1 Lap', '+2 Laps', '+3 Laps', '+4 Laps', '+5 Laps', '+6 Laps'] # Add more if needed
race_df['DidFinish'] = race_df['Status'].apply(lambda x: 1 if x in finished_statuses else 0)
print("Created 'DidFinish' column.")

Created 'DidFinish' column.


In [13]:
race_df.sort_values(by=['Season', 'Round', 'DriverId'], inplace=True)
print("Sorted DataFrame by Season, Round, DriverId.")

Sorted DataFrame by Season, Round, DriverId.


# --- 3. Create Cumulative/Expanding Features ---

In [14]:
grouped = race_df.groupby(['Season', 'DriverId'])

race_df['CumulativePoints'] = grouped['PointsGained'].transform(lambda x: x.shift(1).expanding(min_periods=1).sum()).fillna(0)

race_df['AvgPoints'] = grouped['PointsGained'].transform(lambda x: x.shift(1).expanding(min_periods=1).mean()).fillna(0)

race_df['AvgFinishPos'] = grouped['FinishPosition'].transform(lambda x: x.shift(1).expanding(min_periods=1).mean()).fillna(race_df['FinishPosition'].median()) # Fill initial NaNs

race_df['AvgGridPos'] = grouped['GridPositionCalc'].transform(lambda x: x.shift(1).expanding(min_periods=1).mean()).fillna(race_df['GridPositionCalc'].median())

race_df['AvgPosGained'] = grouped['PositionsGained'].transform(lambda x: x.shift(1).expanding(min_periods=1).mean()).fillna(0)

race_df['RaceCount'] = grouped.cumcount()

race_df['FinishRate'] = grouped['DidFinish'].transform(lambda x: x.shift(1).expanding(min_periods=1).mean()).fillna(1.0) # Assume 100% initially

race_df['DNFCount'] = grouped['DidFinish'].transform(lambda x: (1 - x.shift(1)).expanding(min_periods=1).sum()).fillna(0)

race_df['StdDevFinishPos'] = grouped['FinishPosition'].transform(lambda x: x.shift(1).expanding(min_periods=2).std()).fillna(0) # Need min 2 periods for std dev

race_df['Wins'] = grouped['FinishPosition'].transform(lambda x: (x.shift(1) == 1).expanding(min_periods=1).sum()).fillna(0)
race_df['Podiums'] = grouped['FinishPosition'].transform(lambda x: (x.shift(1) <= 3).expanding(min_periods=1).sum()).fillna(0)
race_df['Top10s'] = grouped['FinishPosition'].transform(lambda x: (x.shift(1) <= 10).expanding(min_periods=1).sum()).fillna(0)

print("Calculated expanding features (cumulative points, avg finish, avg grid, finish rate, etc.).")

Calculated expanding features (cumulative points, avg finish, avg grid, finish rate, etc.).


# --- 4. Create Rolling Features (Recent Form) ---

In [15]:
window_size = 5

race_df[f'PointsLast{window_size}'] = grouped['PointsGained'].transform(lambda x: x.shift(1).rolling(window=window_size, min_periods=1).sum()).fillna(0)

race_df[f'AvgFinishLast{window_size}'] = grouped['FinishPosition'].transform(lambda x: x.shift(1).rolling(window=window_size, min_periods=1).mean()).fillna(race_df['FinishPosition'].median())

race_df[f'AvgGridLast{window_size}'] = grouped['GridPositionCalc'].transform(lambda x: x.shift(1).rolling(window=window_size, min_periods=1).mean()).fillna(race_df['GridPositionCalc'].median())

race_df[f'FinishRateLast{window_size}'] = grouped['DidFinish'].transform(lambda x: x.shift(1).rolling(window=window_size, min_periods=1).mean()).fillna(1.0)

race_df[f'DNFCountLast{window_size}'] = grouped['DidFinish'].transform(lambda x: (1 - x.shift(1)).rolling(window=window_size, min_periods=1).sum()).fillna(0)

print(f"Calculated rolling features for last {window_size} races.")

Calculated rolling features for last 5 races.


# --- 5. Add Lagged Features (Previous Season Performance) ---

In [16]:
standings_lag = standings_df[['Season', 'DriverId', 'position', 'points', 'wins']].copy()
standings_lag.rename(columns={
    'position': 'PrevSeasonRank',
    'points': 'PrevSeasonPoints',
    'wins': 'PrevSeasonWins'
}, inplace=True)

standings_lag['Season'] = standings_lag['Season'] + 1

race_df = pd.merge(race_df, standings_lag, on=['Season', 'DriverId'], how='left')

race_df['PrevSeasonRank'].fillna(25, inplace=True)
race_df['PrevSeasonPoints'].fillna(0, inplace=True)
race_df['PrevSeasonWins'].fillna(0, inplace=True)

print("Merged previous season standings data.")

Merged previous season standings data.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  race_df['PrevSeasonRank'].fillna(25, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  race_df['PrevSeasonPoints'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

# --- 6. Define the Target Variable ---

In [17]:
target_standings = standings_df[['Season', 'DriverId', 'position', 'points']].copy()
target_standings.rename(columns={
    'position': 'FinalRank',
    'points': 'FinalPoints'
}, inplace=True)

final_df = pd.merge(race_df, target_standings, on=['Season', 'DriverId'], how='left')

final_df['FinalRank'].fillna(final_df['FinalRank'].max() + 1, inplace=True)
final_df['FinalPoints'].fillna(0, inplace=True)

print("Merged final season standings as target variables (FinalRank, FinalPoints).")

Merged final season standings as target variables (FinalRank, FinalPoints).


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df['FinalRank'].fillna(final_df['FinalRank'].max() + 1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df['FinalPoints'].fillna(0, inplace=True)


# --- Final Check ---

In [18]:
feature_cols = [
    # Identifiers
    'Season', 'Round', 'EventName', 'DriverId', 'TeamId',
    # Raw Current Race Results (Careful about using these directly as features!)
    # 'GridPosition', 'FinishPosition', 'PositionsGained', 'PointsGained', 'AvgLapTimeSec', 'Status',
    # Base Calculated
    'RaceCount', 'DidFinish', 'PitLaneStart',
    # Cumulative Features
    'CumulativePoints', 'AvgPoints', 'AvgFinishPos', 'AvgGridPos', 'AvgPosGained',
    'FinishRate', 'DNFCount', 'StdDevFinishPos', 'Wins', 'Podiums', 'Top10s',
    # Rolling Features
    f'PointsLast{window_size}', f'AvgFinishLast{window_size}', f'AvgGridLast{window_size}',
    f'FinishRateLast{window_size}', f'DNFCountLast{window_size}',
    # Lagged Features
    'PrevSeasonRank', 'PrevSeasonPoints', 'PrevSeasonWins',
    # Target Variables
    'FinalRank', 'FinalPoints'
]
final_df = final_df[feature_cols]

print(f"Final DataFrame shape: {final_df.shape}")
print("\nFinal DataFrame Head:")
print(final_df.head())
print("\nFinal DataFrame Info:")
final_df.info()
print("\nChecking for remaining NaNs in final features:")
print(final_df.isnull().sum())

final_df.to_csv('csv/f1_engineered_features_2023.csv', index=False)
print("\nEngineered features saved to f1_engineered_features_2023.csv")

Final DataFrame shape: (1756, 29)

Final DataFrame Head:
   Season  Round           EventName DriverId        TeamId  RaceCount  \
0    2023      1  Bahrain Grand Prix    albon      williams          0   
1    2023      1  Bahrain Grand Prix    albon      williams          0   
2    2023      1  Bahrain Grand Prix    albon      williams          1   
3    2023      1  Bahrain Grand Prix    albon      williams          1   
4    2023      1  Bahrain Grand Prix   alonso  aston_martin          0   

   DidFinish  PitLaneStart  CumulativePoints  AvgPoints  ...  PointsLast5  \
0          1             0               0.0        0.0  ...          0.0   
1          1             0               0.0        0.0  ...          0.0   
2          1             0               1.0        1.0  ...          1.0   
3          1             0               1.0        1.0  ...          1.0   
4          1             0               0.0        0.0  ...          0.0   

   AvgFinishLast5  AvgGridLast5  Fi