In [64]:
import pandas as pd

In [65]:
Constructor_Perfomance = pd.read_csv('Constructor_Performance.csv')
Constructor_Rankings = pd.read_csv('Constructor_Rankings.csv')
Driver_Details = pd.read_csv('Driver_Details.csv')
Driver_Rankings = pd.read_csv('Driver_Rankings.csv')
Lap_Timing = pd.read_csv('Lap_Timings.csv')
Pit_Stops_Records = pd.read_csv('Pit_Stop_Records.csv')
Qualifying_Results = pd.read_csv('Qualifying_Results.csv')
Race_Result = pd.read_csv('Race_Results.csv')
Race_Schedule = pd.read_csv('Race_Schedule.csv')
Race_Status = pd.read_csv('Race_Status.csv')
Sprint_Race_Result = pd.read_csv('Sprint_Race_Results.csv')
Team_Details = pd.read_csv('Team_Details.csv')
Track_Information = pd.read_csv('Track_Information.csv')

In [66]:
# Gabung Race_Result dengan Race_Schedule biar dapat kolom 'year'
race_with_year = Race_Result.merge(Race_Schedule[['raceId', 'year']], on='raceId')

# Filter tahun 2024
race_2024 = race_with_year[race_with_year['year'] == 2024]

# Ambil constructorId unik
constructor_2024 = race_2024['constructorId'].unique()

# Ambil nama tim dari Team_Details berdasarkan constructorId
team_2024 = Team_Details[Team_Details['constructorId'].isin(constructor_2024)][['constructorId', 'name']].drop_duplicates().sort_values('constructorId').reset_index(drop=True)

team_2024


Unnamed: 0,constructorId,name
0,1,McLaren
1,3,Williams
2,6,Ferrari
3,9,Red Bull
4,15,Sauber
5,117,Aston Martin
6,131,Mercedes
7,210,Haas F1 Team
8,214,Alpine F1 Team
9,215,RB F1 Team


# Filtering + Preprocessing

In [67]:
dataframes = {
  "Constructor_Perfomance": Constructor_Perfomance,
  "Constructor_Rankings": Constructor_Rankings,
  "Driver_Details": Driver_Details,
  "Driver_Rankings": Driver_Rankings,
  "Lap_Timing": Lap_Timing,
  "Pit_Stops_Records": Pit_Stops_Records,
  "Qualifying_Results": Qualifying_Results,
  "Race_Result": Race_Result,
  "Race_Schedule": Race_Schedule,
  "Race_Status": Race_Status,
  "Sprint_Race_Result": Sprint_Race_Result,
  "Team_Details": Team_Details,
  "Track_Information": Track_Information
}

for name, df in dataframes.items():
  print(f"{name} columns:")
  print(df.columns.tolist())
  print("-" * 50)

Constructor_Perfomance columns:
['constructorResultsId', 'raceId', 'constructorId', 'points', 'status']
--------------------------------------------------
Constructor_Rankings columns:
['constructorStandingsId', 'raceId', 'constructorId', 'points', 'position', 'positionText', 'wins']
--------------------------------------------------
Driver_Details columns:
['driverId', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob', 'nationality', 'url']
--------------------------------------------------
Driver_Rankings columns:
['driverStandingsId', 'raceId', 'driverId', 'points', 'position', 'positionText', 'wins']
--------------------------------------------------
Lap_Timing columns:
['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds']
--------------------------------------------------
Pit_Stops_Records columns:
['raceId', 'driverId', 'stop', 'lap', 'time', 'duration', 'milliseconds']
--------------------------------------------------
Qualifying_Results columns:
['qualifyI

Race_Schedule filtered

In [68]:
Race_Schedule.head()

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N


In [69]:
Race_Schedule_Filtered = Race_Schedule.copy()
Race_Schedule_Filtered.columns

Index(['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'url',
       'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
       'quali_date', 'quali_time', 'sprint_date', 'sprint_time'],
      dtype='object')

In [70]:
cutoff_index1 = Race_Schedule_Filtered.columns.get_loc('date')
Race_Schedule_Filtered = Race_Schedule_Filtered.iloc[:, :cutoff_index1]
Race_Schedule_Filtered.head()

Unnamed: 0,raceId,year,round,circuitId,name
0,1,2009,1,1,Australian Grand Prix
1,2,2009,2,2,Malaysian Grand Prix
2,3,2009,3,17,Chinese Grand Prix
3,4,2009,4,3,Bahrain Grand Prix
4,5,2009,5,4,Spanish Grand Prix


Constructor Rankings Filtered

In [71]:
Constructor_Rankings_Filtered = Constructor_Rankings.drop(columns='positionText', axis = 1)
Constructor_Rankings_Filtered.head()

Unnamed: 0,constructorStandingsId,raceId,constructorId,points,position,wins
0,1,18,1,14.0,1,1
1,2,18,2,8.0,3,0
2,3,18,3,9.0,2,0
3,4,18,4,5.0,4,0
4,5,18,5,2.0,5,0


Driver Details Filtered

In [72]:
Driver_Details_Filtered = Driver_Details.copy()
cutoff_index2 = Driver_Details_Filtered.columns.get_loc('number')
Driver_Details_Filtered = Driver_Details_Filtered.iloc[:, :cutoff_index2]
Driver_Details_Filtered.head()

Unnamed: 0,driverId,driverRef
0,1,hamilton
1,2,heidfeld
2,3,rosberg
3,4,alonso
4,5,kovalainen


Pit Stop Records Filtered

In [73]:
Pit_Stops_Records_Filtered = Pit_Stops_Records.copy()
Pit_Stops_Records_Filtered = Pit_Stops_Records_Filtered.drop(columns='duration', axis = 1)
Pit_Stops_Records_Filtered.head()

Unnamed: 0,raceId,driverId,stop,lap,time,milliseconds
0,841,153,1,1,17:05:23,26898
1,841,30,1,1,17:05:52,25021
2,841,17,1,11,17:20:48,23426
3,841,4,1,12,17:22:34,23251
4,841,13,1,13,17:24:10,23842


Lap Timing Filtered

In [74]:
Lap_Timing_Filterd = Lap_Timing.copy()
Lap_Timing_Filterd = Lap_Timing_Filterd.drop(columns='time', axis = 1)
Lap_Timing_Filterd.head()

Unnamed: 0,raceId,driverId,lap,position,milliseconds
0,841,20,1,1,98109
1,841,20,2,1,93006
2,841,20,3,1,92713
3,841,20,4,1,92803
4,841,20,5,1,92342


In [75]:
Lap_Timing_Filterd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575029 entries, 0 to 575028
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   raceId        575029 non-null  int64
 1   driverId      575029 non-null  int64
 2   lap           575029 non-null  int64
 3   position      575029 non-null  int64
 4   milliseconds  575029 non-null  int64
dtypes: int64(5)
memory usage: 21.9 MB


Qualifying Result Filtered

In [76]:
Qualifying_Results_Filtered = Qualifying_Results.copy()
def time_to_milliseconds(time_str):
  try:
    if pd.isna(time_str):
        return None
    m, s = time_str.split(':')
    return int(m) * 60 * 1000 + float(s) * 1000
  except:
    return None

In [77]:
Qualifying_Results_Filtered['q1'] = Qualifying_Results['q1'].apply(time_to_milliseconds)
Qualifying_Results_Filtered['q2'] = Qualifying_Results['q2'].apply(time_to_milliseconds)
Qualifying_Results_Filtered['q3'] = Qualifying_Results['q3'].apply(time_to_milliseconds)
Qualifying_Results_Filtered.head()

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
0,1,18,1,1,22,1,86572.0,85187.0,86714.0
1,2,18,9,2,4,2,86103.0,85315.0,86869.0
2,3,18,5,1,23,3,85664.0,85452.0,87079.0
3,4,18,13,6,2,4,85994.0,85691.0,87178.0
4,5,18,2,2,3,5,85960.0,85518.0,87236.0


Race Result Filtered

In [78]:
Race_Result_Filtered = Race_Result.copy()
Race_Result_Filtered = Race_Result_Filtered.drop(columns='positionText', axis = 1)
Race_Result_Filtered = Race_Result_Filtered.drop(columns='laps', axis  = 1)
Race_Result_Filtered = Race_Result_Filtered.drop(columns='time', axis  = 1)
Race_Result_Filtered = Race_Result_Filtered.drop(columns='fastestLap', axis  = 1)
Race_Result_Filtered = Race_Result_Filtered.drop(columns='fastestLapSpeed', axis  = 1)
Race_Result_Filtered = Race_Result_Filtered.drop(columns='positionOrder', axis  = 1)

In [79]:
Race_Result_Filtered['fastestLapTime'] = Race_Result['fastestLapTime'].apply(time_to_milliseconds)
Race_Result_Filtered.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,points,milliseconds,rank,fastestLapTime,statusId
0,1,18,1,1,22,1,1,10.0,5690616,2,87452.0,1
1,2,18,2,2,3,5,2,8.0,5696094,3,87739.0,1
2,3,18,3,3,7,7,3,6.0,5698779,5,88090.0,1
3,4,18,4,4,5,11,4,5.0,5707797,7,88603.0,1
4,5,18,5,1,23,3,5,4.0,5708630,1,87418.0,1


Race Status Filtered

In [80]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

Race_Status_Filtered = Race_Status.copy()
Race_Status_Filtered['status'] = le.fit_transform(Race_Status_Filtered['status'])
Race_Status_Filtered.head()

Unnamed: 0,statusId,status
0,1,69
1,2,53
2,3,34
3,4,44
4,5,62


Sprint Race Filtered

In [81]:
Sprint_Race_Result_Filtered = Sprint_Race_Result.copy()
Sprint_Race_Result_Filtered = Sprint_Race_Result_Filtered.drop(columns='positionText', axis = 1)
Sprint_Race_Result_Filtered = Sprint_Race_Result_Filtered.drop(columns='positionOrder', axis = 1)
Sprint_Race_Result_Filtered = Sprint_Race_Result_Filtered.drop(columns='laps', axis = 1)
Sprint_Race_Result_Filtered = Sprint_Race_Result_Filtered.drop(columns='time', axis = 1)
Sprint_Race_Result_Filtered = Sprint_Race_Result_Filtered.drop(columns='fastestLap', axis = 1)

In [82]:
Sprint_Race_Result_Filtered['fastestLapTime'] = Sprint_Race_Result_Filtered['fastestLapTime'].apply(time_to_milliseconds)
Sprint_Race_Result_Filtered.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,points,milliseconds,fastestLapTime,statusId
0,1,1061,830,9,33,2,1,3,1538426,90013.0,1
1,2,1061,1,131,44,1,2,2,1539856,89937.0,1
2,3,1061,822,131,77,3,3,1,1545928,89958.0,1
3,4,1061,844,6,16,4,4,0,1549704,90163.0,1
4,5,1061,846,1,4,6,5,0,1562537,90566.0,1


In [83]:
Team_Details_Filtered = Team_Details.copy()
Team_Details_Filtered = Team_Details_Filtered.drop(columns='constructorRef', axis = 1)
Team_Details_Filtered = Team_Details_Filtered.drop(columns='nationality', axis = 1)
Team_Details_Filtered = Team_Details_Filtered.drop(columns='url', axis = 1)
Team_Details_Filtered.head()

Unnamed: 0,constructorId,name
0,1,McLaren
1,2,BMW Sauber
2,3,Williams
3,4,Renault
4,5,Toro Rosso


# WDC

Merging Race_Schedule & Race_Result

In [84]:
Race_Schedule_Filtered = Race_Schedule_Filtered[Race_Schedule_Filtered['year'] >= 2022]
Race_Schedule_Filtered

Unnamed: 0,raceId,year,round,circuitId,name
1036,1074,2022,1,3,Bahrain Grand Prix
1058,1075,2022,2,77,Saudi Arabian Grand Prix
1059,1076,2022,3,1,Australian Grand Prix
1060,1077,2022,4,21,Emilia Romagna Grand Prix
1061,1078,2022,5,79,Miami Grand Prix
...,...,...,...,...,...
1120,1140,2024,20,32,Mexico City Grand Prix
1121,1141,2024,21,18,São Paulo Grand Prix
1122,1142,2024,22,80,Las Vegas Grand Prix
1123,1143,2024,23,78,Qatar Grand Prix


In [85]:
valid_race_ids = Race_Schedule_Filtered['raceId'].unique()

Race_Result_Filtered = Race_Result_Filtered[Race_Result_Filtered['raceId'].isin(valid_race_ids)]
Race_Result_Filtered = Race_Result_Filtered.merge(
  Race_Schedule_Filtered[['raceId', 'year']], on='raceId', how='left'
)
Race_Result_Filtered

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,points,milliseconds,rank,fastestLapTime,statusId,year
0,25406,1074,844,6,16,1,1,26.0,5853584,1,94570.0,1,2022
1,25407,1074,832,6,55,3,2,18.0,5859182,3,95740.0,1,2022
2,25408,1074,1,131,44,5,3,15.0,5863259,5,96228.0,1,2022
3,25409,1074,847,131,63,9,4,12.0,5864795,6,96302.0,1,2022
4,25410,1074,825,210,20,7,5,10.0,5868338,8,96623.0,1,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,26520,1132,839,214,31,18,16,0.0,\N,16,90875.0,12,2024
1115,26521,1132,815,9,11,0,17,0.0,\N,6,89707.0,12,2024
1116,26522,1132,855,15,24,14,18,0.0,\N,17,91014.0,12,2024
1117,26523,1132,847,131,63,1,\N,0.0,\N,19,91298.0,34,2024


In [86]:
Race_Result_Filtered['number'] = pd.to_numeric(Race_Result_Filtered['number'])
Race_Result_Filtered['position'] = pd.to_numeric(Race_Result_Filtered['position'], errors='coerce')
Race_Result_Filtered['milliseconds'] = pd.to_numeric(Race_Result_Filtered['milliseconds'], errors='coerce')
Race_Result_Filtered['rank'] = pd.to_numeric(Race_Result_Filtered['rank'], errors='coerce')
Race_Result_Filtered.dtypes

resultId            int64
raceId              int64
driverId            int64
constructorId       int64
number              int64
grid                int64
position          float64
points            float64
milliseconds      float64
rank                int64
fastestLapTime    float64
statusId            int64
year                int64
dtype: object

In [87]:
race_agg = Race_Result_Filtered.groupby(['driverId', 'year']).agg(
  total_points = ('points', 'sum'),
  avg_position = ('position', 'mean'),
  wins = ('position', lambda x: (x==1).sum()),
  podiums = ('position', lambda x: ((x<=3) & (x>0)).sum()),
  races_finished = ('position', lambda x: (x>0).sum())
).reset_index()
race_agg

Unnamed: 0,driverId,year,total_points,avg_position,wins,podiums,races_finished
0,1,2022,233.0,5.380952,0,9,21
1,1,2023,217.0,4.900000,0,6,20
2,1,2024,100.0,5.909091,1,2,11
3,4,2022,81.0,8.647059,0,0,17
4,4,2023,198.0,5.600000,0,8,20
...,...,...,...,...,...,...,...
60,857,2024,112.0,5.750000,0,2,12
61,858,2023,1.0,15.000000,0,0,17
62,858,2024,0.0,16.666667,0,0,9
63,859,2023,2.0,12.200000,0,0,5


merging Sprint_Race

In [88]:
Sprint_Race_Result_Filtered = Sprint_Race_Result_Filtered[Sprint_Race_Result_Filtered['raceId'].isin(valid_race_ids)]
Sprint_Race_Result_Filtered = Sprint_Race_Result_Filtered.merge(
  Race_Schedule_Filtered[['raceId', 'year']], on='raceId', how='left'
)
Sprint_Race_Result_Filtered

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,points,milliseconds,fastestLapTime,statusId,year
0,61,1077,830,9,1,1,1,8,1839567,79154.0,1,2022
1,62,1077,844,6,16,2,2,7,1842542,79044.0,1,2022
2,63,1077,815,9,11,7,3,6,1844288,79012.0,1,2022
3,64,1077,832,6,55,10,4,5,1857145,79251.0,1,2022
4,65,1077,846,1,4,3,5,4,1864128,80030.0,1,2022
...,...,...,...,...,...,...,...,...,...,...,...,...
235,296,1131,858,3,2,15,16,0,1645465,70488.0,1,2024
236,297,1131,848,3,23,0,17,0,1646062,70562.0,1,2024
237,298,1131,822,15,77,18,18,0,1647900,70590.0,1,2024
238,299,1131,807,210,27,17,19,0,1649812,70512.0,1,2024


In [89]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
Sprint_Race_Result_Filtered['position'] = pd.to_numeric(Sprint_Race_Result_Filtered['position'], errors='coerce')
Sprint_Race_Result_Filtered['milliseconds'] = pd.to_numeric(Sprint_Race_Result_Filtered['milliseconds'], errors='coerce')
Sprint_Race_Result_Filtered.isnull().sum()

resultId           0
raceId             0
driverId           0
constructorId      0
number             0
grid               0
position          13
points             0
milliseconds      17
fastestLapTime     8
statusId           0
year               0
dtype: int64

In [90]:
Sprint_Race_Result_Filtered[Sprint_Race_Result_Filtered[['position', 'points', 'milliseconds']].isnull().any(axis=1)]

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,points,milliseconds,fastestLapTime,statusId,year
19,80,1077,855,51,24,0,,0,,,31,2022
38,99,1084,20,117,5,20,19.0,0,,70317.0,130,2022
39,100,1084,4,214,14,8,,0,,,10,2022
59,120,1095,848,3,23,11,,0,,75998.0,31,2022
78,139,1101,852,213,22,16,,0,,,130,2023
79,140,1101,858,3,2,0,,0,,,3,2023
118,179,1110,815,9,11,8,,0,,122516.0,130,2023
119,180,1110,4,117,14,15,,0,,,3,2023
135,196,1115,807,210,27,7,,0,,89307.0,31,2023
136,197,1115,839,214,31,10,,0,,89388.0,31,2023


In [91]:
Sprint_Race_Result_Filtered['position'].fillna(0, inplace=True)
Sprint_Race_Result_Filtered['points'].fillna(0, inplace=True)
Sprint_Race_Result_Filtered['milliseconds'].fillna(0, inplace=True)
Sprint_Race_Result_Filtered['milliseconds'] = Sprint_Race_Result_Filtered['milliseconds'].astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Sprint_Race_Result_Filtered['position'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Sprint_Race_Result_Filtered['points'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate objec

In [92]:
sprint_agg = Sprint_Race_Result_Filtered.groupby(['driverId', 'year']).agg(
  sprint_points = ('points', 'sum'),
  sprint_wins = ('position', lambda x: (x==1).sum()),
  sprint_avg_position = ('position', 'mean')
).reset_index()
sprint_agg

Unnamed: 0,driverId,year,sprint_points,sprint_wins,sprint_avg_position
0,1,2022,7,0,8.333333
1,1,2023,17,0,6.333333
2,1,2024,10,0,8.000000
3,4,2022,0,0,9.000000
4,4,2023,8,0,7.166667
...,...,...,...,...,...
57,857,2023,15,1,7.333333
58,857,2024,12,0,5.000000
59,858,2023,0,0,12.166667
60,858,2024,0,0,14.666667


In [93]:
sprint_agg.isnull().sum()

driverId               0
year                   0
sprint_points          0
sprint_wins            0
sprint_avg_position    0
dtype: int64

merging race result and sprint race 

In [94]:
full_agg = race_agg.merge(sprint_agg, on=['driverId', 'year'], how='left')
full_agg.fillna({'sprint_points': 0, 'sprint_wins': 0, 'sprint_avg_position': full_agg['sprint_avg_position'].mean()}, inplace=True)
full_agg

Unnamed: 0,driverId,year,total_points,avg_position,wins,podiums,races_finished,sprint_points,sprint_wins,sprint_avg_position
0,1,2022,233.0,5.380952,0,9,21,7.0,0.0,8.333333
1,1,2023,217.0,4.900000,0,6,20,17.0,0.0,6.333333
2,1,2024,100.0,5.909091,1,2,11,10.0,0.0,8.000000
3,4,2022,81.0,8.647059,0,0,17,0.0,0.0,9.000000
4,4,2023,198.0,5.600000,0,8,20,8.0,0.0,7.166667
...,...,...,...,...,...,...,...,...,...,...
60,857,2024,112.0,5.750000,0,2,12,12.0,0.0,5.000000
61,858,2023,1.0,15.000000,0,0,17,0.0,0.0,12.166667
62,858,2024,0.0,16.666667,0,0,9,0.0,0.0,14.666667
63,859,2023,2.0,12.200000,0,0,5,0.0,0.0,0.000000


making label

In [95]:
driver_ranks_merged = Driver_Rankings.merge(Race_Schedule_Filtered[['raceId', 'year']], on='raceId', how='left')

last_available_races = driver_ranks_merged.groupby('year')['raceId'].max().reset_index()

# Ambil ranking dari race-race tersebut
final_driver_ranking = Driver_Rankings[Driver_Rankings['raceId'].isin(last_available_races['raceId'])]
final_driver_ranking = final_driver_ranking.merge(Race_Schedule_Filtered[['raceId', 'year']], on='raceId', how='left')
final_driver_ranking = final_driver_ranking[['driverId', 'year', 'position']]
final_driver_ranking['is_top5'] = final_driver_ranking['position'].apply(lambda x: 1 if x <= 5 else 0)
final_driver_ranking

Unnamed: 0,driverId,year,position,is_top5
0,844,2022,2,1
1,832,2022,5,1
2,1,2022,6,0
3,847,2022,4,1
4,825,2022,13,0
...,...,...,...,...
60,839,2024,18,0
61,842,2024,15,0
62,822,2024,21,0
63,858,2024,20,0


In [96]:
Race_Schedule_Filtered['year'].value_counts().sort_index()

year
2022    22
2023    22
2024    24
Name: count, dtype: int64

Final Dataset

In [97]:
WDC_dataset = full_agg.merge(final_driver_ranking[['driverId', 'year', 'is_top5']], on=['driverId', 'year'], how='inner')
WDC_dataset.head()

Unnamed: 0,driverId,year,total_points,avg_position,wins,podiums,races_finished,sprint_points,sprint_wins,sprint_avg_position,is_top5
0,1,2022,233.0,5.380952,0,9,21,7.0,0.0,8.333333,0
1,1,2023,217.0,4.9,0,6,20,17.0,0.0,6.333333,1
2,1,2024,100.0,5.909091,1,2,11,10.0,0.0,8.0,0
3,4,2022,81.0,8.647059,0,0,17,0.0,0.0,9.0,0
4,4,2023,198.0,5.6,0,8,20,8.0,0.0,7.166667,1


In [98]:
WDC_dataset.columns

Index(['driverId', 'year', 'total_points', 'avg_position', 'wins', 'podiums',
       'races_finished', 'sprint_points', 'sprint_wins', 'sprint_avg_position',
       'is_top5'],
      dtype='object')

declaring target for spliting and testing

In [99]:
X = WDC_dataset.drop(columns=['driverId', 'year', 'is_top5'])
y = WDC_dataset['is_top5']

In [100]:
from sklearn.model_selection import train_test_split

# Misalnya kamu punya X dan y udah siap
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

RANDOM FOREST Implementation

In [101]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
all_y_true = []
all_y_pred = []

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # Terapkan SMOTE hanya di train set
  smote = SMOTE(random_state=42)
  X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

  # Inisiasi dan train model
  rf_model = RandomForestClassifier(random_state=42)
  rf_model.fit(X_train_res, y_train_res)

  # Predict test set
  y_pred = rf_model.predict(X_test)

  accuracies.append(accuracy_score(y_test, y_pred))
  all_y_true.extend(y_test)
  all_y_pred.extend(y_pred)

print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(classification_report(all_y_true, all_y_pred))

Average Accuracy: 0.9692
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        50
           1       0.93      0.93      0.93        15

    accuracy                           0.97        65
   macro avg       0.96      0.96      0.96        65
weighted avg       0.97      0.97      0.97        65



In [104]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_proba = rf_model.predict_proba(X_test)[:, 1]

mae = mean_absolute_error(y_test, y_proba)
mse = mean_squared_error(y_test, y_proba)
r2 = r2_score(y_test, y_proba)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R² Score: {r2}")

MAE: 0.07769230769230771
MSE: 0.02636153846153846
R² Score: 0.8514966666666667


XGBoost Implementation

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [106]:
import xgboost as xgb

model = xgb.XGBClassifier(
  objective='binary:logistic',
  eval_metric='logloss',
  use_label_encoder=False,
  random_state=42
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [107]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Optional, untuk metrics regression-like dari probabilitas
print("MAE:", mean_absolute_error(y_test, y_pred_proba))
print("MSE:", mean_squared_error(y_test, y_pred_proba))
print("R2 Score:", r2_score(y_test, y_pred_proba))

Accuracy: 0.8461538461538461
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        12
           1       0.33      1.00      0.50         1

    accuracy                           0.85        13
   macro avg       0.67      0.92      0.70        13
weighted avg       0.95      0.85      0.88        13

MAE: 0.16777919748654732
MSE: 0.13460940626051013
R2 Score: -0.895749568939209


bad r2 score so we're going to tune it

In [108]:
from sklearn.model_selection import GridSearchCV

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_grid = {
  'n_estimators': [50, 100, 150],
  'max_depth': [3, 5, 7],
  'learning_rate': [0.01, 0.1, 0.3],
  'subsample': [0.8, 1],
  'colsample_bytree': [0.8, 1]
}

In [109]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
  estimator=xgb_model,
  param_grid=param_grid,
  scoring='f1',  # bisa diganti 'roc_auc', 'accuracy', dll
  cv=cv,
  verbose=1,
  n_jobs=-1
)

grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best F1 Score: 1.0


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [110]:
from sklearn.model_selection import cross_val_predict

best_model = grid_search.best_estimator_

y_pred = cross_val_predict(best_model, X, y, cv=cv)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
print(classification_report(y, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred_proba))
print("MSE:", mean_squared_error(y_test, y_pred_proba))
print("R2 Score:", r2_score(y_test, y_pred_proba))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.96      0.98      0.97        50
           1       0.93      0.87      0.90        15

    accuracy                           0.95        65
   macro avg       0.94      0.92      0.93        65
weighted avg       0.95      0.95      0.95        65


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



MAE: 0.227831214093245
MSE: 0.08714137108601726
R2 Score: -0.2272411584854126


LSTM Implementation

In [111]:
# Tentukan panjang sequence, misalnya 3 tahun terakhir
sequence_length = 3

# Urutkan berdasarkan driverId dan year
data_sorted = WDC_dataset.sort_values(['driverId', 'year'])

# Ambil fitur yang mau dipakai ke model (tanpa target & ID)
feature_cols = [
  'total_points', 'avg_position', 'wins', 'podiums',
  'races_finished', 'sprint_points', 'sprint_wins', 'sprint_avg_position'
]

# Siapkan X (sequences) dan y (label)
X_sequences = []
y_labels = []

for driver_id, group in data_sorted.groupby('driverId'):
  group = group.reset_index(drop=True)
  
  if len(group) >= sequence_length:
    for i in range(len(group) - sequence_length + 1):
      # Ambil sequence fitur
      seq = group.loc[i:i+sequence_length-1, feature_cols].values
      # Ambil label dari baris terakhir dalam sequence
      label = group.loc[i+sequence_length-1, 'is_top5']
      
      X_sequences.append(seq)
      y_labels.append(label)

X = np.array(X_sequences)
y = np.array(y_labels)

print("X shape:", X.shape)  # Harusnya (num_samples, sequence_length, num_features)
print("y shape:", y.shape)


X shape: (18, 3, 8)
y shape: (18,)


In [112]:
from imblearn.over_sampling import RandomOverSampler

X_flat = X.reshape((X.shape[0], -1))  # reshape dulu karena oversampler butuh 2D
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_flat, y)

# reshape balik
X_resampled = X_resampled.reshape((-1, 3, 8))

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [114]:
from sklearn.utils import class_weight

weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(zip(np.unique(y_train), weights))

In [115]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential([
  LSTM(64, input_shape=(X.shape[1], X.shape[2]), return_sequences=False),
  Dense(32, activation='relu'),
  Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=50, batch_size=8, validation_split=0.2, class_weight=class_weights)

  super().__init__(**kwargs)


Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 320ms/step - accuracy: 0.2652 - loss: 0.6995 - val_accuracy: 0.0000e+00 - val_loss: 0.7920
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.2652 - loss: 0.6472 - val_accuracy: 0.3333 - val_loss: 0.6936
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.4886 - loss: 0.6121 - val_accuracy: 1.0000 - val_loss: 0.6097
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.5909 - loss: 0.5573 - val_accuracy: 1.0000 - val_loss: 0.5484
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.7348 - loss: 0.5426 - val_accuracy: 1.0000 - val_loss: 0.4970
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.6932 - loss: 0.5248 - val_accuracy: 1.0000 - val_loss: 0.4482
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x25b68dea390>

In [116]:
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



oke model unable to recognise the 1 variabel , gonna check somethings

In [117]:
import numpy as np
unique, counts = np.unique(y, return_counts=True)
print(dict(zip(unique, counts)))

{np.int64(0): np.int64(14), np.int64(1): np.int64(4)}


ok LSTM suck , lets make it with random forest

In [118]:
WDC_dataset['year'].value_counts()

year
2022    22
2023    22
2024    21
Name: count, dtype: int64

In [119]:
wdc_2025 = WDC_dataset[WDC_dataset['year'] == 2024].copy()
wdc_2025['year'] = 2025

In [120]:
X_2025 = wdc_2025.drop(columns=['driverId', 'year', 'is_top5'])

In [122]:
y_pred_2025 = rf_model.predict(X_2025)
wdc_2025['predicted_top5'] = y_pred_2025

In [136]:
top5_predictions = wdc_2025[wdc_2025['predicted_top5'] == 1]
top5_final = top5_predictions.sort_values(by='total_points', ascending=False).head(3)
top5_final = top5_final.merge(Driver_Details_Filtered[['driverId', 'driverRef']], on='driverId', how='left')
top5_final[['driverId', 'driverRef']]

Unnamed: 0,driverId,driverRef
0,830,max_verstappen
1,846,norris
2,844,leclerc
