## 1. Setup  
Import core libraries and configure display settings for cleaner output.

In [2]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

## 2. Load Raw Data  
Read key CSV files into pandas DataFrames. These datasets cover drivers, constructors, races, results, qualifying sessions, and driver profiles.

In [3]:
DATA_PATH = '../data/raw/'

# Load key datasets
drivers = pd.read_csv(os.path.join(DATA_PATH, 'drivers.csv'))
constructors = pd.read_csv(os.path.join(DATA_PATH, 'constructors.csv'))
races = pd.read_csv(os.path.join(DATA_PATH, 'races.csv'))
results = pd.read_csv(os.path.join(DATA_PATH, 'results.csv'))
qualifying = pd.read_csv(os.path.join(DATA_PATH, 'qualifying.csv'))
F1Drivers = pd.read_csv(os.path.join(DATA_PATH, 'F1Drivers_Dataset.csv'))

## 3. Initial Inspection  
Preview the first few rows of each dataset to understand structure and key fields.

In [4]:
# Display first few rows of each dataset for inspection
print("Drivers:")
display(drivers.head())

print("Constructors:")
display(constructors.head())

print("Races:")
display(races.head())

print("Results:")
display(results.head())

print("Qualifying:")
display(qualifying.head())

print("F1Drivers Dataset:")
display(F1Drivers.head())

Drivers:


Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen


Constructors:


Unnamed: 0,constructorId,constructorRef,name,nationality,url
0,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
1,2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
2,3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Pr...
3,4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formul...
4,5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso


Races:


Unnamed: 0,raceId,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N


Results:


Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


Qualifying:


Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236


F1Drivers Dataset:


Unnamed: 0,Driver,Nationality,Seasons,Championships,Race_Entries,Race_Starts,Pole_Positions,Race_Wins,Podiums,Fastest_Laps,Points,Active,Championship Years,Decade,Pole_Rate,Start_Rate,Win_Rate,Podium_Rate,FastLap_Rate,Points_Per_Entry,Years_Active,Champion
0,Carlo Abate,Italy,"[1962, 1963]",0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,False,,1960,0.0,0.0,0.0,0.0,0.0,0.0,2,False
1,George Abecassis,United Kingdom,"[1951, 1952]",0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,False,,1950,0.0,1.0,0.0,0.0,0.0,0.0,2,False
2,Kenny Acheson,United Kingdom,"[1983, 1985]",0.0,10.0,3.0,0.0,0.0,0.0,0.0,0.0,False,,1980,0.0,0.3,0.0,0.0,0.0,0.0,2,False
3,Andrea de Adamich,Italy,"[1968, 1970, 1971, 1972, 1973]",0.0,36.0,30.0,0.0,0.0,0.0,0.0,6.0,False,,1970,0.0,0.833333,0.0,0.0,0.0,0.166667,5,False
4,Philippe Adams,Belgium,[1994],0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,False,,1990,0.0,1.0,0.0,0.0,0.0,0.0,1,False


## 4. Missing Value Check  
Assess each dataset for missing values to identify potential cleaning steps.

In [5]:
# Check for missing values

print("Missing values:")
for df_name, df in [('drivers', drivers), ('constructors', constructors), ('races', races), ('results', results), ('qualifying', qualifying), ('F1Drivers', F1Drivers)]:
    print(f"{df_name}: {df.isnull().sum().sum()} missing values")

Missing values:
drivers: 0 missing values
constructors: 0 missing values
races: 0 missing values
results: 0 missing values
qualifying: 68 missing values
F1Drivers: 834 missing values


## 5. Duplicate Check  
Check for duplicate rows in each dataset to ensure data integrity.

In [6]:
# Check for duplicates
print("Duplicate rows:")
for df_name, df in [('drivers', drivers), ('constructors', constructors), ('races', races), ('results', results), ('qualifying', qualifying), ('F1Drivers', F1Drivers)]:
    print(f"{df_name}: {df.duplicated().sum()} duplicates")

Duplicate rows:
drivers: 0 duplicates
constructors: 0 duplicates
races: 0 duplicates
results: 0 duplicates
qualifying: 0 duplicates
F1Drivers: 0 duplicates


## 6. Key Field Review  
Compare driver and constructor IDs across datasets to identify mismatches before merging.

In [7]:
# Check ID consistency across datasets
print("Unique driver IDs in results vs drivers:")

print(set(results['driverId']) - set(drivers['driverId']))

print("Unique constructor IDs in results vs constructors:")
print(set(results['constructorId']) - set(constructors['constructorId']))

Unique driver IDs in results vs drivers:
set()
Unique constructor IDs in results vs constructors:
set()


### Next Steps
- Normalize driver names and IDs
- Merge F1Drivers with drivers.csv
- Create unified race-level dataset
- Begin feature engineering (driver experience, constructor performance, etc.)