**EDA Using ydata_profiling**

In [None]:
# Make the necessary imports
import pandas as pd
from ydata_profiling import ProfileReport

# Load the data
# The literal two-character string `\N` is passed as an extra NaN marker,
# so cells containing it are parsed as missing values.
df = pd.read_csv("/kaggle/input/train-dataset/train.csv",na_values = "\\N")



In [None]:
# Generate the report
# Builds a ydata_profiling report object for the full DataFrame loaded above.
profile = ProfileReport(df,title="F1 Racer Profile")

# Save the report to .html
# The file name is relative, so it lands in the notebook's current working directory.
profile.to_file("f1racer_report.html")

All the analysis and visualisations are in the f1racer_report.html file.

Handling missing values

In [None]:
# Preview the first 15 rows before any cleaning is applied
df.head(15)

In [None]:
# Count of missing values per column, used to decide the imputation strategy
print(df.isnull().sum())

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
import pandas as pd

# Impute missing values on the notebook-level DataFrame `df`.
#
# Every fill is assigned back to its column (`df[col] = df[col].fillna(...)`)
# instead of calling `df[col].fillna(..., inplace=True)`. The chained in-place
# form operates on a temporary Series: it emits a FutureWarning on pandas 2.2
# and does not modify `df` at all once Copy-on-Write is enabled.

# Drop the practice / qualifying / sprint schedule columns (identified as
# 100% missing). `errors='ignore'` keeps the cell re-runnable after the
# columns have already been removed.
cols_to_drop = ['fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Mean/Median imputation for numerical columns
df['position_x'] = df['position_x'].fillna(df['position_x'].mean())
df['timetaken_in_millisec'] = df['timetaken_in_millisec'].fillna(df['timetaken_in_millisec'].median())
df['max_speed'] = df['max_speed'].fillna(df['max_speed'].mean())

# Mode imputation for categorical columns (`mode()[0]` picks the first mode)
df['driver_code'] = df['driver_code'].fillna(df['driver_code'].mode()[0])

# Forward fill for time series columns.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`; leading NaNs
# (rows with no earlier value) are left untouched.
df['time_x'] = df['time_x'].ffill()
df['time_y'] = df['time_y'].ffill()

# Create a flag for missing values in 'rank' and then fill with the median.
# The flag is only created once: re-running the cell after 'rank' has been
# filled would otherwise overwrite it with all-False values.
if 'rank_missing' not in df.columns:
    df['rank_missing'] = df['rank'].isnull()
df['rank'] = df['rank'].fillna(df['rank'].median())

# Predictive imputation can be done using models (not shown here for simplicity)

print(df.isnull().sum())  # Check remaining missing values


In [None]:
import pandas as pd

# Assuming df is your DataFrame

# Handle `fastestLap` and `fastestLapTime` with sentinel defaults.
# Results are assigned back to the column: the chained
# `df[col].fillna(..., inplace=True)` form works on a temporary Series and
# does not update `df` under pandas Copy-on-Write.
# Example: Fill with a default value -1
df['fastestLap'] = df['fastestLap'].fillna(-1)
# NOTE(review): the placeholder assumes an HH:MM:SS time format -- confirm
# against the actual values in this column.
df['fastestLapTime'] = df['fastestLapTime'].fillna('00:00:00')

# Handle `time_y` using forward fill as an example
# (`.ffill()` replaces the deprecated `fillna(method='ffill')`)
df['time_y'] = df['time_y'].ffill()

# Handle `driver_num` using mode imputation as an example
df['driver_num'] = df['driver_num'].fillna(df['driver_num'].mode()[0])

print(df.isnull().sum())  # Check remaining missing values


In [None]:
# Handle `time_y` using a combination of forward fill and backward fill.
# Forward fill first, then backward fill covers leading NaNs that have no
# earlier value. Assigned back to the column so the result is actually stored
# (the chained `inplace=True` form is a no-op under Copy-on-Write), and
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)`.
df['time_y'] = df['time_y'].ffill().bfill()

In [None]:
# Re-check the per-column missing-value counts after the fills above
print(df.isnull().sum())

In [None]:
# Mode imputation for `number` (`mode()[0]` picks the first mode).
# Assigned back to the column rather than using the chained
# `df['number'].fillna(..., inplace=True)`, which does not update `df` under
# pandas Copy-on-Write.
df['number'] = df['number'].fillna(df['number'].mode()[0])
print(df.isnull().sum())

Handling duplicate values

In [None]:
import pandas as pd

# Assuming df is your DataFrame

# Boolean mask: True for every row that repeats an earlier row in full
duplicates = df.duplicated()

# Report how many duplicates exist, then show the offending rows themselves
duplicate_count = duplicates.sum()
print(duplicate_count)  # Count of duplicate rows
print(df.loc[duplicates])

No duplicate rows found

Removing unnecessary columns

In [None]:
import pandas as pd

# Assuming df is your DataFrame
# Columns considered unnecessary for modelling. The schedule columns at the
# top of the list were already dropped by an earlier cell, which is why the
# list is filtered against the live DataFrame before dropping.
cols_to_drop = [
    # session schedule columns
    'fp1_date', 'fp1_time',
    'fp2_date', 'fp2_time',
    'fp3_date', 'fp3_time',
    'quali_date', 'quali_time',
    'sprint_date', 'sprint_time',
    # remaining columns to remove
    'url_x', 'url_y', 'url',
    'time_y', 'driver_num', 'raceId_y',
    'positionText_x', 'positionText_y', 'driverRef',
    'fastestLap', 'fastestLapTime',
]

# Keep only the names that are still present, preserving the list order
current_columns = set(df.columns)
existing_cols_to_drop = []
for col in cols_to_drop:
    if col in current_columns:
        existing_cols_to_drop.append(col)

# Remove them from the DataFrame in place
df.drop(columns=existing_cols_to_drop, inplace=True)

# Show what is left, followed by the remaining per-column null counts
print(df.columns)
print(df.isnull().sum())


In [None]:
# Summary statistics of the remaining columns, as a baseline before outlier handling
df.describe()

Handling the outliers using IQR technique

In [None]:
# Partition the column names by dtype: int64/float64 count as numerical,
# everything else (objects, booleans, ...) is treated as categorical.
numeric_view = df.select_dtypes(include=['int64', 'float64'])
non_numeric_view = df.select_dtypes(exclude=['int64', 'float64'])

numerical_columns = list(numeric_view.columns)
categorical_columns = list(non_numeric_view.columns)

print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)

In [None]:
import pandas as pd

# Assuming df is your DataFrame with numerical columns as identified

# Columns subjected to IQR-based outlier capping. This hard-coded list
# replaces any `numerical_columns` computed earlier in the notebook.
# NOTE(review): the list contains identifier-like columns (e.g. resultId,
# driverId, circuitId) and `position`, which a later cell uses as the model
# target -- confirm that capping those is intended.
numerical_columns = [
    'resultId', 'racerId', 'driverId', 'constructorId', 'number', 'grid',
    'position_x', 'positionOrder', 'points', 'laps', 'timetaken_in_millisec',
    'rank', 'max_speed', 'statusId', 'year', 'round', 'circuitId',
    'driverStandingsId', 'points_y', 'position', 'wins',
    'result_driver_standing',
]

# Per-column quartiles and inter-quartile range
numeric_frame = df[numerical_columns]
Q1 = numeric_frame.quantile(0.25)
Q3 = numeric_frame.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Replace outliers column by column. Values below the lower fence become the
# column's 5th percentile; values above the upper fence become the 95th
# percentile (computed after the low-side replacement, as in a two-step mask).
for col in numerical_columns:
    low_fence = lower_bound[col]
    high_fence = upper_bound[col]

    low_capped = df[col].mask(df[col] < low_fence, df[col].quantile(0.05))
    df[col] = low_capped.mask(low_capped > high_fence, low_capped.quantile(0.95))

# Summary statistics after capping, to verify the effect
print(df.describe())

# Persist the cleaned data (relative path, no index column)
df.to_csv('cleaned_race_data_with_outliers_handled.csv', index=False)


In [None]:
# Collect the distinct values found in `time_x` ...
time_x_column = df['time_x']
unique_values = time_x_column.unique()

# ... and display them
print(unique_values)


In [None]:
# Read the cleaned CSV back and inspect its dtypes / non-null counts.
# NOTE(review): presumably /kaggle/working is the directory the relative
# `to_csv` call wrote to -- confirm when running outside Kaggle.
df1 = pd.read_csv("/kaggle/working/cleaned_race_data_with_outliers_handled.csv")
df1.info()

In [None]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Load the training, validation and test splits.
# NOTE(review): the training file is read from an input dataset
# (/kaggle/input/myyydata), not from the /kaggle/working file written earlier;
# presumably it is an uploaded copy of that output -- confirm they match.
train_df = pd.read_csv("/kaggle/input/myyydata/cleaned_race_data_with_outliers_handled.csv")
# NOTE(review): validation/test are loaded as-is; none of the cleaning steps
# applied to `df` earlier in this notebook are applied to them here.
val_df = pd.read_csv("/kaggle/input/mydataset/validation/validation.csv")
test_df = pd.read_csv("/kaggle/input/mydataset/test/test.csv")

In [None]:
# Re-reads the same training CSV that the previous cell already loaded
# (same path); this resets `train_df` to the on-disk contents.
train_df = pd.read_csv("/kaggle/input/myyydata/cleaned_race_data_with_outliers_handled.csv")

In [None]:
# Separate the target (`position`) from the features for both splits.
# The target Series is taken first, then the same column is removed from the
# feature frame; `drop` returns a new frame, so the source frames stay intact.
y_train = train_df['position']
X_train = train_df.drop(columns=['position'])

y_val = val_df['position']
X_val = val_df.drop(columns=['position'])


In [None]:
# Feature groups: a hand-picked set of numerical columns to scale, and every
# non-int64/float64 column of the training features to one-hot encode.
numerical_cols = [
    "points",
    "timetaken_in_millisec",
    "max_speed",
    "points_y",
    "wins",
]
non_numeric_features = X_train.select_dtypes(exclude=['int64', 'float64'])
categorical_cols = list(non_numeric_features.columns)

# One transformer per group. `handle_unknown='ignore'` lets the encoder accept
# categories at transform time that were not seen during fit.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle both into a single ColumnTransformer.
# NOTE(review): no `remainder` is passed, so columns in neither list fall
# under ColumnTransformer's default handling -- confirm that is intended.
transformer_specs = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=transformer_specs)

In [None]:
# numpy is needed for `np.sqrt` below; it is not imported anywhere else in
# the notebook, so without this line the RMSE computation raises NameError.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Assuming `preprocessor`, `X_train`, `y_train`, `X_val`, `y_val` are defined and prepared appropriately

# Define Linear Regression model
linear_reg = LinearRegression()

# Placeholders for the best Linear Regression model and its score.
# They are initialised to None and not updated within this cell.
best_linear_model = None
best_linear_score = None

print("Training Linear Regression...")

# Create a pipeline that combines the preprocessor with Linear Regression,
# so the same preprocessing fitted on the training data is applied at predict time
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', linear_reg)
])

# Fit the model
linear_pipeline.fit(X_train, y_train)

# Evaluate on the validation set: RMSE is the square root of the mean squared error
print("Evaluating Linear Regression on the validation set...")
y_val_pred_linear = linear_pipeline.predict(X_val)
rmse_val_linear = np.sqrt(mean_squared_error(y_val, y_val_pred_linear))
print(f"RMSE on validation set for Linear Regression: {rmse_val_linear}")
