<a href="https://colab.research.google.com/github/23i368/Shalini/blob/main/Formula1Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
from google.colab import drive

# Make the Drive folder that holds the CSV files visible to this runtime.
drive.mount('/content/drive/')

# Read one table and show its first rows as a quick check that the path resolves.
file_path = "/content/drive/My Drive/DPL_Datasets/results.csv"
df = pd.read_csv(file_path)
preview = df.head()
print(preview)



MessageError: Error: credential propagation was unsuccessful

In [None]:
# Load all CSV files
# Each table is read from the same Drive folder into its own module-level
# DataFrame; the later cells refer to these frames by name.
# NOTE(review): `status` is loaded here but is not part of the `datasets` list
# built in the preprocessing cell, so the info / missing-value / duplicate
# reports never cover it.
circuits = pd.read_csv("/content/drive/My Drive/DPL_Datasets/circuits.csv")
constructor_results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructor_results.csv")
constructor_standings = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructor_standings.csv")
constructors = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructors.csv")
driver_standings = pd.read_csv("/content/drive/My Drive/DPL_Datasets/driver_standings.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")
lap_times = pd.read_csv("/content/drive/My Drive/DPL_Datasets/lap_times.csv")
pit_stops = pd.read_csv("/content/drive/My Drive/DPL_Datasets/pit_stops.csv")
qualifying = pd.read_csv("/content/drive/My Drive/DPL_Datasets/qualifying.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
seasons = pd.read_csv("/content/drive/My Drive/DPL_Datasets/seasons.csv")
sprint_results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/sprint_results.csv")
status = pd.read_csv("/content/drive/My Drive/DPL_Datasets/status.csv")

# **Data preprocessing**

In [None]:
# Check structure and first few rows of each dataset.
# `datasets` keeps the same tables in the same order as before: later cells zip
# it against an identical list of names.
# NOTE(review): `status` is loaded above but is not included in this list.
datasets = [circuits, constructor_results, constructor_standings, constructors, driver_standings,
            drivers, lap_times, pit_stops, qualifying, races, results, seasons, sprint_results]

dataset_names = ["circuits", "constructor_results", "constructor_standings", "constructors",
                 "driver_standings", "drivers", "lap_times", "pit_stops", "qualifying",
                 "races", "results", "seasons", "sprint_results"]

for name, data in zip(dataset_names, datasets):
    print(f"\n{name}:")
    # DataFrame.info() writes its report itself and returns None. Passing it to
    # print() emitted the report *before* the table name and then printed the
    # literal "None", so it is called as a statement here.
    data.info()
    print(data.head())


In [None]:
# Report, for every table, how many null cells each column contains.
table_names = ["circuits", "constructor_results", "constructor_standings", "constructors",
               "driver_standings", "drivers", "lap_times", "pit_stops", "qualifying",
               "races", "results", "seasons", "sprint_results"]

for idx, name in enumerate(table_names):
    null_counts = datasets[idx].isnull().sum()
    print(f"\n{name} Missing Values:\n", null_counts)


In [None]:
# Null-cell count per column for each table (this cell performs the same report
# as the one directly above it).
labelled_tables = dict(zip(
    ["circuits", "constructor_results", "constructor_standings", "constructors",
     "driver_standings", "drivers", "lap_times", "pit_stops", "qualifying",
     "races", "results", "seasons", "sprint_results"],
    datasets,
))

for label, table in labelled_tables.items():
    print(f"\n{label} Missing Values:\n", table.isnull().sum())


In [None]:
#converting time columns to seconds for further processing
def convert_to_seconds(time):
    """Convert a lap-time value to a number of seconds.

    Parameters
    ----------
    time : str, int, float or None
        Either a number (already in seconds) or text made of colon-separated
        fields, e.g. "SS.sss", "M:SS.sss" or "H:MM:SS.sss".

    Returns
    -------
    float
        The duration in seconds, or NaN when the value is missing (None, empty
        text, the dataset's backslash-N null marker) or cannot be parsed.
    """
    # Numbers pass straight through. isinstance also accepts ints and numpy
    # scalars, which the previous exact `type(...) == float` check let fall
    # through to an implicit `None` return.
    if isinstance(time, (int, float, np.integer, np.floating)) and not isinstance(time, bool):
        return float(time)
    if time is None:
        return np.nan

    text = str(time).strip()
    if text in ("", "\\N"):
        return np.nan

    try:
        fields = [float(field) for field in text.split(":")]
    except ValueError:
        # Unparseable text is treated as missing rather than raising mid-apply.
        return np.nan

    # Fold the fields left to right so any number of them works:
    # "85.3" -> 85.3, "1:26.5" -> 86.5, "1:02:03.5" -> 3723.5.
    total = 0.0
    for field in fields:
        total = total * 60 + field
    return total
# Convert each qualifying session column to seconds, then replace the values
# that are still missing with that column's median.
for session in ("q1", "q2", "q3"):
    session_seconds = qualifying[session].apply(convert_to_seconds)
    qualifying[session] = session_seconds.fillna(session_seconds.median())

print(qualifying[["q1","q2", "q3"]].head())


In [None]:
# Finishing position: values that are not numbers become NaN.
results['position'] = pd.to_numeric(results['position'], errors='coerce')

# Nationality is stored as a pandas categorical in both lookup tables.
for frame in (drivers, constructors):
    frame['nationality'] = frame['nationality'].astype('category')

# Season year as integer, race date as datetime.
seasons['year'] = seasons['year'].astype(int)
races['date'] = pd.to_datetime(races['date'])


In [None]:
# Count the fully duplicated rows of every table, then print one line per table.
duplicate_counts = {
    name: frame.duplicated().sum()
    for name, frame in zip(
        ["circuits", "constructor_results", "constructor_standings", "constructors",
         "driver_standings", "drivers", "lap_times", "pit_stops", "qualifying",
         "races", "results", "seasons", "sprint_results"],
        datasets,
    )
}

for name, count in duplicate_counts.items():
    print(f"\n{name} Duplicates:", count)


In [None]:
# Build one wide `race_results` frame by left-joining every table onto `results`.
#
# Column naming: pandas appends the `suffixes` pair only to non-key columns that
# exist on BOTH sides of a given merge. The left suffix is attached to whichever
# existing column collides, whatever table it originally came from, so a suffix
# does not reliably identify the source table and the final names depend on the
# order of the merges below. Later cells read names such as `grid_finalResults`,
# `position_finalResults`, `points_finalResults` and `positionOrder_finalResults`;
# reordering or removing a merge here will rename those columns.

# First, merge results with races and circuits
race_results = results.merge(races, on="raceId", how="left", suffixes=("_results", "_races"))
race_results = race_results.merge(circuits, on="circuitId", how="left", suffixes=("_races", "_circuits"))

# Merge driver details
race_results = race_results.merge(drivers, on="driverId", how="left", suffixes=("_circuits", "_drivers"))

# Merge constructor details
race_results = race_results.merge(constructors, on="constructorId", how="left", suffixes=("_drivers", "_constructors"))

# Merge driver standings
race_results = race_results.merge(driver_standings, on=["raceId", "driverId"], how="left", suffixes=("_constructors", "_driverStandings"))

# Merge constructor standings
race_results = race_results.merge(constructor_standings, on=["raceId", "constructorId"], how="left", suffixes=("_driverStandings", "_constructorStandings"))

# Merge qualifying data
race_results = race_results.merge(qualifying, on=["raceId", "driverId", "constructorId"], how="left", suffixes=("_constructorStandings", "_qualifying"))

# Merge results again (to ensure no missing values)
# NOTE(review): `results` is already the base of this frame, so this join adds a
# second copy of its non-key columns rather than filling gaps. The
# `*_finalResults` names used by later cells appear to originate from this and
# the following merge - confirm with `race_results.columns` before removing it.
race_results = race_results.merge(results, on=["raceId", "driverId", "constructorId"], how="left", suffixes=("_qualifying", "_finalResults"))

# Merge lap times (may contain multiple entries per driver per race)
# NOTE(review): a left join against a table with several rows per
# (raceId, driverId) repeats each result row once per matching lap, so later
# sums and means over `race_results` are weighted by lap count - confirm this is
# intended.
race_results = race_results.merge(lap_times, on=["raceId", "driverId"], how="left", suffixes=("_finalResults","_lapTimes"))

# Merge pit stops (also multiple entries per driver per race)
# NOTE(review): joined on the same keys, so rows multiply again (laps x stops).
# No `suffixes` are given here, so overlapping columns get pandas' default
# `_x` / `_y`.
race_results = race_results.merge(pit_stops, on=["raceId", "driverId"], how="left")

# Merge status (race finishing status)
race_results = race_results.merge(status, left_on="statusId_finalResults", right_on="statusId", how="left")

# Drop redundant statusId column after merging
race_results.drop(columns=["statusId_finalResults", "statusId"], inplace=True)


In [None]:
# Display the column labels of the merged frame; useful for checking which
# suffixed names the merge chain above produced.
race_results.columns

# **Feature Engineering**

In [None]:
# Driver Consistency (Avg finishing position)
driver_avg_finish = (
    race_results.groupby("driverId")["position_finalResults"]
    .mean()
    .reset_index(name="avg_finishing_position")
)

# Average starting grid slot per driver.
# Fix: the rename used the key "grid_results", which is not a column of this
# frame, so it silently did nothing and `avg_qualifying_position` never existed.
# NOTE(review): this frame is computed but not merged into `race_results` below;
# merge it on "driverId" if the feature is meant to be used.
driver_avg_qualifying = (
    race_results.groupby("driverId")["grid_finalResults"]
    .mean()
    .reset_index(name="avg_qualifying_position")
)

# Team Strength (Avg constructor points)
constructor_avg_points = (
    race_results.groupby("constructorId")["points_finalResults"]
    .mean()
    .reset_index(name="avg_constructor_points")
)

# Track Complexity (Avg overtaking positions gained/lost)
# Positive grid_change = finished ahead of the starting slot.
race_results["grid_change"] = race_results["grid_finalResults"] - race_results["position_finalResults"]
track_difficulty = (
    race_results.groupby("circuitId")["grid_change"]
    .mean()
    .reset_index(name="avg_positions_gained")
)

# Merge newly calculated features into the final dataset
race_results = race_results.merge(driver_avg_finish, on="driverId", how="left")
race_results = race_results.merge(constructor_avg_points, on="constructorId", how="left")
race_results = race_results.merge(track_difficulty, on="circuitId", how="left")


In [None]:
# URL columns carry no analytical value. Names that are not present in the frame
# are skipped instead of raising.
url_columns = [
    "url_results", "url_races", "url_circuits", "url_drivers",
    "url_constructors", "url_driverStandings", "url_constructorStandings", "url_qualifying"
]
race_results = race_results.drop(columns=url_columns, errors="ignore")

# Keep only the rows that have a value in every remaining column.
race_results = race_results.dropna()

# Save the cleaned dataset
#race_results.to_csv("cleaned_f1_data.csv", index=False)


In [None]:
# Display the column labels that remain after the URL columns were dropped.
race_results.columns

In [None]:
# Per-constructor strength and "win reliability" features, merged back into
# `race_results` on constructorId.
import numpy as np
#Team Strength
constructor_avg_points = race_results.groupby("constructorId")["points_finalResults"].mean().reset_index()
# NOTE(review): the frame built on the previous line has the columns
# `constructorId` and `points_finalResults`; there is no `points_results`
# column, so this rename matches nothing and the column keeps its name.
# Consequently the merge at the bottom of this cell collides with the existing
# `points_finalResults` column and pandas' default suffixes produce
# `points_finalResults_x` (the original values) and `points_finalResults_y`
# (this per-constructor mean). Later cells read `points_finalResults_y`, so
# correcting the key here has to be done together with those cells.
constructor_avg_points.rename(columns={"points_results": "avg_constructor_points"}, inplace=True)

# Count rows of `race_results` per constructor.
# NOTE(review): the name `total_races` assumes one row per race; if the merged
# frame repeats a result (e.g. once per lap or pit stop) this is a row count,
# not a race count - confirm. No DNF filtering happens here.
constructor_races = race_results.groupby("constructorId").size().reset_index(name="total_races")

# Sum the `wins` column of constructor_standings over all of a constructor's rows.
# NOTE(review): if `wins` is a running total within a season, summing it
# over-counts; the clip to [0, 1] below would hide that - confirm.
constructor_wins = constructor_standings.groupby("constructorId")["wins"].sum().reset_index(name="wins")
# Merge race counts and wins for each constructor
constructor_reliability = constructor_races.merge(constructor_wins, on="constructorId", how="left")
# Constructors without any standings row get 0 wins (applies to every column).
constructor_reliability.fillna(0, inplace=True)
# Compute reliability score as the percentage of races won
constructor_reliability["win_reliability_score"] = np.where(
    constructor_reliability["total_races"] > 0,
    constructor_reliability["wins"] / constructor_reliability["total_races"],
    0  # Ensure constructors with no races get a 0 reliability score
)

# Cap values between 0 and 1 for consistency
constructor_reliability["win_reliability_score"] = constructor_reliability["win_reliability_score"].clip(0, 1)

# Convert to percentage for better readability
constructor_reliability["win_reliability_score"] *= 100

# Adds `total_races`, `wins` and `win_reliability_score`, plus the `_x` / `_y`
# pair described above.
race_results = race_results.merge(constructor_avg_points, on="constructorId", how="left")
race_results = race_results.merge(constructor_reliability, on="constructorId", how="left")


In [None]:
# Ensure grid and positionOrder are numeric
race_results["grid"] = pd.to_numeric(race_results["grid_finalResults"], errors="coerce")
race_results["positionOrder"] = pd.to_numeric(race_results["positionOrder_finalResults"], errors="coerce")

# Positions gained (+) or lost (-) per row.
# Fix: subtract the coerced `positionOrder` column. The raw
# `positionOrder_finalResults` column was used before, which bypassed the
# numeric coercion done on the line above.
race_results["positions_gained"] = race_results["grid"] - race_results["positionOrder"]

# Mean positions gained per circuit.
# If `race_results` already has an `avg_positions_gained` column (the feature
# engineering cell adds one), pandas keeps both as `_x` / `_y`; the correlation
# cell reads `avg_positions_gained_y`, i.e. the one merged here.
track_overtaking = (
    race_results.groupby("circuitId")["positions_gained"]
    .mean()
    .reset_index(name="avg_positions_gained")
)
race_results = race_results.merge(track_overtaking, on="circuitId", how="left")

# Spread (standard deviation) of positions gained per circuit.
track_variability = (
    race_results.groupby("circuitId")["positions_gained"]
    .std()
    .reset_index(name="position_variability")
)
race_results = race_results.merge(track_variability, on="circuitId", how="left")

# Spread of positions gained per driver, across all of that driver's rows.
driver_variability = (
    race_results.groupby("driverId")["positions_gained"]
    .std()
    .reset_index(name="driver_position_variability")
)
race_results = race_results.merge(driver_variability, on="driverId", how="left")


In [None]:
# Display the column labels after the engineered features were merged in; the
# `_x` / `_y` names created by colliding merges show up here.
race_results.columns

In [None]:
#Performance correlation
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between four engineered metrics.
metric_columns = [
    "avg_constructor_points", "positions_gained",
    "avg_positions_gained_y", "driver_position_variability"
]
correlation_matrix = race_results.loc[:, metric_columns].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 6))
sns.heatmap(data=correlation_matrix, cmap="coolwarm", annot=True, linewidths=0.5, fmt=".2f")
plt.title("Performance Metric Correlations")
plt.show()


In [None]:
#constructor trends over years
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Teams need more than this many total points to be drawn (avoids clutter).
MIN_TOTAL_POINTS = 100

# Column holding the points scored in a race.
# The team-strength cell merges a per-constructor mean under the same name, so
# pandas splits the pair into `points_finalResults_x` (race points) and
# `points_finalResults_y` (the mean). This cell used `_y`, i.e. it summed a
# constant per team instead of the points actually scored.
points_col = (
    "points_finalResults_x"
    if "points_finalResults_x" in race_results.columns
    else "points_finalResults"
)

# The merged frame repeats a race entry once per lap / pit stop; keep one row
# per entry so that points are not counted many times over.
race_entries = race_results.drop_duplicates(subset=["raceId", "driverId", "constructorId"])

# Aggregate constructor points over time (groupby leaves the rows sorted by year).
constructor_trends = race_entries.groupby(["year", "name"])[points_col].sum().reset_index()

# Apply rolling average for smoother trends (3-season window within each team)
constructor_trends["points_smooth"] = constructor_trends.groupby("name")[points_col].transform(
    lambda season_points: season_points.rolling(3, min_periods=1).mean()
)

# Identify the champion constructor for each year (highest raw points)
champions = constructor_trends.loc[constructor_trends.groupby("year")[points_col].idxmax()]

# Filter to show only top constructors (avoid clutter)
constructor_totals = constructor_trends.groupby("name")[points_col].sum()
top_constructors = constructor_totals[constructor_totals > MIN_TOTAL_POINTS].index
constructor_trends = constructor_trends[constructor_trends["name"].isin(top_constructors)]

# Plot
plt.figure(figsize=(14, 7))
sns.lineplot(x="year", y="points_smooth", hue="name", data=constructor_trends, marker="o", linewidth=2, alpha=0.85)

# Annotate the champion constructor each year.
# The label is placed at the smoothed value because that is what the lines
# show; it used to sit at the raw total, away from the plotted marker.
for _, row in champions.iterrows():
    plt.text(row["year"], row["points_smooth"] + 10, row["name"],
             fontsize=9, rotation=90, ha="right", color="black")

# Customizations
plt.title("Constructor Points Over the Years (With Champions)", fontsize=14, fontweight="bold")
plt.xlabel("Year", fontsize=12)
plt.ylabel("Total Points (Smoothed)", fontsize=12)
plt.xticks(rotation=90)  # Rotate for better readability
plt.yticks(fontsize=10)
plt.grid(True, linestyle="--", alpha=0.6)

# Improve legend placement
plt.legend(title="Constructor", bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=10, ncol=2)

# Show the plot
plt.show()



In [None]:
#driver dominance
import matplotlib.pyplot as plt
import seaborn as sns

# Race points column: the team-strength cell merges a per-constructor mean under
# the same name, so the race points end up in `points_finalResults_x` and the
# mean in `points_finalResults_y` (which this cell used to sum by mistake).
points_col = (
    "points_finalResults_x"
    if "points_finalResults_x" in race_results.columns
    else "points_finalResults"
)

# One row per race entry; the merged frame repeats entries per lap / pit stop.
race_entries = race_results.drop_duplicates(subset=["raceId", "driverId", "constructorId"])

driver_points_history = race_entries.groupby(["year", "driverId"])[points_col].sum().reset_index()
driver_points_history = driver_points_history.merge(
    drivers[["driverId", "forename", "surname"]], on="driverId", how="left"
)
# Use the full name as the series label: grouping lines by forename alone draws
# different drivers who share a first name as a single line.
driver_points_history["driver"] = driver_points_history["forename"] + " " + driver_points_history["surname"]

plt.figure(figsize=(14, 7))

# Line plot for driver performance per season
sns.lineplot(x="year", y=points_col, hue="driver", data=driver_points_history,
             marker="o", linewidth=2, alpha=0.8, palette="tab10")

plt.title("Historical Driver Dominance (Total Points Per Season)", fontsize=14, fontweight="bold")
plt.xlabel("Year", fontsize=12)
plt.ylabel("Total Points", fontsize=12)

plt.legend(title="Driver", bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=10,ncol=5)

# Highlighting season champions (highest points total in each year)
season_champions = driver_points_history.loc[driver_points_history.groupby("year")[points_col].idxmax()]
for _, row in season_champions.iterrows():
    plt.text(row["year"], row[points_col] + 5, row["driver"],
             fontsize=9, color="black", ha="center", rotation=90, fontweight="bold")

plt.grid(True, linestyle="--", alpha=0.6)
plt.show()



In [None]:
#strategic pit stops
import matplotlib.pyplot as plt
import seaborn as sns

# Race points column (`_x` holds race points when the team-strength merge split
# the name; `_y` is a per-constructor mean and was used here by mistake).
points_col = (
    "points_finalResults_x"
    if "points_finalResults_x" in race_results.columns
    else "points_finalResults"
)

# `stop` is the sequence number of a pit stop, and the merged frame repeats rows
# per lap, so a plain mean of `stop` is not a number of stops. Count the
# distinct stops of each driver in each race first.
race_entry_stops = (
    race_results.groupby(["raceId", "driverId"])
    .agg(pit_stops=("stop", "nunique"), points=(points_col, "first"))
    .reset_index()
)

# Group by race and compute average pit stops per driver
pit_stop_analysis = race_entry_stops.groupby("raceId")["pit_stops"].mean().reset_index(name="avg_pit_stops")


def _mean_points_for_race(race_id):
    """Return the mean points per driver entry for the given raceId."""
    in_race = race_entry_stops["raceId"] == race_id
    return race_entry_stops.loc[in_race, "points"].mean()


# Line Plot
plt.figure(figsize=(14, 6))
sns.lineplot(data=pit_stop_analysis, x="raceId", y="avg_pit_stops", marker="o", linestyle="-", color="b")
plt.title("Trend of Average Pit Stops per Race")
plt.xlabel("Race ID")
plt.ylabel("Average Pit Stops")
plt.grid(True)
plt.show()

# Get the race with the maximum average pit stops
max_pit_stop_race = pit_stop_analysis.loc[pit_stop_analysis["avg_pit_stops"].idxmax()]
print(max_pit_stop_race)

# Fetch the average points for that race
max_race_points_avg = _mean_points_for_race(max_pit_stop_race["raceId"])
print("Average Points for max pit stop race:", max_race_points_avg)

# Get the race with the minimum average pit stops
min_pit_stop_race = pit_stop_analysis.loc[pit_stop_analysis["avg_pit_stops"].idxmin()]
print(min_pit_stop_race)

# Fetch the average points for that race
min_race_points_avg = _mean_points_for_race(min_pit_stop_race["raceId"])
print("Average Points for min pit stop race:", min_race_points_avg)
# NOTE(review): the earlier inference ("fewer pit stops make the race outcome
# better") was drawn from the previous metric; re-check it against these
# numbers. Comparing two single races cannot establish a trend on its own.

In [None]:
# One row per driver per race: number of distinct pit stops and positions gained.
# The chart used the raw `stop` column, which is the sequence number of a stop
# (repeated for every lap row), although the axis is labelled as a total.
pit_vs_gain = (
    race_results.groupby(["raceId", "driverId"])
    .agg(total_pit_stops=("stop", "nunique"), positions_gained=("positions_gained", "first"))
    .reset_index()
)

plt.figure(figsize=(12, 6))
sns.boxplot(x="total_pit_stops", y="positions_gained", data=pit_vs_gain)
plt.xlabel("Total Pit Stops")
plt.ylabel("Positions Gained/Lost")
plt.title("Distribution of Position Changes by Pit Stops")
plt.axhline(0, color="red", linestyle="--")  # Reference line
plt.xticks(rotation=45)
plt.show()
# NOTE(review): the earlier reading ("more pit stops, larger position loss") was
# based on the stop sequence number; confirm it still holds with stop totals.

In [None]:
#Team wise pit stop efficiency
# Count the distinct stops of every race entry first; averaging the raw `stop`
# column would average stop sequence numbers over rows repeated per lap.
team_entry_stops = (
    race_results.groupby(["constructorId", "raceId", "driverId"])["stop"]
    .nunique()
    .reset_index(name="pit_stops")
)
team_pit_stops = team_entry_stops.groupby("constructorId")["pit_stops"].mean().reset_index(name="avg_pit_stops")

plt.figure(figsize=(12, 6))
sns.barplot(x="constructorId", y="avg_pit_stops", data=team_pit_stops, palette="viridis")
plt.xticks(rotation=45)
plt.title("Average Pit Stops per Team")
plt.xlabel("Constructor ID")
plt.ylabel("Average Pit Stops")
plt.show()


In [None]:
# `win_reliability_score` is merged in per constructorId, so it is constant for
# a constructor. Plot one point per constructor instead of one per merged row.
# NOTE(review): `name` is assumed to hold the constructor name - confirm against
# `race_results.columns`.
constructor_scores = race_results.drop_duplicates(subset="constructorId")[["name", "win_reliability_score"]]

plt.figure(figsize=(12, 6))

# No `palette`: there is no hue variable for it to colour by.
sns.scatterplot(x="name",
                y="win_reliability_score",
                data=constructor_scores,
                alpha=0.7,
                edgecolor="black")

# Labels now describe what is plotted: the x axis is the constructor and the
# y axis the score. The previous labels described points, which are not drawn.
plt.title("Win Reliability Score by Constructor")
plt.xlabel("Constructor")
plt.ylabel("Win Reliability Score (Races Won %)")

plt.xticks(rotation=90)
# No legend: the plot has no labelled artists, so plt.legend() only warned.
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()


In [None]:
#Team dynamics
# Race points column: `points_finalResults_y` is a per-constructor mean created
# by the team-strength merge, which made every driver of a team identical here.
# The race points are in `points_finalResults_x`.
points_col = (
    "points_finalResults_x"
    if "points_finalResults_x" in race_results.columns
    else "points_finalResults"
)

# One row per race entry, so that entries with more laps / stops do not weigh more.
race_entries = race_results.drop_duplicates(subset=["raceId", "driverId", "constructorId"])

# Mean points per race for every driver within each team.
team_performance = race_entries.groupby(["constructorId", "driverId"])[points_col].mean().reset_index()

plt.figure(figsize=(12, 6))
sns.boxplot(x="constructorId", y=points_col, data=team_performance)
plt.title("Points Distribution Within Teams (Teammates Comparison)")
plt.xlabel("Constructor ID")
plt.ylabel("Average Points")
plt.xticks(rotation=90)  # was set to 45 and then overridden with 90
plt.show()


In [None]:
#Driver adaptability
# `driver_position_variability` is one value per driver (std of positions gained
# over all of the driver's rows). Keep a single row per (circuit, driver) so each
# driver counts once per circuit instead of once per merged lap / stop row.
# NOTE(review): because the metric is per driver, each box shows the overall
# variability of the drivers who raced at that circuit, not how they varied at
# that circuit specifically.
driver_by_circuit = race_results.drop_duplicates(subset=["circuitId", "driverId"])

plt.figure(figsize=(12, 6))
sns.boxplot(x="circuitId", y="driver_position_variability", data=driver_by_circuit)
plt.xticks(rotation=45)
plt.title("Driver Performance Variability by Circuit")
plt.xlabel("Circuit ID")
plt.ylabel("Performance Variability (Lower is Better)")
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load Dataset
# NOTE(review): this rebinds `race_results` to the raw results table, replacing
# the merged / engineered frame built earlier. Cells above cannot be re-run
# after this one without rebuilding that frame.
race_results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")

FEATURE_COLUMNS = ["driverId", "constructorId", "grid", "laps", "fastestLap", "points"]

# Define features and target. `.copy()` gives an independent frame, so the
# conversions below do not write into a slice of `race_results`.
# NOTE(review): `points` and `laps` are presumably outcomes of the same race as
# the target, i.e. known only after the finish - confirm they are legitimate
# predictors and not target leakage.
X = race_results[FEATURE_COLUMNS].copy()
y = race_results["positionOrder"]  # Target variable

# Every feature is a number. A column that pandas read as text (for instance
# because it contains the "\\N" null marker handled in the qualifying cleanup)
# used to be label-encoded, which replaces the numbers with category codes in
# string sort order. Coerce to numeric instead; unparseable cells become NaN.
X = X.apply(pd.to_numeric, errors="coerce")
print(X.dtypes)

# Kept (empty) because the prediction cell iterates over it.
label_encoders = {}

# Split first so that imputation and scaling statistics come from the training
# rows only. The split itself is the same as before (same seed and size).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill missing feature values with the training medians.
train_medians = X_train_raw.median()
X_train_raw = X_train_raw.fillna(train_medians)
X_test_raw = X_test_raw.fillna(train_medians)

# Standardize Features (fit on the training rows, apply everywhere)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X.fillna(train_medians))

# Train Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Evaluate Model
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))


In [None]:
# Four hand-written feature rows used to exercise the trained model.
# The keys must match the feature columns the scaler and model were fitted on.
new_race_data_1 = dict(
    driverId=10,
    constructorId=7,
    grid=18,
    laps=43,
    fastestLap=213.166,
    points=0,
)

# Driver and constructor ids that are assumed not to occur in the training data.
new_race_data_2 = dict(
    driverId=999,
    constructorId=888,
    grid=10,
    laps=50,
    fastestLap=92.345,
    points=5,
)

# Start from grid slot 20 with a short lap count and no points.
new_race_data_3 = dict(
    driverId=5,
    constructorId=3,
    grid=20,
    laps=30,
    fastestLap=95.567,
    points=0,
)

# Start from grid slot 1 with 25 points.
new_race_data_4 = dict(
    driverId=1,
    constructorId=2,
    grid=1,
    laps=53,
    fastestLap=88.999,
    points=25,
)

test_cases = [new_race_data_1, new_race_data_2, new_race_data_3, new_race_data_4]

for i, test_data in enumerate(test_cases):
    # Single-row frame for this case.
    test_df = pd.DataFrame([test_data])

    # Apply every stored label encoder; values the encoder has not seen map to -1.
    for col, le in label_encoders.items():
        def encode_or_unknown(value):
            if value in le.classes_:
                return le.transform([value])[0]
            return -1

        test_df[col] = test_df[col].apply(encode_or_unknown)

    # Scale with the fitted scaler, then predict and report the rounded position.
    test_scaled = scaler.transform(test_df)
    predicted_position = model.predict(test_scaled)
    print(f"Test Case {i+1}: Predicted Finishing Position = {int(round(predicted_position[0]))}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

# ---- Cross-Validation ----
# 5-fold CV on the training split. scikit-learn reports MAE negated (so that a
# larger score is better); the sign is flipped back for printing.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
mae_per_fold = -cv_scores
print("Cross-Validation MAE Scores:", mae_per_fold)
print("Mean Cross-Validation MAE:", mae_per_fold.mean())

# ---- Residual Plot ----
# Residual = actual value minus predicted value on the held-out split.
residuals = y_test - y_pred

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.7)
plt.axhline(y=0, linestyle='--', color='red')
plt.title("Residual Plot")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.show()

# ---- Feature Importances ----
# Bars ordered from the most to the least important feature.
feature_importances = model.feature_importances_
feature_names = X.columns  # the frame the model's features were taken from
indices = np.argsort(feature_importances)[::-1]
bar_positions = range(len(feature_importances))

plt.figure(figsize=(10, 6))
plt.bar(bar_positions, feature_importances[indices], align="center")
plt.xticks(bar_positions, np.array(feature_names)[indices], rotation=90)
plt.title("Feature Importances")
plt.xlabel("Feature Names")
plt.ylabel("Importance Score")
plt.show()


# **PS - 1**

In [None]:
#PS-1
import pandas as pd

results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")
constructors = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructors.csv")

# 0/1 flags: finished first, finished in the top three.
finishing_order = results['positionOrder']
results['win'] = (finishing_order == 1).astype(int)
results['podium'] = (finishing_order <= 3).astype(int)

# Per-driver totals, win ratio and display name.
driver_aggregations = {
    'total_races': ('raceId', 'count'),
    'wins': ('win', 'sum'),
    'podiums': ('podium', 'sum'),
    'total_points': ('points', 'sum'),
}
driver_perf = results.groupby('driverId').agg(**driver_aggregations).reset_index()
driver_perf['win_ratio'] = driver_perf['wins'] / driver_perf['total_races']
driver_perf = driver_perf.merge(drivers[['driverId','forename','surname']], on='driverId', how='left')
driver_perf['driver'] = driver_perf['forename'] + " " + driver_perf['surname']

# Per-constructor totals, win ratio and name.
constructor_aggregations = {
    'total_races': ('raceId', 'count'),
    'wins': ('win', 'sum'),
    'podiums': ('podium', 'sum'),
}
constructor_perf = results.groupby('constructorId').agg(**constructor_aggregations).reset_index()
constructor_perf['win_ratio'] = constructor_perf['wins'] / constructor_perf['total_races']
constructor_perf = constructor_perf.merge(constructors[['constructorId','name']], on='constructorId', how='left')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def _print_ranking(frame, label_col):
    """Print one numbered line per row of `frame`, in row order.

    The number is the row's position in `frame` (1 = first). The DataFrame
    index label was used before, which printed the row's position in the
    unsorted table instead of its rank.
    """
    for rank, (_, row) in enumerate(frame.iterrows(), start=1):
        print(f"{rank}. {row[label_col]} - Win Ratio: {row['win_ratio']:.2%}, Wins: {row['wins']}, Podiums: {row['podiums']}")


def _plot_win_ratio(frame, label_col, title, ylabel, palette):
    """Draw a horizontal bar chart of `win_ratio` per `label_col`, with value labels."""
    plt.figure(figsize=(12, 6))
    sns.barplot(x="win_ratio", y=label_col, data=frame, palette=palette)

    plt.title(title, fontsize=14)
    plt.xlabel("Win Ratio", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.xlim(0, 1)
    plt.grid(axis="x", linestyle="--", alpha=0.5)

    # Write the percentage at the end of each bar.
    for index, value in enumerate(frame["win_ratio"]):
        plt.text(value, index, f"{value:.2%}", va="center", fontsize=10)

    plt.show()


# NOTE(review): ranking by win ratio without a minimum number of races lets
# entrants with very few starts top the list - consider filtering on
# `total_races` first.

# Display top 10 dominant drivers
top_drivers = driver_perf.sort_values('win_ratio', ascending=False).head(10)
print("\n🏎️ **Top 10 Dominant Drivers (by Win Ratio)** 🏆")
_print_ranking(top_drivers, "driver")

# Display top 10 dominant constructors
top_constructors = constructor_perf.sort_values('win_ratio', ascending=False).head(10)
print("\n🏎️ **Top 10 Dominant Constructors (by Win Ratio)** 🏆")
_print_ranking(top_constructors, "name")

_plot_win_ratio(top_drivers, "driver", "🏎️ Top 10 Dominant Drivers (Win Ratio)", "Driver", "coolwarm")
_plot_win_ratio(top_constructors, "name", "🏎️ Top 10 Dominant Constructors (Win Ratio)", "Constructor", "magma")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")

# Flag wins and podium finishes on a temporary copy, then total them per driver.
flagged_results = results.assign(
    is_win=(results['positionOrder'] == 1).astype(int),
    is_podium=(results['positionOrder'] <= 3).astype(int),
)
driver_stats = (
    flagged_results.groupby('driverId')
    .agg(
        career_races=('raceId', 'nunique'),
        wins=('is_win', 'sum'),
        podiums=('is_podium', 'sum'),
        total_points=('points', 'sum'),
    )
    .reset_index()
)

# Pairwise correlation between career length and the three success measures.
stat_columns = ['career_races', 'wins', 'podiums', 'total_points']
corr = driver_stats[stat_columns].corr()

# Heatmap of correlations
plt.figure(figsize=(8, 6))
sns.heatmap(data=corr, cmap="coolwarm", annot=True, linewidths=0.5, fmt=".2f")

plt.title("Correlation: Career Longevity vs Wins, Podiums, and Total Points", fontsize=14)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()



# **PS - 2**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")

# Compute position change (positive = finished ahead of the starting slot)
# NOTE(review): if the dataset encodes "no grid slot" as grid == 0, those rows
# produce large negative changes - confirm and exclude them if so.
results['position_change'] = results['grid'] - results['positionOrder']

# Per-driver averages.
# Fix: the table used to be merged with every distinct (driverId, grid) pair,
# which repeated each driver once per grid slot they had ever started from, so
# the "top 10" listed the same drivers several times. The average grid slot is
# now aggregated alongside the other figures, giving one row per driver.
driver_position_gain = results.groupby('driverId').agg(
    avg_position_gain=('position_change', 'mean'),
    avg_grid=('grid', 'mean'),
    total_races=('raceId', 'count')
).reset_index()

# Merge with driver names
driver_position_gain = pd.merge(driver_position_gain, drivers[['driverId', 'forename', 'surname']], on='driverId', how='left')
driver_position_gain['driver'] = driver_position_gain['forename'] + " " + driver_position_gain['surname']

# Drivers Who Excel at Gaining Positions
# NOTE(review): there is no minimum on `total_races`, so a driver with a single
# good race can lead this list.
top_movers = driver_position_gain.sort_values('avg_position_gain', ascending=False).head(10)
print("Top 10 Drivers Who Gain the Most Positions:")
print(top_movers[['driver', 'avg_grid', 'avg_position_gain', 'total_races']])

plt.figure(figsize=(12, 6))
sns.boxplot(x=results["grid"], y=results["position_change"], palette="coolwarm")
plt.title("Distribution of Position Gains/Losses by Grid Position", fontsize=14)
plt.xlabel("Starting Grid Position", fontsize=12)
plt.ylabel("Position Change (Gain/Loss)", fontsize=12)
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()


# **PS - 3**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load datasets
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
pit_stops = pd.read_csv("/content/drive/My Drive/DPL_Datasets/pit_stops.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")

entry_keys = ['raceId', 'driverId']
finishing_positions = results[entry_keys + ['positionOrder']]

# Attach the finishing position to every recorded pit stop.
race_pit_data = pit_stops.merge(finishing_positions, on=entry_keys)

# Number of pit-stop rows per driver per race.
pit_counts = (
    race_pit_data.groupby(entry_keys)
    .agg(total_pit_stops=('stop', 'count'))
    .reset_index()
)

# Stop counts next to the finishing position of the same entry.
race_analysis = pit_counts.merge(finishing_positions, on=entry_keys)

# Finishing position distribution for each number of stops.
plt.figure(figsize=(12, 6))
sns.boxplot(data=race_analysis, x="total_pit_stops", y="positionOrder", palette="coolwarm")
plt.title("Pit Stops vs. Final Race Position")
plt.xlabel("Number of Pit Stops")
plt.ylabel("Final Race Position (Lower is Better)")
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()

# Pit stop duration in seconds (the source column is in milliseconds).
race_pit_data['pit_stop_time'] = pd.to_numeric(race_pit_data['milliseconds']) / 1000
pit_efficiency = (
    race_pit_data.groupby('driverId')
    .agg(avg_pit_time=('pit_stop_time', 'mean'), total_pit_stops=('stop', 'count'))
    .reset_index()
)

# Add a display name for each driver.
pit_efficiency = pit_efficiency.merge(drivers[['driverId', 'forename', 'surname']], on='driverId', how='left')
pit_efficiency['driver'] = pit_efficiency['forename'] + " " + pit_efficiency['surname']

# The ten drivers with the lowest average pit stop time.
fastest_pit_stops = pit_efficiency.sort_values("avg_pit_time").head(10)
print("Top 10 Fastest Pit Stop Drivers:")
print(fastest_pit_stops[['driver', 'avg_pit_time', 'total_pit_stops']])

# Horizontal bars, one per driver.
plt.figure(figsize=(12, 6))
sns.barplot(data=fastest_pit_stops, y="driver", x="avg_pit_time", palette="coolwarm", edgecolor="black")
plt.title("Top 10 Fastest Pit Stop Drivers", fontsize=14)
plt.xlabel("Average Pit Stop Time (Seconds)", fontsize=12)
plt.ylabel("Driver", fontsize=12)
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()


# **PS - 4**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the datasets
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")

# Thresholds that define a "significant" and a "competitive" rivalry.
MIN_SHARED_RACES = 100       # a pair must have met in MORE than this many races
WIN_RATIO_TOLERANCE = 0.05   # max distance of driver A's win ratio from 0.5

# Self-join on raceId to pair up every two entries of the same race.
# Only the columns used below are joined, which keeps the pair table small.
finishes = results[["raceId", "driverId", "positionOrder"]]
rivalry_df = finishes.merge(finishes, on="raceId", suffixes=("_A", "_B"))

# Keep every unordered pair once (the lower driverId is "driver A").
# This drops self-pairs and the mirrored (B, A) duplicate of each (A, B) row.
rivalry_df = rivalry_df[rivalry_df["driverId_A"] < rivalry_df["driverId_B"]].copy()

# Determine who finished ahead (lower positionOrder wins)
rivalry_df["driver_A_wins"] = (rivalry_df["positionOrder_A"] < rivalry_df["positionOrder_B"]).astype(int)

# Group by driver pair to count head-to-head wins
head_to_head = rivalry_df.groupby(["driverId_A", "driverId_B"]).agg(
    races_competed=("raceId", "count"),
    driver_A_wins=("driver_A_wins", "sum")
).reset_index()

# Compute win ratio of driver A over driver B
head_to_head["driver_A_win_ratio"] = head_to_head["driver_A_wins"] / head_to_head["races_competed"]

# Filter for significant rivalries (minimum races together)
head_to_head = head_to_head[head_to_head["races_competed"] > MIN_SHARED_RACES].copy()

# Attach driver names for both sides of the pair
drivers["driver"] = drivers["forename"] + " " + drivers["surname"]
name_lookup = drivers[["driverId", "driver"]]
for side in ("A", "B"):
    head_to_head = head_to_head.merge(
        name_lookup.rename(columns={"driverId": f"driverId_{side}", "driver": f"Driver_{side}"}),
        on=f"driverId_{side}",
        how="left",
    )

# Create driver pair column
head_to_head["driver_pair"] = head_to_head["Driver_A"] + " vs. " + head_to_head["Driver_B"]

# Filter for competitive rivalries: win ratio within the tolerance band around 50%.
# An exact `== 0.5` comparison would only keep perfectly tied pairs.
head_to_head["distance_from_even"] = (head_to_head["driver_A_win_ratio"] - 0.5).abs()
competitive_rivalries = (
    head_to_head[head_to_head["distance_from_even"] <= WIN_RATIO_TOLERANCE]
    .sort_values("distance_from_even")
)
print("Most Competitive F1 Rivalries (Head-to-Head Win Ratios):")
print(competitive_rivalries[["driver_pair", "races_competed", "driver_A_win_ratio"]])

# Plot head-to-head win ratios (skipped when no pair satisfies the filters)
if competitive_rivalries.empty:
    print("No rivalries satisfy the current thresholds; nothing to plot.")
else:
    plt.figure(figsize=(12, 6))
    sns.barplot(x="driver_pair", y="driver_A_win_ratio", data=competitive_rivalries, palette="coolwarm")
    plt.axhline(0.5, color="black", linestyle=":", linewidth=1)  # the 50-50 reference line

    plt.title("Most Competitive F1 Rivalries (Win Ratios Close to 50%)")
    plt.xlabel("Driver Rivalry")
    plt.ylabel("Win Ratio of Driver A")
    plt.xticks(rotation=90, ha="right")
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()


# **PS - 5**

In [None]:
import pandas as pd

# Read the three source tables used by the driver-swap experiment
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")
constructors = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructors.csv")

# Points summed per (constructor, driver), ordered so that each constructor's
# highest scorer is the first row of its group
top_drivers = (
    results.groupby(["constructorId", "driverId"], as_index=False)["points"].sum()
    .sort_values(["constructorId", "points"], ascending=[True, False])
)

# The first row of every constructor group is that team's best driver
top_driver_per_team = top_drivers.groupby("constructorId", as_index=False).first()

# Attach the driver's full name and the constructor's name
driver_labels = drivers[["driverId", "forename", "surname"]]
constructor_labels = constructors[["constructorId", "name"]].rename(columns={"name": "team_name"})
top_driver_per_team = (
    top_driver_per_team
    .merge(driver_labels, on="driverId", how="left")
    .assign(driver_name=lambda frame: frame["forename"] + " " + frame["surname"])
    .merge(constructor_labels, on="constructorId", how="left")
)

# Display top drivers per team
print(top_driver_per_team[["team_name", "driver_name", "points"]])


In [None]:
# Baseline constructor table: total points per constructor, highest first
constructor_standings_before = (
    results.groupby("constructorId", as_index=False)["points"].sum()
    .merge(constructors[["constructorId", "name"]], on="constructorId", how="left")
    .rename(columns={"points": "team_points_before", "name": "team_name"})
    .sort_values(by="team_points_before", ascending=False)
    .reset_index(drop=True)
)

# Baseline driver table: total points per driver, highest first
driver_standings_before = (
    results.groupby("driverId", as_index=False)["points"].sum()
    .merge(drivers[["driverId", "forename", "surname"]], on="driverId", how="left")
    .assign(driver_name=lambda frame: frame["forename"] + " " + frame["surname"])
    .rename(columns={"points": "driver_points_before"})
    .sort_values(by="driver_points_before", ascending=False)
    .reset_index(drop=True)
)

# Show the ten best rows of each baseline table
print("Constructor Standings Before Swap:")
print(constructor_standings_before[["team_name", "team_points_before"]].head(10))

print("\nDriver Standings Before Swap:")
print(driver_standings_before[["driver_name", "driver_points_before"]].head(10))


In [None]:
# Teams taking part in the swap: rows 0 and 1 of the per-team best-driver table
team_A, team_B = (top_driver_per_team.iloc[row] for row in (0, 1))

swap_message = f"Swapping {team_A['driver_name']} (from {team_A['team_name']}) with {team_B['driver_name']} (from {team_B['team_name']})"
print(swap_message)


In [None]:
# Work on a copy so the original results stay available for the "before" tables
swapped_results = results.copy()

# Select only the rows in which each driver actually drove for the team being swapped.
# Matching on driverId alone would also move the results the driver scored for
# every other constructor, draining points from teams that are not part of the swap.
# Both masks are built before any write so the two assignments cannot interfere.
driver_A_at_team_A = (
    (swapped_results["driverId"] == team_A["driverId"])
    & (swapped_results["constructorId"] == team_A["constructorId"])
)
driver_B_at_team_B = (
    (swapped_results["driverId"] == team_B["driverId"])
    & (swapped_results["constructorId"] == team_B["constructorId"])
)

# Swap team associations for the selected drivers
swapped_results.loc[driver_A_at_team_A, "constructorId"] = team_B["constructorId"]
swapped_results.loc[driver_B_at_team_B, "constructorId"] = team_A["constructorId"]


In [None]:
# Constructor totals recomputed from the swapped results, highest first
team_standings = (
    swapped_results.groupby("constructorId", as_index=False)["points"].sum()
    .merge(constructors, on="constructorId", how="left")
    .rename(columns={"name": "team_name"})
    .sort_values("points", ascending=False)
)

# Driver totals recomputed from the swapped results, highest first
driver_standings = (
    swapped_results.groupby("driverId", as_index=False)["points"].sum()
    .merge(drivers, on="driverId", how="left")
    .assign(driver_name=lambda frame: frame["forename"] + " " + frame["surname"])
    .sort_values("points", ascending=False)
)

# Print the ten best rows of each post-swap table
print("\nTeam Standings After Swap:")
print(team_standings[["team_name", "points"]].head(10))

print("\nDriver Standings After Swap:")
print(driver_standings[["driver_name", "points"]].head(10))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Merge before and after standings for teams.
# The join key is constructorId rather than the display name: names are not
# guaranteed to be unique, and a duplicated name would multiply rows in the merge.
team_comparison = constructor_standings_before.merge(
    team_standings[["constructorId", "points"]], on="constructorId"
)

# Merge before and after standings for drivers (keyed on driverId for the same reason)
driver_comparison = driver_standings_before.merge(
    driver_standings[["driverId", "points"]], on="driverId"
)

# Limit to Top 10 Teams & Drivers (row order follows the "before" tables)
team_comparison = team_comparison.head(10)
driver_comparison = driver_comparison.head(10)

# Set up figure: teams on the left, drivers on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# **Team Standings Bar Plot**
# Long format: one row per (team, before/after) so seaborn can colour by status
team_comparison_melted = team_comparison.melt(id_vars="team_name", value_vars=["team_points_before", "points"],
                                              var_name="Status", value_name="Total Points")

sns.barplot(x="Total Points", y="team_name", hue="Status", data=team_comparison_melted, palette=["#FF9999", "#66B2FF"], ax=axes[0])
axes[0].set_title("Team Standings: Before vs. After Swap")
axes[0].set_xlabel("Total Points")
axes[0].set_ylabel("Team Name")

# **Driver Standings Bar Plot**
driver_comparison_melted = driver_comparison.melt(id_vars="driver_name", value_vars=["driver_points_before", "points"],
                                                  var_name="Status", value_name="Total Points")

sns.barplot(x="Total Points", y="driver_name", hue="Status", data=driver_comparison_melted, palette=["#FF9999", "#66B2FF"], ax=axes[1])
axes[1].set_title("Driver Standings: Before vs. After Swap")
axes[1].set_xlabel("Total Points")
axes[1].set_ylabel("Driver Name")

# Show the plots
plt.tight_layout()
plt.show()


# **PS - 6**

In [None]:
#Driver Movements & Team Networks:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Load Dataset
race_results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")

# Merge race year into race_results
race_results = race_results.merge(races[['raceId', 'year']], on='raceId', how='left')

# Select the season range to analyse (inclusive on both ends)
start_year, end_year = 2000, 2020
race_results = race_results[(race_results['year'] >= start_year) & (race_results['year'] <= end_year)]

# Season totals. Taking idxmax over the raw rows would return a single race
# result (one row with the year's highest per-race score), not the driver who
# scored the most points over the whole season.
points_by_entry = race_results.groupby(['year', 'driverId', 'constructorId'], as_index=False)['points'].sum()
season_totals = points_by_entry.groupby(['year', 'driverId'], as_index=False)['points'].sum()
season_leaders = season_totals.loc[season_totals.groupby('year')['points'].idxmax(), ['year', 'driverId']]

# A driver may appear with more than one constructor in a season; use the
# constructor with which they scored the most points that year.
main_team = (
    points_by_entry.sort_values('points', ascending=False)
    .drop_duplicates(['year', 'driverId'])
)
top_drivers_per_year = (
    season_leaders
    .merge(main_team[['year', 'driverId', 'constructorId']], on=['year', 'driverId'], how='left')
    .sort_values('year')
)
top_drivers_per_year = top_drivers_per_year[['year', 'driverId', 'constructorId']]

# Create Graph
G = nx.DiGraph()

# Add edges (Driver -> Team per year)
for row in top_drivers_per_year.itertuples(index=False):
    year, driver, team = int(row.year), int(row.driverId), int(row.constructorId)
    driver_label = f"Driver {driver} ({year})"
    team_label = f"Team {team} ({year})"
    G.add_edge(driver_label, team_label)

# Draw Network Graph using a seeded spring layout (reproducible positions)
plt.figure(figsize=(16, 10))
pos = nx.spring_layout(G, seed=42)  # Layout positioning

# Draw nodes, curved edges and boxed labels
nx.draw_networkx_nodes(G, pos, node_size=800, node_color="skyblue", alpha=0.8)
nx.draw_networkx_edges(G, pos, edge_color="gray", alpha=0.6, connectionstyle="arc3,rad=0.1")  # Curved edges
nx.draw_networkx_labels(G, pos, font_size=9, font_weight="bold", verticalalignment='center', bbox=dict(facecolor="white", alpha=0.6, edgecolor="none"))

# Title and display
plt.title(f"Top Performers' Movements Across Teams ({start_year}-{end_year})", fontsize=14)
plt.show()


# **PS - 7**

In [None]:
#Team Performance Comparison:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load Datasets
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")
constructors = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructors.csv")

# Merge team (constructor) names
results = results.merge(constructors[['constructorId', 'name']], on='constructorId', how='left')

# Merge race year and circuitId
results = results.merge(races[['raceId', 'year', 'circuitId']], on='raceId', how='left')

# Filter Data for Specific Years (e.g., 2000-2010)
start_year = 2000
end_year = 2010
results = results[(results['year'] >= start_year) & (results['year'] <= end_year)]

# Filter for P1 finishes (Wins)
winners = results[results['positionOrder'] == 1]

# Compute success rate (Without Circuit Factor).
# The denominator is the number of distinct races a team entered. Counting
# result rows would count every car separately and understate the win rate of
# any team that enters more than one car per race.
team_total_races = results.groupby('name')['raceId'].nunique()
team_win_counts = winners['name'].value_counts().reindex(team_total_races.index, fill_value=0)
team_win_rates = (team_win_counts / team_total_races) * 100  # Convert to percentage

# Select only the top 10 teams for clarity
top_teams = team_win_rates.nlargest(10).index
team_win_rates = team_win_rates.loc[top_teams]

# Compute success rate (With Circuit Factor) for Top Teams Only
top_team_results = results[results['name'].isin(top_teams)]
# Races entered per (circuit, team); NaN marks circuits the team never raced at
circuit_total_races = top_team_results.groupby(['circuitId', 'name'])['raceId'].nunique().unstack()
circuit_wins = (
    winners[winners['name'].isin(top_teams)]
    .groupby(['circuitId', 'name']).size().unstack()
    .reindex(index=circuit_total_races.index, columns=circuit_total_races.columns)
    .fillna(0)  # raced there but never won -> 0 wins
)
# NaN is kept where a team never raced, so those circuits are not averaged in as 0%
circuit_win_rates = (circuit_wins / circuit_total_races) * 100  # Convert to percentage

# Average per-circuit win rate of each team (mean skips the NaN cells)
avg_circuit_win_rates = circuit_win_rates.mean(axis=0).reindex(top_teams).fillna(0)

# Grouped Bar Chart
x = np.arange(len(top_teams))  # Bar positions
width = 0.4  # Width of bars

plt.figure(figsize=(14, 7))

# Overall Win Rate
plt.bar(x - width/2, team_win_rates.values, width, label="Win Rate without circuit", color="royalblue")

# Circuit-Specific Win Rate
plt.bar(x + width/2, avg_circuit_win_rates.values, width, label="Win Rate with circuit", color="orange")

# Formatting
plt.xticks(x, top_teams, rotation=45, ha="right")
plt.ylabel("Win Rate (%)")
plt.title(f"Win Rates Comparison: With vs Without Circuit Factor ({start_year}-{end_year})")
plt.legend()
plt.show()


# **PS - 8**

In [None]:
#Driver Consistency in Race Performance:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load Dataset
race_results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")

# Minimum number of race entries a driver needs to be ranked. Without it the
# ranking is dominated by drivers with one or two starts (a single podium in a
# single race yields a 100% top-finish rate).
MIN_RACES = 20

# Keep relevant columns (.copy() so the new column below is not written to a slice)
race_results = race_results[['driverId', 'raceId', 'positionOrder']].copy()

# Define a top finish (Top 3)
race_results['top_finish'] = race_results['positionOrder'] <= 3

# Calculate consistency metrics
driver_stats = race_results.groupby('driverId').agg(
    total_races=('raceId', 'count'),
    top_finishes=('top_finish', 'sum'),
    avg_position=('positionOrder', 'mean'),
    position_std=('positionOrder', 'std')  # Standard deviation of finishing positions
).reset_index()

# Compute Top Finish Rate (fraction of races finished in the Top 3)
driver_stats['top_finish_rate'] = driver_stats['top_finishes'] / driver_stats['total_races']

# Identify consistent vs. inconsistent drivers among those with enough races
eligible_drivers = driver_stats[driver_stats['total_races'] >= MIN_RACES]
consistent_drivers = eligible_drivers.sort_values(by='top_finish_rate', ascending=False).head(10)
inconsistent_drivers = eligible_drivers.sort_values(by='position_std', ascending=False).head(10)

# Plotting Consistent Drivers
plt.figure(figsize=(12, 6))
plt.bar(consistent_drivers['driverId'].astype(str), consistent_drivers['top_finish_rate'], color='green', alpha=0.7)
plt.xlabel("Driver ID")
plt.ylabel("Top Finish Rate")
plt.title("Top 10 Most Consistent Drivers (Top 3 Finish Rate)")
plt.xticks(rotation=45)
plt.show()


# Select top drivers based on total races
top_drivers = driver_stats.sort_values(by="total_races", ascending=False).head(10)['driverId']

# Filter results for top drivers
top_driver_results = race_results[race_results['driverId'].isin(top_drivers)]

# Spread of finishing positions for the ten most experienced drivers
plt.figure(figsize=(12, 6))
sns.boxplot(x=top_driver_results["driverId"].astype(str), y=top_driver_results["positionOrder"], palette="coolwarm")
plt.xlabel("Driver ID")
plt.ylabel("Finishing Position (Lower is Better)")
plt.title("Driver Performance Consistency (Top 10 Drivers by Races)")
plt.gca().invert_yaxis()  # Flip so 1st place is at the top
plt.show()


# **PS - 9**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the four source tables for the lap-time analysis
laps = pd.read_csv("/content/drive/My Drive/DPL_Datasets/lap_times.csv")
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
constructors = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructors.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")

# Enrich every lap with its circuit (via races), the constructor the driver
# raced for (via results) and that constructor's name
entry_constructors = results[["raceId", "driverId", "constructorId"]]
lap_data = (
    laps
    .merge(races[["raceId", "circuitId"]], on="raceId", how="left")
    .merge(entry_constructors, on=["raceId", "driverId"], how="left")
    .merge(constructors[["constructorId", "name"]], on="constructorId", how="left")
)

# Lap times as numbers; values that cannot be parsed become NaN
lap_data["milliseconds"] = pd.to_numeric(lap_data["milliseconds"], errors="coerce")


In [None]:
# Mean lap time of every team at every circuit, ordered fastest-first within a circuit
team_efficiency = (
    lap_data.groupby(["circuitId", "name"], as_index=False)
    .agg(avg_lap_time=("milliseconds", "mean"))
    .rename(columns={"name": "team_name"})
    .sort_values(["circuitId", "avg_lap_time"])
)


In [None]:
# Row label of the lowest average lap time within each circuit
fastest_row_per_circuit = team_efficiency.groupby("circuitId")["avg_lap_time"].idxmin()

# Circuit names, renamed up front so the merge yields a `circuit_name` column
circuits = pd.read_csv("/content/drive/My Drive/DPL_Datasets/circuits.csv")
circuit_labels = circuits[["circuitId", "name"]].rename(columns={"name": "circuit_name"})

# One row per circuit: its fastest team, that team's average lap time, the circuit name
best_teams_per_circuit = team_efficiency.loc[fastest_row_per_circuit].merge(
    circuit_labels, on="circuitId", how="left"
)


In [None]:
# Horizontal bars: one per circuit, coloured by the team that was fastest there
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x="avg_lap_time",
    y="circuit_name",
    hue="team_name",
    data=best_teams_per_circuit,
    palette="coolwarm",
    ax=ax,
)

ax.set_title("Most Efficient Teams Across Circuits (Fastest Average Lap Times)")
ax.set_xlabel("Average Lap Time (ms)")
ax.set_ylabel("Circuit Name")
ax.legend(title="Team")
ax.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()


In [None]:
# Mean of the per-circuit average lap times for each team, fastest first.
# NOTE(review): the input is best_teams_per_circuit, so a team's mean only covers
# the circuits where it was the fastest team, and lap times of circuits of
# different lengths are averaged together. Confirm this is the intended metric.
team_avg_lap_times = (
    best_teams_per_circuit.groupby("team_name", as_index=False)["avg_lap_time"].mean()
    .sort_values("avg_lap_time")
)

# Horizontal bar per team
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="avg_lap_time", y="team_name", data=team_avg_lap_times, palette="coolwarm", ax=ax)

ax.set_title("Most Efficient Teams (Lowest Average Lap Time)")
ax.set_xlabel("Average Lap Time (ms)")
ax.set_ylabel("Team Name")
ax.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()

# The first row after the ascending sort is the team with the lowest mean
most_efficient_team = team_avg_lap_times.iloc[0]
print(f"Most Efficient Team: {most_efficient_team['team_name']} \n Average lap time: {most_efficient_team['avg_lap_time']:.2f} ms")


# **PS - 10**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Source tables
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")

# Restrict to results from the 2020 season onwards
results = results.merge(races[['raceId', 'year']], on='raceId', how='left')
recent_results = results[results['year'] >= 2020]

# Flag wins (P1) and podiums (P1-P3) per result row so they can simply be summed
finish_flags = recent_results.assign(
    is_win=recent_results['positionOrder'] == 1,
    is_podium=recent_results['positionOrder'] <= 3,
)

# Per-driver totals: points, wins, podiums and number of distinct races
driver_perf = finish_flags.groupby('driverId').agg(
    total_points=('points', 'sum'),
    wins=('is_win', 'sum'),
    podiums=('is_podium', 'sum'),
    races=('raceId', 'nunique')
).reset_index()

# Attach names, then score each driver: points + 50 per win + 20 per podium
driver_perf = driver_perf.merge(drivers[['driverId', 'forename', 'surname']], on='driverId', how='left')
driver_perf['driver'] = driver_perf['forename'] + " " + driver_perf['surname']
win_bonus = 50 * driver_perf['wins']
podium_bonus = 20 * driver_perf['podiums']
driver_perf['composite_score'] = driver_perf['total_points'] + win_bonus + podium_bonus

# The four highest composite scores form the lineup
best_lineup = driver_perf.sort_values('composite_score', ascending=False).head(4)

print("Best Team Lineup (Top 4 Drivers):")
print(best_lineup[['driver', 'total_points', 'wins', 'podiums', 'races', 'composite_score']])




# **PS - 11**

In [None]:
#PREDICTION FOR 2025 SEASONS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Source tables
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")
constructors = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructors.csv")

# Restrict to results from the 2020 season onwards
results = results.merge(races[['raceId', 'year']], on='raceId', how='left')
recent_results = results[results['year'] >= 2020]

# --- Drivers: points per season, then the mean over each driver's seasons ---
driver_points = recent_results.groupby(['year', 'driverId'], as_index=False)['points'].sum()
driver_avg = (
    driver_points.groupby('driverId', as_index=False)['points'].mean()
    .rename(columns={'points': 'avg_points'})
    .merge(drivers[['driverId', 'forename', 'surname']], on='driverId', how='left')
    .assign(driver=lambda frame: frame['forename'] + " " + frame['surname'])
)

# Highest average first: the top row is the prediction, the top ten are plotted
driver_ranking = driver_avg.sort_values('avg_points', ascending=False)
predicted_driver = driver_ranking.iloc[0]

print("Predicted Drivers' Champion for 2025:", predicted_driver['driver'])
print("Average Points:", predicted_driver['avg_points'])

top10_drivers = driver_ranking.head(10)
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=top10_drivers, x='avg_points', y='driver', palette='viridis', ax=ax)
ax.set_title("Top 10 Drivers by Average Points (2020+)\nPredicted Champion: " + predicted_driver['driver'])
ax.set_xlabel("Average Points")
ax.set_ylabel("Driver")
plt.show()

# --- Constructors: same procedure keyed on constructorId ---
constructor_points = recent_results.groupby(['year', 'constructorId'], as_index=False)['points'].sum()
constructor_avg = (
    constructor_points.groupby('constructorId', as_index=False)['points'].mean()
    .rename(columns={'points': 'avg_points'})
    .merge(constructors[['constructorId', 'name']], on='constructorId', how='left')
)

constructor_ranking = constructor_avg.sort_values('avg_points', ascending=False)
predicted_constructor = constructor_ranking.iloc[0]

print("Predicted Constructors' Champion for 2025:", predicted_constructor['name'])
print("Average Points:", predicted_constructor['avg_points'])

top10_constructors = constructor_ranking.head(10)
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=top10_constructors, x='avg_points', y='name', palette='magma', ax=ax)
ax.set_title("Top 10 Constructors by Average Points (2020+)\nPredicted Champion: " + predicted_constructor['name'])
ax.set_xlabel("Average Points")
ax.set_ylabel("Constructor")
plt.show()


# **PS - 12**

In [None]:
#Struggling Teams Analysis:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Source tables
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")
constructors = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructors.csv")

# Restrict to results from the 2020 season onwards
results = results.merge(races[['raceId','year']], on='raceId', how='left')
recent_results = results[results['year'] >= 2020]

# Points per constructor per season, then the mean over each constructor's seasons
constructor_points = recent_results.groupby(['year','constructorId'], as_index=False)['points'].sum()
constructor_avg = (
    constructor_points.groupby('constructorId', as_index=False)['points'].mean()
    .rename(columns={'points':'avg_points'})
    .merge(constructors[['constructorId','name']], on='constructorId', how='left')
)

# Lowest average first: the top row is the predicted underperformer
lowest_first = constructor_avg.sort_values('avg_points', ascending=True)
predicted_struggling = lowest_first.iloc[0]

fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=lowest_first, x='avg_points', y='name', palette='coolwarm', ax=ax)
ax.set_title("Average Points of Constructors (2020+)\nPredicted Underperformer: " + predicted_struggling['name'])
ax.set_xlabel("Average Points")
ax.set_ylabel("Constructor")
plt.show()


# **PS - 13**

In [None]:
#Driver-Specific Track Struggles
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Source tables
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")
circuits = pd.read_csv("/content/drive/My Drive/DPL_Datasets/circuits.csv")

# Mean finishing position of every driver at every circuit
results_races = results.merge(races[['raceId','circuitId']], on='raceId', how='left')
driver_circuit = results_races.groupby(['driverId','circuitId'], as_index=False)['positionOrder'].mean()

driver_id = 1  # Change as needed

# Rows of the chosen driver, labelled with circuit names, best circuit first
is_selected_driver = driver_circuit['driverId'] == driver_id
driver_perf = (
    driver_circuit[is_selected_driver]
    .merge(circuits[['circuitId','name']], on='circuitId', how='left')
    .sort_values('positionOrder')
)

fig, ax = plt.subplots(figsize=(24,6))
sns.barplot(data=driver_perf, x='name', y='positionOrder', palette='viridis', ax=ax)
plt.xticks(rotation=90)
ax.set_title("Driver " + str(driver_id) + " Average Finishing Positions by Circuit")
ax.set_xlabel("Circuit")
ax.set_ylabel("Average Finishing Position")
plt.show()


# **PS - 14**

In [None]:
#Championship Retention Probability
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Source tables
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")

# Season champion = driver with the highest summed race points in a year
results = results.merge(races[['raceId','year']], on='raceId', how='left')
driver_points = results.groupby(['year','driverId'])['points'].sum().reset_index()
season_champs = driver_points.loc[driver_points.groupby('year')['points'].idxmax()].sort_values('year')

# Look up the champion of the preceding calendar year. Seasons without one
# (the first season in the data, or a season after a gap) get NaN.
champion_by_year = season_champs.set_index('year')['driverId']
season_champs['previous_champion'] = (season_champs['year'] - 1).map(champion_by_year)
season_champs['retained'] = season_champs['driverId'] == season_champs['previous_champion']

# Only seasons that had a reigning champion count as title defences. Including
# the first season would record it as "not retained" and bias the rate downwards.
title_defences = season_champs[season_champs['previous_champion'].notna()]
retention_rate = title_defences['retained'].mean()

retained_count = title_defences['retained'].sum()
not_retained_count = len(title_defences) - retained_count
labels = ['Retained', 'Not Retained']
sizes = [retained_count, not_retained_count]

plt.figure(figsize=(6,6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=['#66b3ff', '#ff9999'])
plt.title("Championship Retention Distribution")
plt.show()

# Points total of each season's champion over time
plt.figure(figsize=(10,6))
sns.lineplot(data=season_champs, x='year', y='points', marker='o', color='darkgreen')
plt.title("Champion Points over Seasons")
plt.xlabel("Year")
plt.ylabel("Champion Points")
plt.show()

print("Historical Championship Retention Rate:", retention_rate)


# **PS - 15**

In [None]:
#Champion Age Trends
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Source tables
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")
drivers = pd.read_csv("/content/drive/My Drive/DPL_Datasets/drivers.csv")

# Season champion = driver with the highest summed race points in a year
results = results.merge(races[['raceId','year']], on='raceId', how='left')
driver_points = results.groupby(['year','driverId'], as_index=False)['points'].sum()
champion_rows = driver_points.groupby('year')['points'].idxmax()
season_champs = driver_points.loc[champion_rows].sort_values('year')

# Birth dates as datetimes (unparseable values become NaT), joined onto the champions
drivers['dob'] = pd.to_datetime(drivers['dob'], errors='coerce')
season_champs = season_champs.merge(drivers[['driverId','dob']], on='driverId', how='left')

# Age is the difference between the season year and the birth year;
# decade is the season year rounded down to a multiple of ten
season_champs = season_champs.assign(
    age=season_champs['year'] - season_champs['dob'].dt.year,
    decade=season_champs['year'] - season_champs['year'] % 10,
)

fig, ax = plt.subplots(figsize=(10,6))
sns.boxplot(x='decade', y='age', data=season_champs, palette='Set3', ax=ax)
ax.set_title("Champion Age Trends by Decade")
ax.set_xlabel("Decade")
ax.set_ylabel("Champion Age")
plt.show()

# Summary statistics of champion age per decade
age_trends = season_champs.groupby('decade')['age'].describe()
print("Champion Age Trends by Decade:")
print(age_trends)


# **PS - 16**

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Source tables
results = pd.read_csv("/content/drive/My Drive/DPL_Datasets/results.csv")
races = pd.read_csv("/content/drive/My Drive/DPL_Datasets/races.csv")
constructors = pd.read_csv("/content/drive/My Drive/DPL_Datasets/constructors.csv")

# One row per distinct (year, driver, constructor), ordered by driver then year,
# collapsed into one list of constructor ids per driver
results = results.merge(races[['raceId', 'year']], on='raceId', how='left')
driver_teams = (
    results[['year', 'driverId', 'constructorId']]
    .drop_duplicates()
    .sort_values(['driverId', 'year'])
)
driver_transitions = driver_teams.groupby('driverId')['constructorId'].apply(list).reset_index()

# Directed graph: an edge u -> v for every consecutive pair in a driver's list,
# weighted by how often that pair occurs. A driver who stays with the same
# constructor in consecutive rows therefore adds to a u -> u edge.
G = nx.DiGraph()
transition_counts = {}  # outgoing pair count per source constructor

for teams in driver_transitions['constructorId']:
    # zip over a single-element list yields nothing, so one-team drivers add no edges
    for source, target in zip(teams, teams[1:]):
        if not G.has_edge(source, target):
            G.add_edge(source, target, weight=1)
        else:
            G[source][target]['weight'] += 1
        transition_counts[source] = transition_counts.get(source, 0) + 1

# Constructor id -> name; ids missing from the table fall back to the id as text
team_names = constructors.set_index('constructorId')['name'].to_dict()
labels = {}
for node in G.nodes():
    labels[node] = team_names.get(node, str(node))

# Seeded spring layout; 'k' controls node spacing
pos = nx.spring_layout(G, k=1.2, seed=42)

# Node area grows with the number of outgoing pairs (minimum factor 1)
node_sizes = [50 * transition_counts.get(node, 1) for node in G.nodes()]

# Edge width follows the weight; edges with weight of 5 or more are drawn black
edges = G.edges(data=True)
edge_weights = [attributes['weight'] for _, _, attributes in edges]
edge_widths = [weight * 0.3 for weight in edge_weights]
edge_colors = ['black' if weight >= 5 else 'gray' for weight in edge_weights]

# Cycle through the 20 colours of the 'tab20' map, one per node
cmap = plt.get_cmap('tab20')
color_map = {}
for position, node in enumerate(G.nodes()):
    color_map[node] = cmap(position % 20)
node_colors = [color_map[node] for node in G.nodes()]

# Draw nodes, then arrows
plt.figure(figsize=(16, 12))
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, alpha=0.85)
nx.draw_networkx_edges(
    G, pos,
    arrowstyle='->',
    arrowsize=15,
    edge_color=edge_colors,
    width=edge_widths,
    alpha=0.6,
)

# Label only constructors with more than five outgoing pairs to limit clutter
important_labels = {
    node: name for node, name in labels.items() if transition_counts.get(node, 0) > 5
}
nx.draw_networkx_labels(G, pos, important_labels, font_size=10, font_color='black')

plt.title("Driver Team Transfer Trends", fontsize=18, fontweight='bold')
plt.axis('off')
plt.tight_layout()
plt.show()


In [None]:
##Markov chain future predictions
# First-order model over the team-transition graph G built in the previous cell:
# row i of the matrix holds the share of team i's outgoing edge weight that goes to team j.

team_list = list(G.nodes())
transition_matrix = np.zeros((len(team_list), len(team_list)))

# Build transition probabilities
for i, team1 in enumerate(team_list):
    total_transitions = sum(G[team1][team2]['weight'] for team2 in G.successors(team1))
    if total_transitions == 0:
        continue  # no outgoing edges: the row stays all zeros
    for j, team2 in enumerate(team_list):
        if G.has_edge(team1, team2):
            transition_matrix[i, j] = G[team1][team2]['weight'] / total_transitions

# Convert to DataFrame (rows = current team, columns = next team)
transition_df = pd.DataFrame(transition_matrix, index=team_list, columns=team_list)

# Function to predict the next team
def predict_next_team(current_team):
    """Return the name of the most likely next team after `current_team`.

    Parameters
    ----------
    current_team : constructor id, as used for the nodes of G.

    Returns
    -------
    str
        The name of the team with the highest transition probability,
        "Unknown Team" if that id has no entry in `team_names`, or
        "No transition data available" when `current_team` is not in the
        matrix or has no outgoing transitions.
    """
    if current_team not in transition_df.index:
        return "No transition data available"
    probabilities = transition_df.loc[current_team]
    # An all-zero row has no maximum worth reporting: idxmax would silently
    # return the first team in the list.
    if probabilities.sum() == 0:
        return "No transition data available"
    next_team = probabilities.idxmax()
    return team_names.get(next_team, "Unknown Team")

# Example Prediction: Predict the next team for a driver
driver_name = "Jarno"  # Forename, surname or full name, e.g. "Hamilton", "Max Verstappen"

# Find the driver from their name (case-insensitive match on forename, surname or both)
name_query = driver_name.strip().lower()
forenames = drivers["forename"].str.lower()
surnames = drivers["surname"].str.lower()
name_matches = (forenames == name_query) | (surnames == name_query) | ((forenames + " " + surnames) == name_query)
driver_row = drivers[name_matches]
if not driver_row.empty:
    if len(driver_row) > 1:
        # Several drivers share the name; the first match is used, as before
        first_match = driver_row.iloc[0]
        print(f"Note: {len(driver_row)} drivers match '{driver_name}'; using {first_match['forename']} {first_match['surname']}.")
    driver_id = driver_row.iloc[0]["driverId"]
    driver_team_history = driver_teams[driver_teams["driverId"] == driver_id]["constructorId"].tolist()

    if driver_team_history:
        # driver_teams is ordered by year within a driver, so the last entry is the latest team
        last_team = driver_team_history[-1]
        predicted_team = predict_next_team(last_team)
        print(f"Predicted next team for {driver_name}: {predicted_team}")
    else:
        print(f"No team history available for {driver_name}")
else:
    print(f"Driver '{driver_name}' not found in the dataset.")


In [None]:
# team_names = dict(zip(constructors["constructorId"], constructors["name"]))
# transition_df = transition_df.rename(index=team_names, columns=team_names)
# print(transition_df)