In [None]:
## Cell 1: Importing necessary libraries




In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests

In [None]:
## Cell 2: Loading the dataset

In [2]:
# Load the Formula 1 dataset from Kaggle
# NOTE(review): this cell currently contains no code, yet every later cell
# reads a DataFrame named `df`. The recorded output below is a
# FileNotFoundError whose "path" is a Kaggle CLI command string, which
# suggests the command was passed where a file path was expected.
# TODO: download the dataset first, then read the extracted CSV into `df`.


FileNotFoundError: [Errno 2] No such file or directory: 'kaggle datasets download -d kshitij9/f1-2022-dataset'

In [None]:
## Cell 3: Data cleaning and preprocessing

In [None]:
# Convert date columns to datetime format
df['date'] = pd.to_datetime(df['date'])

# Handle missing values: fill gaps in numeric columns with that column's mean.
# numeric_only=True keeps the mean from being computed on the datetime 'date'
# column, and on text columns (presumably 'driver', 'team', 'circuit' -- TODO
# confirm dtypes), where recent pandas raises TypeError. Non-numeric columns
# are left untouched.
df = df.fillna(df.mean(numeric_only=True))

# Remove duplicates (rebinding instead of inplace=True keeps the cell chainable)
df = df.drop_duplicates()

In [None]:
## Cell 4: Driver performance analysis

In [None]:
# Average finishing position per driver per year.
# 'year' must be a grouping key: the plot below uses it as the x-axis, and
# grouping by 'driver' alone leaves no 'year' column in the result.
driver_performance = (
    df.groupby(['driver', 'year'])['position']
    .mean()
    .reset_index()
)

# Plot driver performance over time, one line per driver
plt.figure(figsize=(10, 6))
sns.lineplot(x='year', y='position', hue='driver', data=driver_performance)
plt.title('Driver Performance Over Time')
plt.xlabel('Year')
plt.ylabel('Average Finishing Position')
plt.legend(title='Driver')
plt.show()

In [None]:
## Cell 5: Team dominance analysis

In [None]:
# Total points per team per year.
# 'year' must be a grouping key: the plot below uses it as the x-axis, and
# grouping by 'team' alone leaves no 'year' column in the result.
team_points = (
    df.groupby(['team', 'year'])['points']
    .sum()
    .reset_index()
)

# Plot team points over time, one line per team
plt.figure(figsize=(10, 6))
sns.lineplot(x='year', y='points', hue='team', data=team_points)
plt.title('Team Dominance Over Time')
plt.xlabel('Year')
plt.ylabel('Total Points')
plt.legend(title='Team')
plt.show()

In [None]:
## Cell 6: Lap time improvements analysis

In [None]:
# Average lap time per circuit per year.
# 'year' must be a grouping key: the plot below uses it as the x-axis, and
# grouping by 'circuit' alone leaves no 'year' column in the result.
# NOTE(review): assumes 'lap_time' is numeric (e.g. seconds) -- TODO confirm.
lap_times = (
    df.groupby(['circuit', 'year'])['lap_time']
    .mean()
    .reset_index()
)

# Plot lap time improvements over time, one line per circuit
plt.figure(figsize=(10, 6))
sns.lineplot(x='year', y='lap_time', hue='circuit', data=lap_times)
plt.title('Lap Time Improvements Over Time')
plt.xlabel('Year')
plt.ylabel('Average Lap Time')
plt.legend(title='Circuit')
plt.show()

In [None]:
## Cell 7: Starting position impact analysis

In [None]:
# Mean finishing position for each starting position
starting_position = (
    df.groupby('starting_position')['position']
    .mean()
    .reset_index()
)

# One point per starting position, placed at its mean finishing position
plt.figure(figsize=(10, 6))
start_vs_finish_ax = sns.scatterplot(
    data=starting_position,
    x='starting_position',
    y='position',
)
start_vs_finish_ax.set_title('Starting Position Impact on Race Outcome')
start_vs_finish_ax.set_xlabel('Starting Position')
start_vs_finish_ax.set_ylabel('Average Finishing Position')
plt.show()

In [None]:
## Cell 8: Safety and crashes analysis

In [None]:
# Sum the 'crash' column within each year
crashes = (
    df.groupby('year')['crash']
    .sum()
    .reset_index()
)

# Yearly crash totals as a line
plt.figure(figsize=(10, 6))
crash_ax = sns.lineplot(data=crashes, x='year', y='crash')
crash_ax.set_title('Crashes Over Time')
crash_ax.set_xlabel('Year')
crash_ax.set_ylabel('Number of Crashes')
plt.show()

In [None]:
## Cell 9: Correlation analysis


In [None]:
# Pairwise correlation of the two position columns
position_columns = ['starting_position', 'position']
corr_matrix = df.loc[:, position_columns].corr()

# Show the 2x2 correlation table
print(corr_matrix)

In [None]:
## Cell 10: Visualization - Heatmap

In [None]:
# Create a heatmap to show which circuits favor certain drivers.
# The pivot holds the mean finishing position (pivot_table's default
# aggregation) for each circuit/driver pair. It is plotted directly: calling
# .corr() on it would produce a driver-by-driver matrix, which contradicts the
# Circuit/Driver axis labels below.
circuit_driver_position = df.pivot_table(index='circuit', columns='driver', values='position')

sns.set(font_scale=1.5)  # NOTE: global seaborn setting; affects later plots too
plt.figure(figsize=(12, 10))
sns.heatmap(circuit_driver_position, annot=True, fmt='.1f', cmap='coolwarm')
plt.title('Average Finishing Position by Circuit and Driver')
plt.xlabel('Driver')
plt.ylabel('Circuit')
plt.show()

In [None]:
## Cell 11: Visualization - Bar Chart

In [None]:
# Bars of points per year, split by team (seaborn aggregates rows per bar)
plt.figure(figsize=(12, 8))
team_bars_ax = sns.barplot(data=df, x='year', y='points', hue='team')
team_bars_ax.set_title('Team Performance Over Time')
team_bars_ax.set_xlabel('Year')
team_bars_ax.set_ylabel('Points')
team_bars_ax.legend(title='Team')
plt.show()

In [None]:
## Cell 12: Visualization - Scatter Plot

In [None]:
# Raw rows: starting position on x, finishing position on y
plt.figure(figsize=(10, 6))
grid_vs_result_ax = sns.scatterplot(data=df, x='starting_position', y='position')
grid_vs_result_ax.set_title('Starting Position vs Finishing Position')
grid_vs_result_ax.set_xlabel('Starting Position')
grid_vs_result_ax.set_ylabel('Finishing Position')
plt.show()

In [None]:
## Cell 13: Visualization - Line Chart

In [None]:
# One line per driver: finishing position against year, aggregated by seaborn
plt.figure(figsize=(12, 8))
driver_lines_ax = sns.lineplot(data=df, x='year', y='position', hue='driver')
driver_lines_ax.set_title('Driver Performance Over Time')
driver_lines_ax.set_xlabel('Year')
driver_lines_ax.set_ylabel('Average Finishing Position')
driver_lines_ax.legend(title='Driver')
plt.show()

In [None]:
## Cell 14: Data Preparation for Machine Learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every numeric/boolean column except the target.
# StandardScaler cannot fit the datetime 'date' column (converted in the
# cleaning cell) or text columns, so they are excluded here.
# NOTE(review): this discards categorical information such as driver/team;
# encode those columns explicitly if the models should use them.
X = df.drop(columns=['position']).select_dtypes(include=['number', 'bool'])
y = df['position']

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data using StandardScaler; statistics come from the training split
# only, so nothing about the test split leaks into the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
## Cell 15: Machine Learning Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit ordinary least squares on the scaled training features
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test_scaled)

# Report the mean squared error on the test split
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse:.2f}')

In [None]:
## Cell 16: Hyperparameter Tuning

In [None]:
# Perform hyperparameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV
# SVC must be imported here: on a fresh kernel this cell runs before the later
# model-comparison cell that imports it, which would raise NameError.
from sklearn.svm import SVC

# Search over regularisation strength C and kernel type with 5-fold CV.
# NOTE(review): SVC is a classifier, so y_train must hold discrete labels;
# verify 'position' has no fractional values left by mean imputation.
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and the corresponding score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_:.2f}')

In [None]:
## Cell 17: Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the tuned estimator on the held-out split
y_pred = grid_search.best_estimator_.predict(X_test_scaled)

tuned_accuracy = accuracy_score(y_test, y_pred)
tuned_report = classification_report(y_test, y_pred)
tuned_confusion = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {tuned_accuracy:.2f}')
print(f'Classification Report:\n{tuned_report}')
print(f'Confusion Matrix:\n{tuned_confusion}')

In [None]:
## Cell 18: Model Deployment

In [None]:
import pickle

# Serialise the tuned estimator to a local pickle file
with open('f1_model.pkl', 'wb') as model_file:
    pickle.dump(grid_search.best_estimator_, model_file)

# Read the estimator back from that same file
with open('f1_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict with the round-tripped estimator
y_pred_loaded = loaded_model.predict(X_test_scaled)

In [None]:
## Cell 19: Model Interpretation

In [None]:
# Get the feature importance of the tuned SVC.
# SVC exposes coef_ only when the selected kernel is 'linear'; for 'rbf' or
# 'poly' the attribute is unavailable, so guard instead of crashing.
best_svc = grid_search.best_estimator_
if hasattr(best_svc, 'coef_'):
    # coef_ is 2-D (one row per pair of classes, one column per feature).
    # Collapse to one score per feature via the mean absolute weight.
    feature_importance = np.abs(best_svc.coef_).mean(axis=0)
    print(f'Feature Importance:\n{feature_importance}')

    # Plot against the model's feature names (X.columns), not df.columns:
    # df still contains the target and so has the wrong length.
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(X.columns), y=feature_importance)
    plt.title('Feature Importance')
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.xticks(rotation=45, ha='right')
    plt.show()
else:
    feature_importance = None
    print(f"Feature importance is unavailable: kernel '{best_svc.kernel}' has no coef_ (linear kernel only).")

In [None]:
## Cell 20: Model Comparison

In [None]:
# Compare the performance of different models on the same train/test split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# The random forest is seeded (same seed as the train/test split) so the
# comparison, and the "best model" picked from it, is reproducible.
models = [RandomForestClassifier(random_state=42), SVC(), LogisticRegression()]
model_names = ['Random Forest', 'SVM', 'Logistic Regression']

# The loop variable is deliberately not called `model`, so the regression
# model fitted in the earlier cell is not silently overwritten.
for candidate, name in zip(models, model_names):
    candidate.fit(X_train_scaled, y_train)
    y_pred = candidate.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name}: {accuracy:.2f}')

In [None]:
## Cell 21: Model Selection

In [None]:
# Score every fitted candidate on the test split and keep the top scorer
# (np.argmax returns the first index on ties)
test_accuracies = [
    accuracy_score(y_test, fitted.predict(X_test_scaled))
    for fitted in models
]
best_model = models[np.argmax(test_accuracies)]
print(f'Best Model: {type(best_model).__name__}')

In [None]:
## Cell 22: Model Deployment to Cloud

In [None]:
import joblib

# Persist the selected model to a local joblib file
# (this cell performs no upload to any remote service)
joblib.dump(value=best_model, filename='f1_model.joblib')

# Read the model back from that file
loaded_model = joblib.load(filename='f1_model.joblib')

In [None]:
## Cell 23: Model Monitoring

In [None]:
from sklearn.metrics import accuracy_score

# Re-score the reloaded model on the original scaled test split
y_pred = loaded_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

In [None]:
## Cell 24: Model Maintenance

In [None]:
from sklearn.model_selection import train_test_split

# Read the new batch and separate the 'target' column from the features
new_data = pd.read_csv('new_data.csv')
y_new = new_data['target']
X_new = new_data.drop(columns=['target'])

# 80/20 split with the same fixed seed used elsewhere in the notebook
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new,
    y_new,
    test_size=0.2,
    random_state=42,
)

# Refit the reloaded model on the new training split
loaded_model.fit(X_train_new, y_train_new)

In [None]:
## Cell 25: Model Retraining

In [None]:
# Refit on the new training split, then predict its held-out rows
# (fit returns the estimator itself, so the two calls chain)
y_pred_new = loaded_model.fit(X_train_new, y_train_new).predict(X_test_new)

# Accuracy of the refitted model on the new test split
accuracy_new = accuracy_score(y_true=y_test_new, y_pred=y_pred_new)
print(f'Retrained Model Accuracy: {accuracy_new:.2f}')

In [None]:
## Cell 26: Model Comparison with Baseline

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent training label
baseline_model = DummyClassifier(strategy='most_frequent').fit(X_train_new, y_train_new)
y_pred_baseline = baseline_model.predict(X_test_new)
accuracy_baseline = accuracy_score(y_true=y_test_new, y_pred=y_pred_baseline)
print(f'Baseline Model Accuracy: {accuracy_baseline:.2f}')

# Accuracy gain of the retrained model over the baseline
print(f'Retrained Model vs Baseline Model: {accuracy_new - accuracy_baseline:.2f}')

In [None]:
## Cell 27: Model Deployment to Production

In [None]:
import joblib

# Persist the retrained model to a local joblib file
joblib.dump(value=loaded_model, filename='f1_model_production.joblib')

# Read it back into a separate variable
loaded_model_production = joblib.load(filename='f1_model_production.joblib')

In [None]:
## Cell 28: Model Monitoring in Production

In [None]:
from sklearn.metrics import accuracy_score

# Score the reloaded model on the new-data test split
y_pred_production = loaded_model_production.predict(X_test_new)
accuracy_production = accuracy_score(y_true=y_test_new, y_pred=y_pred_production)
print(f'Model Accuracy in Production: {accuracy_production:.2f}')

In [None]:
## Cell 29: Model Maintenance in Production

In [None]:
from sklearn.model_selection import train_test_split

# Read the next batch and separate the 'target' column from the features
new_data_production = pd.read_csv('new_data_production.csv')
y_new_production = new_data_production['target']
X_new_production = new_data_production.drop(columns=['target'])

# 80/20 split with the notebook's fixed seed
(
    X_train_new_production,
    X_test_new_production,
    y_train_new_production,
    y_test_new_production,
) = train_test_split(
    X_new_production,
    y_new_production,
    test_size=0.2,
    random_state=42,
)

# Refit the reloaded model on this batch's training split
loaded_model_production.fit(X_train_new_production, y_train_new_production)

In [None]:
## Cell 30: Model Retraining in Production

In [None]:
# Refit on the batch's training split, then predict its held-out rows
# (fit returns the estimator itself, so the two calls chain)
y_pred_new_production = loaded_model_production.fit(
    X_train_new_production, y_train_new_production
).predict(X_test_new_production)

# Accuracy of the refitted model on this batch's test split
accuracy_new_production = accuracy_score(
    y_true=y_test_new_production,
    y_pred=y_pred_new_production,
)
print(f'Retrained Model Accuracy in Production: {accuracy_new_production:.2f}')

In [None]:
## Cell 31: Model Comparison with Baseline in Production

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent training label
baseline_model_production = DummyClassifier(strategy='most_frequent').fit(
    X_train_new_production, y_train_new_production
)
y_pred_baseline_production = baseline_model_production.predict(X_test_new_production)
accuracy_baseline_production = accuracy_score(
    y_true=y_test_new_production,
    y_pred=y_pred_baseline_production,
)
print(f'Baseline Model Accuracy in Production: {accuracy_baseline_production:.2f}')

# Accuracy gain of the retrained model over the baseline
print(f'Retrained Model vs Baseline Model in Production: {accuracy_new_production - accuracy_baseline_production:.2f}')

In [None]:
## Cell 32: Model Deployment to Cloud in Production

In [None]:
import joblib

# Persist the retrained model to a local joblib file
# (this cell performs no upload to any remote service)
joblib.dump(value=loaded_model_production, filename='f1_model_production_cloud.joblib')

# Read it back into a separate variable
loaded_model_production_cloud = joblib.load(filename='f1_model_production_cloud.joblib')

In [None]:
## Cell 33: Model Monitoring in Cloud in Production

In [None]:
from sklearn.metrics import accuracy_score

# Score the reloaded model on the production batch's test split
y_pred_production_cloud = loaded_model_production_cloud.predict(X_test_new_production)
accuracy_production_cloud = accuracy_score(
    y_true=y_test_new_production,
    y_pred=y_pred_production_cloud,
)
print(f'Model Accuracy in Cloud in Production: {accuracy_production_cloud:.2f}')

In [None]:
## Cell 34: Model Maintenance in Cloud in Production

In [None]:
from sklearn.model_selection import train_test_split

# Read the next batch and separate the 'target' column from the features
new_data_production_cloud = pd.read_csv('new_data_production_cloud.csv')
y_new_production_cloud = new_data_production_cloud['target']
X_new_production_cloud = new_data_production_cloud.drop(columns=['target'])

# 80/20 split with the notebook's fixed seed
(
    X_train_new_production_cloud,
    X_test_new_production_cloud,
    y_train_new_production_cloud,
    y_test_new_production_cloud,
) = train_test_split(
    X_new_production_cloud,
    y_new_production_cloud,
    test_size=0.2,
    random_state=42,
)

# Refit the reloaded model on this batch's training split
loaded_model_production_cloud.fit(X_train_new_production_cloud, y_train_new_production_cloud)

In [None]:
## Cell 35: Model Retraining in Cloud in Production

In [None]:
# Refit on the batch's training split, then predict its held-out rows
# (fit returns the estimator itself, so the two calls chain)
y_pred_new_production_cloud = loaded_model_production_cloud.fit(
    X_train_new_production_cloud, y_train_new_production_cloud
).predict(X_test_new_production_cloud)

# Accuracy of the refitted model on this batch's test split
accuracy_new_production_cloud = accuracy_score(
    y_true=y_test_new_production_cloud,
    y_pred=y_pred_new_production_cloud,
)
print(f'Retrained Model Accuracy in Cloud in Production: {accuracy_new_production_cloud:.2f}')

In [None]:
## Cell 36: Model Comparison with Baseline in Cloud in Production

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent training label
baseline_model_production_cloud = DummyClassifier(strategy='most_frequent').fit(
    X_train_new_production_cloud, y_train_new_production_cloud
)
y_pred_baseline_production_cloud = baseline_model_production_cloud.predict(X_test_new_production_cloud)
accuracy_baseline_production_cloud = accuracy_score(
    y_true=y_test_new_production_cloud,
    y_pred=y_pred_baseline_production_cloud,
)
print(f'Baseline Model Accuracy in Cloud in Production: {accuracy_baseline_production_cloud:.2f}')

# Accuracy gain of the retrained model over the baseline
print(f'Retrained Model vs Baseline Model in Cloud in Production: {accuracy_new_production_cloud - accuracy_baseline_production_cloud:.2f}')

In [None]:
## Cell 37: Model Deployment to Edge in Production

In [None]:
import joblib

# Persist the retrained model to a local joblib file
# (this cell performs no transfer to any other device)
joblib.dump(value=loaded_model_production_cloud, filename='f1_model_production_edge.joblib')

# Read it back into a separate variable
loaded_model_production_edge = joblib.load(filename='f1_model_production_edge.joblib')

In [None]:
## Cell 38: Model Monitoring in Edge in Production

In [None]:
# Monitor the performance of the deployed model in edge in production
from sklearn.metrics import accuracy_score
y_pred_production_edge = loaded_model