# Assignment 7

## Submit as an HTML file

### Print your name below

In [None]:
# Identify the author of this submission.
author_name = "Derry Li"
print(author_name)

### Import the "pandas", "numpy", and "statsmodels.formula.api" libraries

In [9]:
# Write your answer here:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
from datetime import date, time, datetime
import statsmodels.formula.api as smf


#### In the code chunk below read the CSV file named `results.csv` in the `data` <br> folder and print the first 5 rows of the dataset. Browse the dataset.

In [10]:
# Load the race results from the local data folder.
df = pd.read_csv('data/results.csv')

# Preview the first 5 rows, as the prompt asks; a bare last expression is
# rendered as a table in the notebook (and in the exported HTML).
df.head()

### (a)  Check Column Types and Data Cleaning

- Use the `.dtypes` attribute to get the column types
- Identify which columns have data types that might need conversion
- The 'milliseconds' column contains string values that should be numeric. Create a new column called 'race_time_ms' that:
    - Converts the column to a numeric data type
    - Replaces any non-numeric values with NaN

In [13]:

# Inspect the column types to identify columns that were read as text and
# may need conversion.
print(df.dtypes)

# Look at the distinct raw values of 'milliseconds' before converting.
list_unique = pd.unique(df["milliseconds"])
print(list_unique)

# This cell treats the literal string "\N" as the missing-value marker and
# maps it to NaN explicitly.
list_old = ["\\N"]
list_new = [np.nan]

df['race_time_ms'] = df['milliseconds'].replace(list_old, list_new)

# errors='coerce' converts any remaining non-numeric value to NaN instead of
# raising, so the new column ends up fully numeric.
df['race_time_ms'] = pd.to_numeric(df['race_time_ms'], errors='coerce')





['5690616' '5696094' '5698779' ... '5349182' '5349812' '5355285']


### (b) Create Categorical Variables

- Create a new column called 'finish_category' that categorizes the race finish positions as follows:
    - Positions 1-3: 'Podium'
    - Positions 4-10: 'Points'
    - Positions 11-20: 'Midfield'
    - Positions >20: 'Backmarker'

Hint: Use the pd.cut() function

In [14]:
# Bucket finishing positions into named tiers. pd.cut closes intervals on the
# right by default, so the bins are (0, 3], (3, 10], (10, 20], (20, inf].
bins = [0, 3, 10, 20, np.inf]
labels = ['Podium', 'Points', 'Midfield', 'Backmarker']

df['finish_category'] = pd.cut(df['positionOrder'], bins=bins, labels=labels)




### (c) Calculate Race Duration
- For rows where 'milliseconds' is available, create a new column <br>
'race_duration_minutes' that converts milliseconds to minutes by dividing <br>
by (1000*60).
- Display the average race duration by 'constructorId' for the top 5 <br>
constructors with the shortest average race times

In [16]:
# Convert milliseconds to minutes; rows with a missing race time stay NaN.
df['race_duration_minutes'] = df['race_time_ms'].div(1000 * 60)

# Mean duration per constructor, sorted ascending (NaN means sort last).
times = (
    df.groupby('constructorId')['race_duration_minutes']
    .mean()
    .sort_values()
)

# The five constructors with the shortest average race duration.
fastest = times.head(5)
print(fastest)

constructorId
35    76.710777
29    77.604125
41    87.046767
16    89.428828
53    89.658852
Name: race_duration_minutes, dtype: float64


### (d) Driver Performance Analysis

- Calculate the following statistics for each driver, grouped by 'driverId':
    - Average finishing position
    - Total points
    - Number of races completed
    - Best finishing position

- Sort the results by total points in descending order
- Display the top 10 drivers based on total points

In [18]:
# Make sure the columns being aggregated are numeric; anything that cannot be
# parsed becomes NaN and is skipped by the aggregations below.
for column in ('positionOrder', 'points'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# One row per driver with the requested summary statistics.
# NOTE(review): races_completed counts every non-null resultId for the driver;
# confirm whether "completed" should exclude races the driver did not finish.
driver_stats = df.groupby('driverId').agg(
    avg_finish=pd.NamedAgg(column='positionOrder', aggfunc='mean'),
    best_finish=pd.NamedAgg(column='positionOrder', aggfunc='min'),
    total_points=pd.NamedAgg(column='points', aggfunc='sum'),
    races_completed=pd.NamedAgg(column='resultId', aggfunc='count'),
)

# Rank drivers by career points and keep the ten highest.
top_10_drivers = driver_stats.sort_values(by='total_points', ascending=False).head(10)

print(top_10_drivers)

          avg_finish  best_finish  total_points  races_completed
driverId                                                        
1           4.787097            1        4396.5              310
20          7.093333            1        3098.0              300
4           8.494413            1        2061.0              358
830         6.533742            1        1983.5              163
8           8.491477            1        1873.0              352
822         7.601990            1        1778.0              201
3           8.252427            1        1594.5              206
30          6.879870            1        1566.0              308
817         9.883621            1        1307.0              232
18          9.695793            1        1235.0              309


### (e) Linear Regression
Create a linear regression model that predicts 'points' based on 'grid' (starting position) and 'laps' completed <br>
Use the following steps:

- Clean the data to remove any non-numeric values and missing values
- Create the regression formula using smf.ols 
- Display the summary of the regression model using model.summary()

What are the predicted points for a driver who starts in position 3 and completes 55 laps?

Hint: Use `.dropna()` to remove missing values from the points, grid, and laps <br>
variables.

In [22]:
# Build the modelling frame from the three model columns only.
# errors='coerce' turns any non-numeric entry into NaN (a plain
# pd.to_numeric call would raise on such a value), and a single dropna()
# then removes both the original and the newly created missing values.
regression_data = (
    df[['points', 'grid', 'laps']]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Ordinary least squares: points explained by grid position and laps.
model = smf.ols(formula='points ~ grid + laps', data=regression_data).fit()

print(model.summary())

# Predicted points for a driver starting 3rd on the grid and completing 55 laps.
data = pd.DataFrame({'grid': [3], 'laps': [55]})
points = model.predict(data)

print(points.iloc[0])

                            OLS Regression Results                            
Dep. Variable:                 points   R-squared:                       0.215
Model:                            OLS   Adj. R-squared:                  0.215
Method:                 Least Squares   F-statistic:                     3530.
Date:                Mon, 24 Mar 2025   Prob (F-statistic):               0.00
Time:                        14:52:04   Log-Likelihood:                -70440.
No. Observations:               25840   AIC:                         1.409e+05
Df Residuals:                   25837   BIC:                         1.409e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.5841      0.054     48.267      0.0