# 9. Model Interpretility 

In [1]:
# Basic Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

import joblib

# If you get an ImportError, you may need to install these packages using pip.
try:
    # Import shap, lime
    import shap
    import lime
    import lime.lime_tabular
except ImportError as e:
    print(e)
    print("You need to install the missing modules using pip (e.g., 'pip install shap lime xgboost')")


In [2]:
pip install -U kaleido

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load DataFrame
file = 'C:/Users/Alex/OneDrive/BrainStation/Data_Science_Bootcamp/Capstone_Project/capstone-Aboard89/Notebooks/model_data_v2.csv'
df = pd.read_csv(file)

In [4]:
df.head()

Unnamed: 0,year,age,years_in_f1,races_with_each_team_since_1995,F2_champion,Former_F1_World_Champion,home_race,starting_grid_position,points_in_previous_race,laps_in_previous_race,...,constructorId_3,constructorId_30,constructorId_31,constructorId_4,constructorId_5,constructorId_51,constructorId_6,constructorId_7,constructorId_8,constructorId_9
0,1995,29,4,1,0,0,0,17,1.0,70,...,False,False,False,False,False,False,False,False,False,False
1,1995,26,4,1,0,0,0,5,3.0,70,...,False,False,False,False,False,False,False,False,False,False
2,1995,24,1,1,0,0,0,1,6.0,71,...,True,False,False,False,False,False,False,False,False,False
3,1995,34,3,1,0,1,0,2,0.0,30,...,True,False,False,False,False,False,False,False,False,False
4,1995,30,6,1,0,0,0,6,2.0,70,...,False,False,False,False,False,False,True,False,False,False


### 2. Load Random Forest Model

In [5]:
import joblib  # or you can use: import pickle

# Load the Random Forest model
model = joblib.load('C:/Users/Alex/OneDrive/BrainStation/Data_Science_Bootcamp/Capstone_Project/capstone-Aboard89/Notebooks/random_forest_grid_search.pkl')
# If you prefer pickle, use: model = pickle.load(open('/mnt/data/random_forest_grid_search.pkl', 'rb'))


### 3. Extract Feature Importance from Model

In [6]:
# Assuming 'grid_search' is your GridSearchCV object and it has been fitted:
best_model = model.best_estimator_

In [7]:
model.best_estimator_.named_steps['scl'].get_feature_names_out()

array(['year', 'age', 'years_in_f1', 'races_with_each_team_since_1995',
       'F2_champion', 'Former_F1_World_Champion', 'home_race',
       'starting_grid_position', 'points_in_previous_race',
       'laps_in_previous_race', 'constructorId_points_at_stage_of_season',
       'driver_points_at_stage_of_season',
       'race_70th Anniversary Grand Prix', 'race_Abu Dhabi Grand Prix',
       'race_Argentine Grand Prix', 'race_Australian Grand Prix',
       'race_Austrian Grand Prix', 'race_Azerbaijan Grand Prix',
       'race_Bahrain Grand Prix', 'race_Belgian Grand Prix',
       'race_Brazilian Grand Prix', 'race_British Grand Prix',
       'race_Canadian Grand Prix', 'race_Chinese Grand Prix',
       'race_Dutch Grand Prix', 'race_Eifel Grand Prix',
       'race_Emilia Romagna Grand Prix', 'race_European Grand Prix',
       'race_French Grand Prix', 'race_German Grand Prix',
       'race_Hungarian Grand Prix', 'race_Indian Grand Prix',
       'race_Italian Grand Prix', 'race_Japanese Gr

In [8]:
model.best_estimator_.named_steps["clf"].feature_importances_

array([2.87836988e-02, 1.78213832e-02, 3.23843307e-02, 3.67594797e-02,
       6.28940504e-03, 3.23629915e-02, 1.13718275e-03, 2.19269284e-01,
       7.81352986e-02, 1.42111440e-02, 7.51876591e-02, 8.25359352e-02,
       8.39753885e-05, 9.70689804e-04, 1.07573486e-04, 9.99402166e-04,
       8.56895482e-04, 2.98418661e-04, 6.85654967e-04, 1.62142659e-03,
       2.98761241e-03, 3.57895305e-03, 1.54899017e-03, 1.38254858e-03,
       9.95410290e-05, 7.70883529e-05, 1.29589880e-04, 1.34676325e-03,
       9.28641746e-04, 2.04295439e-03, 3.49223345e-03, 1.61530431e-04,
       2.98637700e-03, 2.12937118e-03, 1.88067082e-04, 1.00268700e-04,
       8.76558503e-04, 3.96408500e-04, 1.88420947e-04, 9.60938721e-05,
       1.90518567e-03, 6.13583078e-05, 2.08488235e-04, 3.92583235e-05,
       5.93505262e-04, 6.27175699e-05, 5.48906896e-04, 2.19939559e-04,
       1.24997376e-03, 1.89446303e-03, 4.39128276e-05, 9.64772282e-05,
       8.47427093e-04, 2.38148698e-05, 1.90861767e-03, 6.83085235e-10,
      

In [9]:
# 'feature_names' is the array with the feature names
#  'importances' is the array with the feature importances

feature_names = model.best_estimator_.named_steps['scl'].get_feature_names_out()  
importances = model.best_estimator_.named_steps["clf"].feature_importances_  

# Combine into a DataFrame
features_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})


In [10]:
features_df.head()

Unnamed: 0,Feature,Importance
0,year,0.028784
1,age,0.017821
2,years_in_f1,0.032384
3,races_with_each_team_since_1995,0.036759
4,F2_champion,0.006289


In [11]:
# Extract the 10 most important features
top_10_positive = features_df.nlargest(10, 'Importance')

# Extract the 10 least important features (which you may be referring to as 'negative')
top_10_negative = features_df.nsmallest(10, 'Importance')

In [12]:
import plotly.express as px

# Assuming 'top_10_positive' is your DataFrame with the correct columns 'Feature' and 'Importance'
color_discrete_map = {'starting_grid_position': 'red'}
for feature in top_10_positive['Feature']:
    if feature != 'starting_grid_position':
        color_discrete_map[feature] = 'blue'

# To create a horizontal bar chart, swap the x and y parameters.
fig_positive = px.bar(
    top_10_positive,
    y='Feature',  # Features will be on the y-axis
    x='Importance',  # Importances will be on the x-axis
    title='Starting Position - Most Important Feature for Predicting Wins',
    color='Feature',  # This assigns colors based on the feature names
    color_discrete_map=color_discrete_map,  # Apply the color map
    orientation='h'  # This specifies that the bars should be horizontal
)

# For a horizontal bar chart, you might want to adjust the y-axis labels instead.
fig_positive.update_layout(yaxis_tickangle=0, showlegend=False)

# Save the figure as an image with 300 dpi
fig_positive.write_image("feature_importance.png", width=800, height=600, scale=3)  # Adjust width, height, and scale as needed

# Show the figure
fig_positive.show()


This chart showcases the top 10 most influential factors as determined by the fine-tuned Random Forest SMOTE model (from notebook 8) for predicting the winner of an F1 race. 

1.**- Grid position is the most important factor for predicting race winners**
- The `driver_starting_grid_position` emerges as the most significant predictor, suggesting that the pole positions at the start of a race have a substantial impact on the final outcome. 

2.**- Historial performance is a key predictor in future performance**
Other important features include historical performance metrics such as 
- `driver_points_at_stage_of_season` - reflects a driver's cumulative success and consistency over the season, often translating into confidence and strategic advantage in subsequent races.

- `points_in_previous_race` - can signify a driver's or team's current competitive form, momentum, and adaptability to recent track conditions or regulations
- `constructorId_points_at_stage_of_season` - represents the team's performance, capturing the combined efforts of drivers, the quality of the car, and the team, in that current season.  

3.**- Experience is important in F1**
- `years_in_f1` -  
This is a crucial factor as it encapsulates a driver's tenure and the wealth of experience gained in navigating varied race situations, which can be pivotal in making split-second decisions that lead to victories.
- `races_with_each_team_since_1995` *(this is basically how many races a person has had with a team. The counting starts in 1995)* 
This is indicative of a driver's familiarity and synergy with their team's culture, machinery, and strategy. The more races a driver has under their belt with a particular team, the better they can work together to optimize performance and achieve successful outcomes on race day. It's this blend of individual expertise and team harmony, honed over years and races, that often drives a team to the winner's circle.

4. **The historical success of teams** (`constructorId_131` - Mercedes and `constructorId_6` - Ferrari)
The performance of a Formula 1 team is deeply intertwined with its resources, engineering prowess, and historical pedigree, all of which are often encapsulated in the identity of top teams like Mercedes (constructorId_131) and Ferrari (constructorId_6). These identifiers in the dataset are proxies for the technological advantage and strategic expertise these teams bring to the race. Mercedes and Ferrari have a legacy of innovation and a track record of securing championships, making their associated features significant in predicting race wins. A team's constructorId can signal the likelihood of success based on past performance trends, financial backing for development, and the ability to attract top talent, all of which are critical components in the complex equation of winning races.

### Local model-agnostic methods

try and find out what rows were predicted wrong

1. Shapley Values

In [16]:
# Load Test Data from Notebook : 8.F1_Models_v3.ipynb

file = 'C:/Users/Alex/OneDrive/BrainStation/Data_Science_Bootcamp/Capstone_Project/capstone-Aboard89/Notebooks/test_data_v2.csv'
df_test = pd.read_csv(file)

In [17]:
df_test.head()

Unnamed: 0,year,age,years_in_f1,races_with_each_team_since_1995,F2_champion,Former_F1_World_Champion,home_race,starting_grid_position,points_in_previous_race,laps_in_previous_race,...,constructorId_3,constructorId_30,constructorId_31,constructorId_4,constructorId_5,constructorId_51,constructorId_6,constructorId_7,constructorId_8,constructorId_9
0,1998,29,7,36,0,1,0,2,4.0,72,...,False,False,False,False,False,False,True,False,False,False
1,2020,33,13,106,0,1,0,10,8.0,69,...,False,False,False,False,False,False,True,False,False,False
2,2009,35,12,88,0,0,0,2,0.0,61,...,False,False,False,False,False,False,False,True,False,False
3,2005,28,3,4,0,0,0,4,3.0,57,...,True,False,False,False,False,False,False,False,False,False
4,2014,24,3,10,0,0,0,10,0.0,51,...,False,False,False,False,False,False,False,False,False,False


In [18]:
%%time
# This calculation might take 2-3 minutes!

# Extract the RandomForestClassifier from the pipeline
rf_model = best_model.named_steps['clf']

# Now you can create a TreeExplainer with the extracted RandomForest model
explainer = shap.TreeExplainer(rf_model)

# If 'df_test' includes the target variable or additional columns, drop them
df_test = df_test.drop(columns=['race_win'])

# Transform df_test using the scaler from the pipeline
df_test_transformed = best_model.named_steps['scl'].transform(df_test)

# Now obtain SHAP values for the transformed test set
shap_values = explainer.shap_values(df_test_transformed)


CPU times: total: 4min 34s
Wall time: 6min 39s


In [19]:
for array in shap_values:
    print(array.shape)


(2185, 180)
(2185, 180)


The shape attribute of an array returns a tuple representing the dimensions of the array. In this case, shap_values array.shape is returning (2185, 180), which tells us the following:

There are 2185 instances (e.g., rows, observations, or samples) in the test dataset.
For each instance, there are 180 SHAP values corresponding to the features used in your model.

In [20]:
print([i for i in df_test.columns])
print([i for i in shap_values[0]])
print([i for i in shap_values[10]])

['year', 'age', 'years_in_f1', 'races_with_each_team_since_1995', 'F2_champion', 'Former_F1_World_Champion', 'home_race', 'starting_grid_position', 'points_in_previous_race', 'laps_in_previous_race', 'constructorId_points_at_stage_of_season', 'driver_points_at_stage_of_season', 'race_70th Anniversary Grand Prix', 'race_Abu Dhabi Grand Prix', 'race_Argentine Grand Prix', 'race_Australian Grand Prix', 'race_Austrian Grand Prix', 'race_Azerbaijan Grand Prix', 'race_Bahrain Grand Prix', 'race_Belgian Grand Prix', 'race_Brazilian Grand Prix', 'race_British Grand Prix', 'race_Canadian Grand Prix', 'race_Chinese Grand Prix', 'race_Dutch Grand Prix', 'race_Eifel Grand Prix', 'race_Emilia Romagna Grand Prix', 'race_European Grand Prix', 'race_French Grand Prix', 'race_German Grand Prix', 'race_Hungarian Grand Prix', 'race_Indian Grand Prix', 'race_Italian Grand Prix', 'race_Japanese Grand Prix', 'race_Korean Grand Prix', 'race_Luxembourg Grand Prix', 'race_Malaysian Grand Prix', 'race_Mexican G

IndexError: list index out of range

In [None]:
# Initiate Javascript for visualization
shap.initjs()

# Plot SHAP values for row 0 of the positive class
shap.force_plot(
    explainer.expected_value[1],  # Use the expected value for the positive class
    shap_values[1][0],            # Use the SHAP values for the positive class and first row
    features=df_test_transformed[0],  # Use the corresponding row from the transformed features
    feature_names=df_test.columns  # Use the feature names from the original test set
)


The image above shows the output of a SHAP (SHapley Additive exPlanations) force plot. SHAP is a tool used to explain the output of machine learning models by computing the contribution of each feature to the prediction for individual samples.

Here's a breakdown of what the visualization indicates:

1. **Base Value**: The plot starts with a base value, which is the average prediction for the dataset (or the expected value of the model's output). In this case, the base value is around 0.5003, which might represent the average probability of class 1 in our classification model.

2. **Output Value**: The value at the end of the plot, 0.67, is the actual prediction for the specific instance being explained. This prediction is higher than the base value, indicating that the combined effect of the features has increased the prediction.

3. **Feature Contributions**: Each feature's contribution to the shift from the base value to the output value is shown by colored arrows:
   - **Red Arrows**: Features that push the prediction higher (increase the probability of the positive class). The width of each arrow is proportional to the size of the feature's effect.
   - **Blue Arrows**: Features that push the prediction lower (decrease the probability of the positive class).

4. **Feature Values**: The labels on the arrows indicate the feature names and their values for the particular instance being explained. For example:
   - `Former_F1_World_Champion = 1.186`: Being a former F1 World Champion increases the prediction by this SHAP value.
   - `constructorId_6 = 1.696`: This driver drives for Ferarri, which also increases the prediction of winning.
   - `starting_grid_position = -0.7182`: A lower starting grid position increases the prediction of winning.

5. **Direction of Influence**: The plot is directional, with features that influence the prediction towards a higher probability on one side (red) and those that influence towards a lower probability on the other side (blue).

In summary, the SHAP force plot is a powerful tool for interpreting the predictions of complex models, giving insights into which features are most influential for a particular prediction.

### Global Interpretation

In [None]:
# Global interpretation - summary plot - type = 'dot'
shap.initjs()

shap.summary_plot(shap_values, features=df_test, feature_names=df_test.columns)

NameError: name 'shap' is not defined

In [None]:
shap.summary_plot(shap_values, features=df_test, feature_names=df_test.columns, show=False)
plt.savefig('shap_summary_plot.png', bbox_inches='tight')


NameError: name 'shap_values' is not defined

# Best Logistic Regression Coefficients

### 1. Load Logistic Regression Model

In [1]:
import joblib  # or you can use: import pickle

# Load the Random Forest model
model_log = joblib.load('C:/Users/Alex/OneDrive/BrainStation/Data_Science_Bootcamp/Capstone_Project/capstone-Aboard89/Notebooks/logistic_regression_grid_search.pkl')
# If you prefer pickle, use: model = pickle.load(open('/mnt/data/random_forest_grid_search.pkl', 'rb'))


In [3]:
model_log

In [6]:
model_log.best_estimator_.named_steps['scl'].get_feature_names_out() 

{'scl': StandardScaler(),
 'dim_reducer': PCA(),
 'clf': LogisticRegression(C=10, max_iter=1000, random_state=42)}

In [None]:
feature_names = model.best_estimator_.named_steps['scl'].get_feature_names_out()  
importances = model.best_estimator_.named_steps["clf"].c

In [4]:
# grid_search is GridSearchCV object
grid_search.fit(X_train, y_train)

# Get the best estimator
best_pipeline = grid_search.best_estimator_

# Access the logistic regression step of the pipeline
logistic_regression_step = best_pipeline.named_steps['logisticregression']

# Get the coefficients
coefficients = logistic_regression_step.coef_


NameError: name 'grid_search' is not defined

# 