### Case Study 3: Fuel Efficiency Versus Speed

**Objective:** Calculate the fuel efficiency in miles per gallon (MPG) or Liters per 100 km (L/100 km). Use regression to develop a model of how measured factors relate to the fuel efficiency.

Machine Learning for Engineers: [Automotive Monitoring](https://www.apmonitor.com/pds/index.php/Main/AutomotiveMonitoring)
- Description: Machine learning project with automotive data. Data includes travel distance, time, fuel rate, air flow, oxygen ratio, and other parameters available from an OBD2 interface.
- [Course Overview](https://apmonitor.com/pds)
- [Course Schedule](https://apmonitor.com/pds/index.php/Main/CourseSchedule)

<img width=400px align=left src='https://apmonitor.com/pds/uploads/Main/automotive_monitoring.png'>

In [None]:
import seaborn as sns
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.models import Sequential
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import MinMaxScaler as mms
from tqdm.keras import TqdmCallback
import keras

### Import Data and Display Column names

In [None]:
# Download the automotive log and give the fuel-economy column the short name "MPG".
url = 'http://apmonitor.com/pds/uploads/Main/automotive.txt'
raw = pd.read_csv(url)
data = raw.rename(columns={'Calculated instant fuel consumption (MPG)': "MPG"})
# Last expression of the cell: show the available column names.
data.columns

### Reformat Time

In [None]:
from datetime import datetime

# Format of the strings in the 'time' column: hour (12-hour field), minute,
# second and fractional second.
# NOTE(review): '%I' has no AM/PM marker here -- confirm the log never
# crosses a 12 -> 1 o'clock boundary, otherwise elapsed time will jump.
time_fmt = '%I:%M:%S.%f'

# First and last timestamps of the log; dtobj is the zero reference below.
dtobj = datetime.strptime(data['time'].iloc[0], time_fmt)
dtobj2 = datetime.strptime(data['time'].iloc[-1], time_fmt)

# Work on a copy so the raw frame `data` stays untouched.
dataf = data.copy()
dataf['time'] = data['time'].apply(lambda x: datetime.strptime(x, time_fmt))

# Elapsed time in seconds since the first sample. total_seconds() includes the
# days component of the difference, which the previous
# `.microseconds/1e6 + .seconds` arithmetic silently dropped.
dataf['timetot'] = dataf['time'].apply(lambda x: (x - dtobj).total_seconds())
dataf['timetot'].sample(5)

### Plot and Visualize the Data

In [None]:
# Target column; the goal is to predict MPG from RPM and speed.
label = 'MPG'

# Distribution of the target.
sns.violinplot(data=dataf[label])
plt.xlabel(label)
plt.show()

# One box plot per candidate predictor, each in its own figure.
for column in ('Vehicle speed (mph)', 'Engine RPM (rpm)'):
    dataf.boxplot(column)
    plt.show()

## Reformat Data
Use Vehicle Speed, Acceleration, and RPM as features. Visualize the effects of well-known aspects of driving on fuel efficiency.

In [None]:
# Column names used throughout the rest of the notebook.
rpm = 'Engine RPM (rpm)'
speed = 'Vehicle speed (mph)'
speeds = dataf[speed].dropna()
acel = 'Vehicle acceleration (g)'
gal = 'Fuel used (gallon)'

# Features followed by the target; this order is reused when the data are
# scaled and un-scaled later on.
alls = [acel, speed, rpm, label]

# Report how many values are missing per column (previously computed but
# discarded because it was not the last expression of the cell).
print(dataf[alls].isna().sum())

# Forward-fill gaps and drop repeated rows. DataFrame.ffill() replaces
# fillna(method='ffill'), which is deprecated in recent pandas releases.
datan = dataf[alls].ffill().drop_duplicates()

# Keep only rows with 0 < MPG < 45.
datan = datan[datan[label] > 0]
datan = datan[datan[label] < 45]  # Anything higher than this is most likely coasting
datan.sample(20)

### Show Heatmap

In [None]:
# Correlation matrix of every numeric column after forward-filling gaps and
# dropping repeated rows.
# - ffill() replaces the deprecated fillna(method='ffill').
# - numeric_only=True keeps non-numeric columns (e.g. the parsed 'time'
#   column) out of the correlation, which newer pandas no longer does
#   implicitly.
corr_matrix = dataf.ffill().drop_duplicates().corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=False, fmt='f')
plt.show()

### Create Pairplot

In [None]:
# Pairwise scatter plots (with per-column distributions) of the cleaned
# feature/target frame.
sns.pairplot(data=datan)
plt.show()

### Scale Data, Train model, and show accuracy

In [None]:
# Min-max scaler shared by every later cell that maps results back to
# original units.
s = mms()

# Feature columns = all selected columns except the target.
use = [column for column in alls if column != label]

# Model space: min-max scale every column, then take the square root.
# To get back to original units, square first and then inverse_transform.
dataS = pd.DataFrame(np.sqrt(np.array(s.fit_transform(datan))), columns=alls)
Xtrain, Xtest, ytrain, ytest = tts(dataS[use], dataS[label], test_size=.2, random_state=87)

model = xgb.XGBRegressor()

# Fit on the training split only. Fitting on all of dataS (as before) lets the
# model see the test rows, so the test metrics below would not be held-out.
m = model.fit(Xtrain, ytrain)

# Test features and target side by side, still in model space.
datatest = Xtest.join(ytest, how='right')
yp = m.predict(datatest[use])

# True values back in original units (undo sqrt, then undo min-max scaling).
unscaled = pd.DataFrame(s.inverse_transform(datatest.values**2), columns=alls)

# Same frame with the target replaced by the prediction, then un-scaled.
datap = datatest.copy()
datap[label] = yp
unscaledy = unscaled[label]
unscaledp = pd.DataFrame(s.inverse_transform(datap.values**2), columns=alls)[label]

# Error metrics in original MPG units.
perc = MAPE(unscaledy, unscaledp)
scoree = MAE(unscaledy, unscaledp)
scorer = r2(unscaledy, unscaledp)

plt.plot(unscaledy, unscaledp, 'bo')
# Anchor the annotation in axes coordinates so it stays inside the plot; the
# data are filtered to MPG < 45, so data coordinates (50, 50) are off-range.
plt.text(0.05, 0.95,
         f'R2={round(scorer,3)}\nMAPE={round(perc,3)}\nMAE={round(scoree,3)}',
         transform=plt.gca().transAxes, verticalalignment='top')
plt.xlabel('True')
plt.ylabel('Predicted')
plt.title('XGB Regressor')
print(f'{perc:.4g}',f'{scoree:.4g}, r2={scorer}')

# linear model (baseline), trained on the same training split and scored on
# the test split in model space
reg = LinearRegression().fit(Xtrain, ytrain)
ypr = reg.predict(datatest[use])
r2lin = r2(datatest[label], ypr)
print(f'Linear regression r2={r2lin}')

### Create a Neural Network to compare performance with XGBoost Regression

In [None]:
# Small feed-forward network: one hidden layer of 3 ReLU units sized to the
# number of training features, followed by a single linear output.
mod = Sequential([
    Dense(3, input_shape=(Xtrain.shape[1],), activation='relu'),
    Dense(1, activation='linear'),
])
mod.summary()

In [None]:
# Plain SGD on mean squared error.
mod.compile(optimizer='sgd', loss='mse')

# Train for up to 350 epochs; EarlyStopping ends training once the training
# loss has not improved for 4 consecutive epochs.
# verbose=0 silences Keras' built-in progress output so that TqdmCallback is
# the only progress display (both were printing before).
history = mod.fit(
    Xtrain, ytrain,
    epochs=350,
    verbose=0,
    validation_data=(Xtest, ytest),
    callbacks=[
        TqdmCallback(verbose=1),
        EarlyStopping(monitor='loss', patience=4),
    ],
)

### Plot Learning History

In [None]:
# Training and validation loss per epoch on a log-scaled y axis.
for curve in ('loss', 'val_loss'):
    plt.semilogy(history.history[curve], label=curve)
plt.legend()
plt.show()

# Persist the trained network so the next cell can reload it.
mod.save('Model_3var.h5')

In [None]:
# Reload the saved network and predict the test rows (model space).
mod = keras.models.load_model('Model_3var.h5')
# Flatten the (n, 1) network output to a 1-D vector before storing it as a column.
yp = mod.predict(datatest[use]).ravel()

# datatest holds square-rooted min-max values, so square before
# inverse_transform to get original units. The square was missing here, which
# put the metrics below on a different scale than the XGBoost metrics.
unscaled = pd.DataFrame(s.inverse_transform(datatest.values**2), columns=alls)
datap = datatest.copy()
datap[label] = yp
unscaledy = unscaled[label]
unscaledp = pd.DataFrame(s.inverse_transform(datap.values**2), columns=alls)[label]

# Error metrics in original MPG units.
perc = MAPE(unscaledy, unscaledp)
scoree = MAE(unscaledy, unscaledp)
scorer = r2(unscaledy, unscaledp)

plt.plot(unscaledy, unscaledp, 'bo')
plt.xlabel('True')
plt.ylabel('Predicted')
plt.title('Neural Network')
print(f'{perc:.4g}',f'{scoree:.4g}, r2={scorer}')

### View Predictions

In [None]:
# XGB Regression
# Median of each feature in dataS, repeated once per row of datan so it can
# serve as a constant column later.
n_rows = datan.shape[0]
rpm_median = dataS[rpm].median()
speed_median = dataS[speed].median()
acel_median = dataS[acel].median()
rpmmed = np.full(n_rows, rpm_median)
speedmed = np.full(n_rows, speed_median)
acelmed = np.full(n_rows, acel_median)

print('RPM,Speed,Acceleration median values')
print(rpm_median, speed_median, acel_median)

# Kernel density estimate of each feature, side by side.
plt.figure(figsize=(12,5))
for position, column in enumerate((rpm, speed, acel), start=1):
    plt.subplot(1, 3, position)
    sns.kdeplot(dataS[column])
plt.show()

## Display Extrapolations with one variable

In [None]:
# Cross sections of the XGBoost model `m`: sweep one feature while the other
# two are held at their medians.
n = datan.shape[0]

# Sweep ranges are given in model space (the same units as dataS).
rpmvar = np.linspace(.5,1,n)
speedvar = np.linspace(0,1,n)
acelvar = np.linspace(.3,1,n)

# Column order must match `use`: acceleration, speed, RPM.
rpmdf = pd.DataFrame(np.array([acelmed,speedmed,rpmvar]).T,columns=use)
speeddf = pd.DataFrame(np.array([acelmed,speedvar,rpmmed]).T,columns=use)
aceldf = pd.DataFrame(np.array([acelvar,speedmed,rpmmed]).T,columns=use)

rpmpred = m.predict(rpmdf)
speedpred = m.predict(speeddf)
acelpred = m.predict(aceldf)


def _to_original_units(features, prediction):
    """Return `features` plus `prediction` converted back to original units.

    Model space is the square root of the min-max scaled data, so the values
    are squared before the scaler is inverted. The square was previously
    omitted, which distorted both axes of the plots below.
    """
    scaled = features.copy()
    scaled[label] = prediction
    return pd.DataFrame(s.inverse_transform(scaled.values**2), columns=use+[label])


trrpm = _to_original_units(rpmdf, rpmpred)
trspeed = _to_original_units(speeddf, speedpred)
tracel = _to_original_units(aceldf, acelpred)

# One panel per swept feature: predicted MPG against that feature.
plt.figure(figsize=(12,5))
plt.subplot(1,3,1)
plt.plot(trrpm[rpm],trrpm[label])
plt.ylabel('MPG'); plt.xlabel('RPM')
plt.subplot(1,3,2)
plt.plot(trspeed[speed],trspeed[label])
plt.ylabel('MPG'); plt.xlabel('Speed (mi/hr)')
plt.subplot(1,3,3)
plt.plot(tracel[acel],tracel[label])
plt.ylabel('MPG'); plt.xlabel('Acceleration')
plt.show()

### Discussion

The three cross sections show how MPG changes as a single variable is varied while the other variables are held constant at their median values. This method inherently misses important behavior because the other variables are restricted to their medians. However, the trends are visible and most likely carry throughout portions of the 4-D model, even if the actual magnitudes vary. Some observations:

- The highest MPG (fuel efficiency) is at lower RPM for the engine. The data where the car is coasting may influence this higher trend at lower RPM.
- The highest MPG is at the highest speed with highway driving. City driving fuel efficiency is lower due to braking action and lower average velocity.
- Fuel efficiency decreases with higher acceleration.

### 4D Figure with 3D + Color

In [None]:
# NOTE(review): Axes3D and cm are not referenced below; they are kept in case
# the installed Matplotlib needs the Axes3D import to enable projection='3d'.
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()
ax = fig.add_subplot(111,projection='3d')

# Replace the measured target with the XGBoost prediction for every row.
datass = dataS.copy()
datass[label] = m.predict(dataS[use])

# dataS is the square root of the min-max scaled data, so square before
# inverting the scaler (the square was missing, which distorted all axes).
datainv = pd.DataFrame(s.inverse_transform(datass.values**2),columns = use+[label])

# Three features on the spatial axes, predicted MPG as colour. Pass the
# colormap by name: plt.hot() returns None and changes the global default.
img = ax.scatter(datainv[rpm],datainv[speed],datainv[acel],c=datainv[label],cmap='hot')
fig.colorbar(img, label='Predicted MPG')
ax.set_xlabel('RPM')
ax.set_ylabel('Speed')
ax.set_zlabel('Acceleration')
ax.view_init(azim=34,elev = -51)
plt.show()