In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [38]:
# Step 1: Load the semicolon-separated CSV file into a DataFrame
data = pd.read_csv('homework_exampledata.csv', sep=';')

In [39]:
# Discard every row that contains at least one missing (NaN) value
data = data.dropna(axis=0, how='any')

In [40]:
# Dimensions of the cleaned data as (rows, columns)
len(data.index), len(data.columns)

(49977, 6)

In [41]:
# Step 2: Pre-process the data
# Re-base the time column so that its smallest value becomes zero (optional,
# but a relative time axis sometimes helps the model)
time_origin = data['time'].min()
data['time'] = data['time'].sub(time_origin)

In [42]:
# Step 3: Separate the inputs from the target and hold out 20% for testing
feature_columns = [
    'time',
    'brake-value',
    'yaw-value',
    'longitudinal-acceleration',
    'lateral-acceleration',
]
target_column = 'velocity-value'

features = data[feature_columns]
labels = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [43]:
# Dimensions of the training inputs and the training targets
tuple(split.shape for split in (X_train, y_train))

((39981, 5), (39981,))

In [44]:
# Dimensions of the held-out test inputs and the test targets
tuple(split.shape for split in (X_test, y_test))

((9996, 5), (9996,))

In [45]:
# Step 4: Fit a linear-regression baseline on the training split
model = LinearRegression().fit(X_train, y_train)
model  # echo the fitted estimator as the cell output

LinearRegression()

In [46]:
# Step 5: Score the fitted model on the held-out test split
predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 322.73665978714394


# Do feature scaling and apply random forest

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [48]:
# Build a two-stage pipeline: standardise the features, then feed them to a
# 100-tree random forest regressor (seeded so results are reproducible)
scaler = StandardScaler()
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = make_pipeline(scaler, forest)

In [49]:
# Train the scaling + random-forest pipeline on the training split
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

In [50]:
# Score the random-forest pipeline on the held-out test split
predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error with Random Forest: {mse}')

Mean Squared Error with Random Forest: 6.122983821758704


# Grid Search to improve the random forest regressor

In [51]:
from sklearn.model_selection import GridSearchCV

In [52]:
# Hyper-parameter grid for the random-forest step of the pipeline.
# Keys use the `<step name>__<parameter>` form so that GridSearchCV routes
# each value to the 'randomforestregressor' step.
param_grid = {
    'randomforestregressor__n_estimators': [50, 100, 200],
    # None = consider every feature at each split, which is what the removed
    # 'auto' option meant for regressors ('auto' is rejected by current
    # scikit-learn releases).
    'randomforestregressor__max_features': [None, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [4, 5, 6, 7, 8],
    # 'mse' / 'mae' were renamed to 'squared_error' / 'absolute_error' in
    # scikit-learn 1.0; the old spellings have since been removed.
    'randomforestregressor__criterion': ['squared_error', 'absolute_error'],
}

In [53]:
# Show the estimator currently bound to `model` before tuning it
display(model)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

In [None]:
# Search the whole grid with 5-fold cross-validation, then report the
# winning parameter combination
CV_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # -1 = use all processors
).fit(X_train, y_train)
print(CV_model.best_params_)

In [None]:
# Score the best estimator found by the grid search on the held-out test split
predictions = CV_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error with Tuned Random Forest: {mse}')