In [0]:
# Imports the necessary modules
import geohash2
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 1. Input the Training Data

In [0]:
# Reads the training file, re-prompting until a CSV has been parsed successfully.
# The prompt sits outside the try block and only I/O / parsing errors are caught,
# so interrupting the kernel (KeyboardInterrupt) or a closed stdin (EOFError)
# ends the loop instead of being swallowed and re-prompting forever.
while True:
    fname = input("Please enter the name of the TRAINING data file: ")
    try:
        traffic_data = pd.read_csv(fname)
    except FileNotFoundError:
        print("File does not exist! Please try again!")
    except (OSError, ValueError) as err:
        # OSError: permission problems, directories, other I/O failures.
        # ValueError: pandas parser / empty-data errors and decoding errors.
        print("Could not read the file ({}). Please try again!".format(err))
    else:
        print("Training data successfully loaded.")
        break


Please enter the name of the TRAINING data file: /content/gdrive/My Drive/Grab/Traffic/training.csv
Training data successfully loaded.


# 2. Prepare Features in Training Set

In [0]:
def features(dataset):
    '''
    Takes a data set and pre-processes it to generate the features
    :Inputs: 
    dataset: a dataframe with each row describing a data point. The columns
             read here are `timestamp` ("H:M" strings), `day` (numeric),
             `geohash6` (geohash strings) and `demand`.
    :Returns:
    feature_set: a dataframe with the columns 'area', 'day in seven' and
                 'time period', indexed like `dataset` so that it stays
                 aligned with the returned targets
    dataset.demand: a dataseries with the targets of the model
    :Notes:
    The 'area' code is computed relative to the minimum latitude/longitude and
    the longitude span found in `dataset` itself, so two data sets only share
    the same area codes when they cover the same geographic extent.
    '''
    
    # Convert time into consecutive time periods, with 0 denoting midnight, and 95 denoting 23:45
    time_period = np.array(
        [4 * int(h) + int(m) / 15
         for h, m in (stamp.split(":") for stamp in dataset.timestamp)],
        dtype=float)
    
    # Convert the day given in the data into a day of the week, from 0 to 6
    # This based on the assumption that demand will be similar for the same day of the week
    days_in_seven = (dataset.day % 7).values
    
    # Split the geohash into latitude and longitude, then combine them into "squares" of area on the map
    # Instead of treating latitude and longitude as independent of each other, assigning a value for each 
    # latitude-longitude pair and treating them as a single "area" feature is more meaningful.
    decoded = [geohash2.decode(code) for code in dataset.geohash6]
    latitudes = np.array([float(lat) for lat, _ in decoded], dtype=float)
    longitudes = np.array([float(lon) for _, lon in decoded], dtype=float)
    
    if len(latitudes):
        # Offsets from the south-west corner, in units of 0.01 degree
        lat_offset = 100 * (latitudes - latitudes.min())
        long_offset = 100 * (longitudes - longitudes.min())
        # Row-major cell id: the stride between two latitude rows has to be larger
        # than the largest longitude offset, otherwise different
        # latitude-longitude pairs can be mapped onto the same area value.
        row_stride = long_offset.max() + 1
        areas = row_stride * lat_offset + long_offset
    else:
        # Nothing to encode; avoids calling min()/max() on an empty array
        areas = np.array([], dtype=float)
    
    # Plain arrays + explicit index: values are assigned positionally and the
    # frame carries the same index as dataset.demand.
    feature_set = pd.DataFrame({'area': areas,
                                'day in seven': days_in_seven,
                                'time period': time_period},
                               index=dataset.index)
    
    return feature_set, dataset.demand

In [0]:
# Generates the features (X) and targets (y) based on the training data.
# X holds the 'area', 'day in seven' and 'time period' columns produced by
# features(); y is the `demand` column of the training frame.
X, y = features(traffic_data)

# 3. Fit the Training Data

In [0]:
def fitting(X, y, validation=True,test_size=0.33, random_state=42):
    '''
    Fits a Random Forest regressor, optionally reporting a hold-out score
    :Inputs:
    X, y : features and targets to learn from
    validation : when True, part of the data is held out, the model is fitted on the
                 remainder only, and the root-mean-squared error on the held-out part is printed
    test_size : fraction of the data held out when validation is True
    random_state : seed shared by the forest and by the training-validation split
    :Returns:
    model: the fitted Random Forest Regressor (fitted on the training part of the
           split when validation is True, on all of X and y otherwise)
    '''
    
    forest = RandomForestRegressor(
        n_estimators=100,
        min_samples_leaf=0.001,
        bootstrap=True,
        oob_score=True,
        random_state=random_state,
    )
    
    # Without validation there is nothing to hold out: use every row
    if not validation:
        forest.fit(X, y)
        return forest
    
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    forest.fit(X_fit, y_fit)
    
    # Root-mean-squared error on the held-out rows
    holdout_pred = forest.predict(X_holdout)
    rmse = np.sqrt(mean_squared_error(y_holdout, holdout_pred))
    print('The score on the validation set is {}.'.format(rmse))
    return forest



In [0]:
# Trains the model with the training data (X) and targets (y).
# fitting() is called with its defaults (validation=True, test_size=0.33), so it
# holds out a third of the rows, prints the RMSE on them, and the model returned
# here has been fitted on the remaining rows only.
traffic_model = fitting(X, y)

The score on the validation set is 0.13530557000274804.


# 4. Input the Test Data

In [0]:
# Reads the test file, re-prompting until a CSV has been parsed successfully.
# NOTE: this rebinds `traffic_data`, so the training frame is no longer
# reachable under that name after this cell has run.
# The prompt sits outside the try block and only I/O / parsing errors are caught,
# so interrupting the kernel (KeyboardInterrupt) or a closed stdin (EOFError)
# ends the loop instead of being swallowed and re-prompting forever.
while True:
    fname = input("Please enter the name of the TEST data file: ")
    try:
        traffic_data = pd.read_csv(fname)
    except FileNotFoundError:
        print("File does not exist! Please try again!")
    except (OSError, ValueError) as err:
        # OSError: permission problems, directories, other I/O failures.
        # ValueError: pandas parser / empty-data errors and decoding errors.
        print("Could not read the file ({}). Please try again!".format(err))
    else:
        print("Test data successfully loaded.")
        break

Please enter the name of the TEST data file: /content/gdrive/My Drive/Grab/Traffic/training.csv
Test data successfully loaded.


In [0]:
# Generates features (X_test) and targets (y_test) for test data.
# features() derives the 'area' values from the coordinates present in the frame
# it is given, so these codes match the training ones only when the test file
# covers the same geographic extent as the training file.
X_test, y_test = features(traffic_data)

# Uses the trained model to predict the output based on the test features, 
# then generates a score based on the root-mean-squared-error of the difference 
# between the actual targets and the predicted targets (lower is better)
y_test_pred = traffic_model.predict(X_test)
score = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("The score on the test set is {}".format(score))


The score on the test set is 0.13543015396325933
