### Libraries

In [2]:
import pandas as pd
import json
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

### Cleaned Dataframe

In [None]:
# Google Drive share link of the cleaned dataset; the file id is the
# second-to-last path component of that link.
url = "https://drive.google.com/file/d/16aOg3J57O1Gmx_7LSIMt-hPmMxYXtDH-/view?usp=sharing"
file_id = url.split('/')[-2]
download_url = 'https://drive.usercontent.google.com/download?id={}&export=download&authuser=0&confirm=t'.format(file_id)

# NOTE(review): pd.eval evaluates every 'shape' cell of the downloaded file as
# an expression -- confirm the file is trusted before running this cell.
gdrive_df = pd.read_csv(download_url, index_col=[0], converters={'shape': pd.eval})
gdrive_df.head()

### Read Merged Dataset

In [None]:
# Load the merged traffic table from the working directory and preview it.
new_df = pd.read_csv(filepath_or_buffer="merged_jakarta_traffic.csv")
new_df.head(n=5)

### Exploratory Data Analysis

In [None]:
# Work on an independent (deep) copy so columns added for the EDA do not
# end up in gdrive_df.
df_eda = gdrive_df.copy(deep=True)

def change_day_to_num(x):
    """Translate an upper-case weekday name into its position in the week.

    'MONDAY' maps to 0 and 'SUNDAY' to 6. Any value that is not one of the
    seven upper-case weekday names raises KeyError.
    """
    week = ('MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY')
    code_by_day = {name: code for code, name in enumerate(week)}
    return code_by_day[x]

def change_num_to_day(x):
    """Translate a weekday position (0-6) into its upper-case name.

    0 maps to 'MONDAY' and 6 to 'SUNDAY'. Any key outside 0..6 raises
    KeyError.
    """
    week = ('MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY')
    day_by_code = dict(enumerate(week))
    return day_by_code[x]

# Numeric weekday code (MONDAY=0 ... SUNDAY=6) derived from the 'day' column.
df_eda['dayCode'] = df_eda['day'].apply(change_day_to_num)

df_eda.head()

### Data Preparation

In [14]:
# Remove the 'Unnamed: 0' column that came along when the CSV was read.
# Rebinding instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: the original in-place drop raised KeyError
# on a second execution because the column was already gone.
new_df = new_df.drop(columns='Unnamed: 0', errors='ignore')

In [15]:
# Preview the first rows of new_df.
new_df.head()

Unnamed: 0,day,hour_interval,segmentId,speedLimit,frc,streetName,distance,shape,harmonicAverageSpeed,medianSpeed,averageSpeed,standardDeviationSpeed,travelTimeStandardDeviation,sampleSize,averageTravelTime,medianTravelTime,travelTimeRatio
0,MONDAY,00:00-02:00,-13600003425517,18,2,Jalan Trans Yogie,16.33,"[{'latitude': -6.3757, 'longitude': 106.90701}...",31.52,24.8,57.6,59.87,1.23,3,1.86,2.37,1.0
1,MONDAY,00:00-02:00,-13600001618936,50,5,Jalan Karya Bakti,34.33,"[{'latitude': -6.36841, 'longitude': 106.89396...",28.1,28.1,28.1,0.0,0.0,1,4.4,4.4,1.0
2,MONDAY,00:00-02:00,-13600002440919,50,5,Jalan Karya Bakti,10.44,"[{'latitude': -6.36829, 'longitude': 106.89368...",27.5,27.5,27.5,0.0,0.0,1,1.37,1.37,1.0
3,MONDAY,00:00-02:00,-13600001175418,50,5,Jalan Karya Bakti,284.25,"[{'latitude': -6.36825, 'longitude': 106.89359...",27.6,27.6,27.6,0.0,0.0,1,37.08,37.08,1.0
4,MONDAY,00:00-02:00,-13600001957379,50,5,Jalan Karya Bakti,93.0,"[{'latitude': -6.36731, 'longitude': 106.8912}...",33.53,31.9,35.23,10.02,2.57,3,9.99,10.5,1.0


#### Extract latitudes and longitudes from 'shape'

In [17]:
def extract_lat_lon(shape):
    """Return the coordinates of the first two points of a segment shape.

    Parameters
    ----------
    shape : str or sequence of dict
        Either the text stored in the CSV, e.g.
        "[{'latitude': -6.37, 'longitude': 106.9}, ...]", or the same data
        already parsed into a list of point dicts (as happens when the
        column is converted while reading the file).

    Returns
    -------
    tuple
        (lat1, lon1, lat2, lon2) taken from points[0] and points[1].

    Raises
    ------
    json.JSONDecodeError
        If a text shape is not valid JSON after swapping the quotes.
    IndexError
        If the shape holds fewer than two points.
    KeyError
        If a point lacks 'latitude' or 'longitude'.
    """
    if isinstance(shape, str):
        # The CSV stores Python-style single-quoted dicts; swap the quotes so
        # the text parses as JSON.
        points = json.loads(shape.replace("'", "\""))
    else:
        # Already a parsed sequence of point dicts -- use it directly.
        points = shape

    first, second = points[0], points[1]
    return first['latitude'], first['longitude'], second['latitude'], second['longitude']

# Parse every 'shape' value once and build the four coordinate columns with a
# single DataFrame construction. The previous version created a pd.Series for
# every row, which is extremely slow on a frame with millions of rows.
coord_columns = ['lat1', 'lon1', 'lat2', 'lon2']
new_df[coord_columns] = pd.DataFrame(
    new_df['shape'].map(extract_lat_lon).tolist(),
    index=new_df.index,  # keep row alignment with new_df
    columns=coord_columns,
)

In [18]:
# Persist the frame with the extracted coordinates; the index is written as
# the first (unnamed) column of the file.
new_df.to_csv("merged_extracted_coords_jakarta_traffic.csv", index=True)

In [20]:
# Reload the exported file and discard the saved index column together with
# the raw 'shape' text, which is no longer needed once coordinates exist.
a = (
    pd.read_csv("merged_extracted_coords_jakarta_traffic.csv")
    .drop(columns=['Unnamed: 0', 'shape'])
)

In [21]:
# Write the slimmed-down table (no 'shape' column) to the working directory,
# index included.
a.to_csv("extracted_coords_jakarta_traffic.csv", index=True)

### Data Preprocessing

In [5]:
# Load the coordinate-extracted table, using the first CSV column as index.
# NOTE(review): this path differs from the working-directory location the
# export step wrote "extracted_coords_jakarta_traffic.csv" to -- confirm the
# file is expected to live in this sibling folder.
df = pd.read_csv(
    filepath_or_buffer="../Data-Mining-Jakarta-Traffic (local)/extracted_coords_jakarta_traffic.csv",
    index_col=[0],
)
df.head(n=5)

Unnamed: 0,day,hour_interval,segmentId,speedLimit,frc,streetName,distance,harmonicAverageSpeed,medianSpeed,averageSpeed,standardDeviationSpeed,travelTimeStandardDeviation,sampleSize,averageTravelTime,medianTravelTime,travelTimeRatio,lat1,lon1,lat2,lon2
0,MONDAY,00:00-02:00,-13600003425517,18,2,Jalan Trans Yogie,16.33,31.52,24.8,57.6,59.87,1.23,3,1.86,2.37,1.0,-6.3757,106.90701,-6.37555,106.90698
1,MONDAY,00:00-02:00,-13600001618936,50,5,Jalan Karya Bakti,34.33,28.1,28.1,28.1,0.0,0.0,1,4.4,4.4,1.0,-6.36841,106.89396,-6.36829,106.89368
2,MONDAY,00:00-02:00,-13600002440919,50,5,Jalan Karya Bakti,10.44,27.5,27.5,27.5,0.0,0.0,1,1.37,1.37,1.0,-6.36829,106.89368,-6.36825,106.89359
3,MONDAY,00:00-02:00,-13600001175418,50,5,Jalan Karya Bakti,284.25,27.6,27.6,27.6,0.0,0.0,1,37.08,37.08,1.0,-6.36825,106.89359,-6.36731,106.8912
4,MONDAY,00:00-02:00,-13600001957379,50,5,Jalan Karya Bakti,93.0,33.53,31.9,35.23,10.02,2.57,3,9.99,10.5,1.0,-6.36731,106.8912,-6.367,106.89042


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(11396536, 20)

#### Calculating Midpoint

In [7]:
# Midpoint of each segment: arithmetic mean of its two extracted points.
df['mid_lat'] = df['lat1'].add(df['lat2']).div(2)
df['mid_lon'] = df['lon1'].add(df['lon2']).div(2)

#### Geographical Binning Using the Calculated Midpoints

In [8]:
# Edge length of one grid cell, in the same units as the coordinates.
grid_size = 0.1

# Snap every midpoint down to the lower edge of the grid cell it falls in.
df['lat_bin'] = np.floor(df['mid_lat'].div(grid_size)).mul(grid_size)
df['lon_bin'] = np.floor(df['mid_lon'].div(grid_size)).mul(grid_size)

#### Geographical Clustering

In [9]:
# Group the binned coordinates into five clusters; the fixed random_state
# makes the assignment reproducible between runs.
kmeans = KMeans(random_state=42, n_clusters=5)
df['cluster'] = kmeans.fit(df[['lat_bin', 'lon_bin']]).labels_

In [10]:
# Number of rows assigned to each cluster label.
df['cluster'].value_counts()

0    4568110
2    3240625
1    1593223
3    1292440
4     702138
Name: cluster, dtype: int64

In [11]:
# One-hot encode the two categorical time columns into a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one only on installations that do not know it yet.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoded_time_features = encoder.fit_transform(df[['day', 'hour_interval']])

In [12]:
# Wrap the encoded array in a frame that carries df's own index. pd.concat
# with axis=1 aligns rows by index label, so a fresh 0..n-1 index would
# silently misalign rows (and introduce NaNs) whenever df's index is not
# exactly 0..n-1.
encoded_time_df = pd.DataFrame(
    encoded_time_features,
    columns=encoder.get_feature_names_out(['day', 'hour_interval']),
    index=df.index,
)

# Replace the raw categorical columns with their one-hot counterparts.
df_encoded = pd.concat([df.drop(['day', 'hour_interval'], axis=1), encoded_time_df], axis=1)
df_encoded.head()

Unnamed: 0,segmentId,speedLimit,frc,streetName,distance,harmonicAverageSpeed,medianSpeed,averageSpeed,standardDeviationSpeed,travelTimeStandardDeviation,...,hour_interval_04:00-06:00,hour_interval_06:00-08:00,hour_interval_08:00-10:00,hour_interval_10:00-12:00,hour_interval_12:00-14:00,hour_interval_14:00-16:00,hour_interval_16:00-18:00,hour_interval_18:00-20:00,hour_interval_20:00-22:00,hour_interval_22:00-23:59
0,-13600003425517,18,2,Jalan Trans Yogie,16.33,31.52,24.8,57.6,59.87,1.23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-13600001618936,50,5,Jalan Karya Bakti,34.33,28.1,28.1,28.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-13600002440919,50,5,Jalan Karya Bakti,10.44,27.5,27.5,27.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-13600001175418,50,5,Jalan Karya Bakti,284.25,27.6,27.6,27.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-13600001957379,50,5,Jalan Karya Bakti,93.0,33.53,31.9,35.23,10.02,2.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Model

#### Linear Regression

In [13]:
# Columns kept out of the model inputs: identifiers, the sample count, the
# target itself, and the coordinate columns.
# NOTE(review): 'medianTravelTime' stays among the features while the target
# is 'averageTravelTime' -- confirm this is intended and not target leakage.
excluded_columns = [
    'streetName', 'segmentId', 'sampleSize', 'averageTravelTime',
    'lat1', 'lon1', 'lat2', 'lon2',
    'mid_lat', 'mid_lon', 'lat_bin', 'lon_bin',
]
features = df_encoded.drop(columns=excluded_columns)
target = df_encoded['averageTravelTime']

In [14]:
# Ordinary least-squares regressor with scikit-learn's default settings.
lr_model = LinearRegression()

In [15]:
# Score the linear model with 5-fold cross-validation, using R^2 as metric.
lr_scores = cross_val_score(estimator=lr_model, X=features, y=target, scoring='r2', cv=5)

# Report the per-fold scores followed by their mean.
print(f'Cross-Validation R^2 Scores: {lr_scores}')
print(f'Average R^2 Score: {lr_scores.mean()}')

Cross-Validation R^2 Scores: [0.97215824 0.96784629 0.97043748 0.974274   0.96765392]
Average R^2 Score: 0.9704739886921173


In [16]:
# Fit on the full data set and tabulate one coefficient per input column.
lr_model.fit(features, target)
coefficients = lr_model.coef_

# NOTE(review): these are raw, signed coefficients on unscaled inputs and the
# table is ordered by signed value, not magnitude -- confirm that is the
# intended notion of "importance".
feature_importances = pd.DataFrame(
    {'importance': coefficients},
    index=features.columns,
).sort_values('importance', ascending=False)

# Show the table, largest coefficient first.
print(feature_importances)

                               importance
hour_interval_02:00-04:00    4.188262e+06
hour_interval_16:00-18:00    4.188261e+06
hour_interval_00:00-02:00    4.188261e+06
hour_interval_04:00-06:00    4.188261e+06
hour_interval_06:00-08:00    4.188261e+06
hour_interval_18:00-20:00    4.188261e+06
hour_interval_14:00-16:00    4.188261e+06
hour_interval_10:00-12:00    4.188261e+06
hour_interval_08:00-10:00    4.188261e+06
hour_interval_12:00-14:00    4.188261e+06
hour_interval_22:00-23:59    4.188261e+06
hour_interval_20:00-22:00    4.188261e+06
day_WEDNESDAY                2.626174e+06
day_FRIDAY                   2.626174e+06
day_TUESDAY                  2.626174e+06
day_THURSDAY                 2.626174e+06
day_MONDAY                   2.626174e+06
day_SATURDAY                 2.626174e+06
day_SUNDAY                   2.626174e+06
medianTravelTime             8.207117e-01
medianSpeed                  4.125483e-01
travelTimeStandardDeviation  3.003224e-01
standardDeviationSpeed       7.097

#### SVM

In [17]:
# RBF-kernel support vector regressor, scored with the same 5-fold R^2
# cross-validation as the other models.
# NOTE(review): the frame has roughly 11.4 million rows; kernel SVR training
# cost grows quickly with sample count -- confirm this cell finishes on the
# full data or evaluate it on a subsample.
svm_model = SVR(C=1.0, epsilon=0.1, kernel='rbf')

svm_scores = cross_val_score(estimator=svm_model, X=features, y=target, scoring='r2', cv=5)

print(f'Cross-Validation R^2 Scores: {svm_scores}')
print(f'Average R^2 Score: {svm_scores.mean()}')

#### MLP

In [None]:
# Two hidden layers of 100 units each, scored with 5-fold R^2
# cross-validation. This cell needs `features` and `target` from the Linear
# Regression section to exist in the kernel; run that section first.
mlp_model = MLPRegressor(random_state=0, max_iter=1000, hidden_layer_sizes=(100, 100))
mlp_scores = cross_val_score(estimator=mlp_model, X=features, y=target, scoring='r2', cv=5)

print(f'Cross-Validation R^2 Scores: {mlp_scores}')
print(f'Average R^2 Score: {mlp_scores.mean()}')

NameError: name 'features' is not defined