## Random Forest Method

### Load dataset

In [1]:
dataset_directory="/home/jychen630/socity/dataset/dataset_raw.csv"

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv(dataset_directory)
df = df.head(len(df)//4)
label_name = ['walk', 'bike', 'bus', 'car', 'subway','train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']
label_to_encoding = {s : i + 1 for i, s in enumerate(label_name)}
encoding_to_label = {i+1 : s for i, s in enumerate(label_name)}
df['label_encoding'] = df['label_detail'].map(label_to_encoding)
df = df[['latitude', 'longitude', 'altitude',
       'date_time', 'label_encoding', 'id_user', 'id_route', 
       'label_detail', 'label' ,'acc_x', 'acc_y', 'acc_z', 
       'gyro_x', 'gyro_y', 'gyro_z', 'mag_x','mag_y', 'mag_z', 'b_pres']]


### Feature Extraction

In [3]:
# Calculate distance between consecutive coordinates
def calculate_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers

    # Convert coordinates to radians using NumPy vectorize
    lat1_rad = np.deg2rad(lat1)
    lon1_rad = np.deg2rad(lon1)
    lat2_rad = np.deg2rad(lat2)
    lon2_rad = np.deg2rad(lon2)

    # Haversine formula
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c

    return distance

# Calculate bearing (direction) between consecutive coordinates
def calculate_bearing(lat1, lon1, lat2, lon2):
    delta_lon = np.radians(lon2 - lon1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)

    y = np.sin(delta_lon) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(delta_lon)

    bearing = np.degrees(np.arctan2(y, x))
    bearing = (bearing + 360) % 360

    return bearing


df["date_time"] = pd.to_datetime(df["date_time"], format="%Y-%m-%d %H:%M:%S")
# Calculate time differences between consecutive rows
df['time_diff'] = df['date_time'].diff().dt.total_seconds()
df['distance'] = calculate_distance(df['latitude'].shift().values, df['longitude'].shift().values, df['latitude'].values, df['longitude'].values)

# Calculate speed as distance divided by time difference
df['speed'] = df['distance'] / df['time_diff']

# Calculate acceleration as speed difference divided by time difference
df['acceleration'] = df['speed'].diff() / df['time_diff']
df['bearing'] = calculate_bearing(df['latitude'].shift().values, df['longitude'].shift().values, df['latitude'].values, df['longitude'].values)

df['month'] = df['date_time'].dt.month
df['day'] = df['date_time'].dt.day
df['hour'] = df['date_time'].dt.hour
df.head()


Unnamed: 0,latitude,longitude,altitude,date_time,label_encoding,id_user,id_route,label_detail,label,acc_x,...,mag_z,b_pres,time_diff,distance,speed,acceleration,bearing,month,day,hour
0,41.741415,86.186028,-777.0,2008-03-31 16:00:08,11,10,20080331160008.plt,taxi,car/taxi,0,...,0,0,,,,,,3,31,16
1,41.737063,86.17947,-777.0,2008-03-31 16:01:07,11,10,20080331160008.plt,taxi,car/taxi,0,...,0,0,59.0,0.728186,0.012342,,228.353905,3,31,16
2,41.734105,86.172823,-777.0,2008-03-31 16:02:07,11,10,20080331160008.plt,taxi,car/taxi,0,...,0,0,60.0,0.642173,0.010703,-2.7e-05,239.192386,3,31,16
3,41.73911,86.166563,-777.0,2008-03-31 16:03:06,11,10,20080331160008.plt,taxi,car/taxi,0,...,0,0,59.0,0.761267,0.012903,3.7e-05,316.977262,3,31,16
4,41.744368,86.159987,-777.0,2008-03-31 16:04:05,11,10,20080331160008.plt,taxi,car/taxi,0,...,0,0,59.0,0.799694,0.013554,1.1e-05,316.981561,3,31,16


In [4]:
target_string_counts = df['label_detail'].value_counts()
target_encoding_counts = df['label_encoding'].value_counts()

print(target_string_counts)
print(target_encoding_counts)

label_detail
bus       249453
walk      179083
bike      154186
train     129004
taxi       81598
subway     40062
car        11092
run            8
Name: count, dtype: int64
label_encoding
3     249453
1     179083
2     154186
6     129004
11     81598
5      40062
4      11092
9          8
Name: count, dtype: int64


In [5]:
# Dropping rows with nan is the last step before moving on to training
print(f"Length before drop na: {len(df)}")
df = df.replace([np.inf, -np.inf], np.nan).dropna()
print(f"Length after drop na: {len(df)}")

Length before drop na: 844486
Length after drop na: 726765


In [6]:
df_without_plagarism = df.drop(['label', 'label_detail', 'date_time', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z',
       'mag_x', 'mag_y', 'mag_z', 'b_pres', 'id_user', 'id_route'], axis=1)

df_without_plagarism.columns

Index(['latitude', 'longitude', 'altitude', 'label_encoding', 'time_diff',
       'distance', 'speed', 'acceleration', 'bearing', 'month', 'day', 'hour'],
      dtype='object')

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Define features (X) and target variable (y)
X = df_without_plagarism.drop('label_encoding', axis=1) #.drop(['label', 'time'], axis=1)  # Exclude 'label' and 'time' columns
y = df_without_plagarism['label_encoding']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(verbose=True)
rf_classifier.fit(X_train, y_train)
print('done fitting')
# Predict on the test set
y_pred = rf_classifier.predict(X_test)
print('done inference')
# Evaluate the model
accuracy = rf_classifier.score(X_test, y_test)
print("Accuracy:", accuracy)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.8min


done fitting


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.8s


done inference


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.8s


Accuracy: 0.9685868196734846


In [8]:
result = pd.DataFrame({'expected_encoding': y_test, 'predicted_encoding': y_pred})
result['expected_mode'] = result.expected_encoding.map(encoding_to_label)
result['predicted_mode'] = result.predicted_encoding.map(encoding_to_label)
result = pd.concat([result, X_test], axis=1)
result

Unnamed: 0,expected_encoding,predicted_encoding,expected_mode,predicted_mode,latitude,longitude,altitude,time_diff,distance,speed,acceleration,bearing,month,day,hour
379023,2,2,bike,bike,39.979953,116.322772,0.0,1.0,0.002755,0.002755,-0.000965,258.356806,11,25,1
165106,6,6,train,train,35.108513,93.043011,15443.0,1.0,0.028497,0.028497,0.000548,190.483343,9,30,23
289870,5,5,subway,subway,40.356063,116.003135,0.0,1.0,0.000000,0.000000,-0.000397,0.000000,9,11,4
560176,3,3,bus,bus,39.988482,116.394123,183.7,3.0,0.005819,0.001940,0.000042,241.461430,8,15,21
538395,3,3,bus,bus,40.034245,116.467018,131.2,2.0,0.018437,0.009218,0.000040,224.431682,8,5,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43536,3,3,bus,bus,32.197690,119.381837,48.0,1.0,0.015959,0.015959,-0.000206,47.478380,5,18,7
243731,1,1,walk,walk,40.260821,116.190978,351.0,2.0,0.002885,0.001443,-0.000192,89.999989,11,7,10
516270,3,3,bus,bus,39.966340,116.329938,196.9,2.0,0.021451,0.010726,-0.000177,269.495176,7,29,14
18434,6,6,train,train,42.652773,86.269078,-777.0,1.0,0.018859,0.018859,-0.000272,27.086188,4,3,21


In [9]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.96      0.96      0.96     33078
           2       0.99      0.99      0.99     17011
           3       0.99      0.99      0.99     49865
           4       0.66      0.64      0.65      2215
           5       0.65      0.66      0.66      4938
           6       1.00      1.00      1.00     25937
           9       1.00      0.67      0.80         3
          11       1.00      0.99      1.00     12306

    accuracy                           0.97    145353
   macro avg       0.91      0.86      0.88    145353
weighted avg       0.97      0.97      0.97    145353

