In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

In [2]:
dataset = pd.read_csv("norms_clustering_3_clusters_segments_1000M.csv")

In [3]:
dataset.columns

Index(['index', 'segment_id', 'road_section_id', 'segment_starting_time',
       'segment_ending_time', 'trip_id', 'deviceid', 'date', 'start_terminal',
       'end_terminal', 'direction', 'day_of_week', 'hour_of_day', 'speed_mean',
       'speed_max', 'speed_std', 'elevation_p', 'elevation_n', 'ele_X_speed_p',
       'ele_X_speed_n', 'max_elevation_p', 'max_elevation_n',
       'max_ele_X_speed_p', 'max_ele_X_speed_n', 'eleXspeed_count_p',
       'eleXspeed_count_n', 'average_acceleration', 'average_deacceleration',
       'std_acc_dacc', 'max_acceleration', 'max_deacceleration',
       'no_data_points', 'no_acc_points', 'no_deacc_points', 'stop_count',
       'cluster'],
      dtype='object')

In [4]:
dataset.drop(columns=['index', 'segment_starting_time', 'segment_ending_time', 'trip_id', 'date', 'end_terminal', 'start_terminal', 'average_acceleration', 'average_deacceleration', 'std_acc_dacc', 'max_acceleration', 'max_deacceleration', 'no_data_points', 'no_acc_points', 'no_deacc_points'], inplace=True)

In [5]:
dataset.columns

Index(['segment_id', 'road_section_id', 'deviceid', 'direction', 'day_of_week',
       'hour_of_day', 'speed_mean', 'speed_max', 'speed_std', 'elevation_p',
       'elevation_n', 'ele_X_speed_p', 'ele_X_speed_n', 'max_elevation_p',
       'max_elevation_n', 'max_ele_X_speed_p', 'max_ele_X_speed_n',
       'eleXspeed_count_p', 'eleXspeed_count_n', 'stop_count', 'cluster'],
      dtype='object')

In [6]:
dataset.drop(columns=['speed_mean', 'speed_max', 'speed_std', 'elevation_p',
       'elevation_n', 'ele_X_speed_p', 'ele_X_speed_n', 'max_elevation_p',
       'max_elevation_n', 'max_ele_X_speed_p', 'max_ele_X_speed_n',
       'eleXspeed_count_p', 'eleXspeed_count_n', 'stop_count',], inplace=True)

In [7]:
dataset

Unnamed: 0,segment_id,road_section_id,deviceid,direction,day_of_week,hour_of_day,cluster
0,1,-17,116,2,5,7,2
1,2,-16,116,2,5,7,0
2,3,-15,116,2,5,7,0
3,4,-14,116,2,5,7,0
4,5,-13,116,2,5,7,1
...,...,...,...,...,...,...,...
281191,287398,-5,1377,2,1,17,1
281192,287399,-4,1377,2,1,17,1
281193,287400,-3,1377,2,1,17,0
281194,287401,-2,1377,2,1,17,1


In [8]:
train = dataset.drop(columns=['segment_id'])

In [9]:
train

Unnamed: 0,road_section_id,deviceid,direction,day_of_week,hour_of_day,cluster
0,-17,116,2,5,7,2
1,-16,116,2,5,7,0
2,-15,116,2,5,7,0
3,-14,116,2,5,7,0
4,-13,116,2,5,7,1
...,...,...,...,...,...,...
281191,-5,1377,2,1,17,1
281192,-4,1377,2,1,17,1
281193,-3,1377,2,1,17,0
281194,-2,1377,2,1,17,1


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( train.drop(columns=['cluster']), train['cluster'], test_size=0.33, random_state=42)

In [16]:
rf_params = {"bootstrap": [False, True],
             "n_estimators": [60, 70, 80, 90, 100],
             "max_features": [0.6, 0.65, 0.7, 0.75, 0.8],
             "min_samples_leaf": [8, 10, 12, 14],
             "min_samples_split": [3, 5, 7]
        }

rf_search = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=rf_params, scoring="accuracy", n_iter=100)
rf_search.fit(X_train, y_train)
best_rf = rf_search.best_estimator_

In [18]:
rf =best_rf

In [22]:
best_rf

In [None]:
RandomForestClassifier(max_features=0.65, min_samples_leaf=14,
                       min_samples_split=3, n_estimators=80)

In [21]:
rf_prediction = best_rf.predict(X_test)
metrics.accuracy_score(y_test, rf_prediction)

0.5757098981626165

In [23]:
rf = RandomForestClassifier(max_features=0.65, min_samples_leaf=14,min_samples_split=3, n_estimators=1000)
rf.fit(X_train, y_train)
rf_prediction = rf.predict(X_test)
metrics.accuracy_score(y_test, rf_prediction)

0.5763672611670887

In [13]:
import xgboost as xgb
# Instantiate an XGBoost classifier object
clf = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, seed=42)

# Train the classifier
clf.fit(X_train, y_train)
# Predictions
y_pred = clf.predict(X_test)

# Calculate the accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print(metrics.classification_report(y_test, y_pred))
print("Accuracy: %.2f%%" % (accuracy * 100.0))

              precision    recall  f1-score   support

           0       0.61      0.34      0.44     20988
           1       0.60      0.77      0.67     45390
           2       0.52      0.45      0.48     26417

    accuracy                           0.58     92795
   macro avg       0.58      0.52      0.53     92795
weighted avg       0.58      0.58      0.57     92795

Accuracy: 58.10%


In [26]:
clf.feature_importances_

array([0.13420066, 0.67424333, 0.        , 0.04570119, 0.1458548 ],
      dtype=float32)

In [29]:
params = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_child_weight': [1, 2, 3, 4],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'learning_rate': [0.01, 0.02, 0.05, 0.1],
    'n_estimators': [100, 150, 200, 250]
}
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', seed=42)

# Setup the Randomized Search with cross-validation
random_search = RandomizedSearchCV(xgb_clf, param_distributions=params, n_iter=100, cv=3, verbose=2, random_state=42, scoring='accuracy')

# Fit Randomized Search
random_search.fit(X_train, y_train)
best_clf = random_search.best_estimator_

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.02, max_depth=3, min_child_weight=4, n_estimators=100, subsample=0.9; total time=   2.6s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.02, max_depth=3, min_child_weight=4, n_estimators=100, subsample=0.9; total time=   2.3s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.02, max_depth=3, min_child_weight=4, n_estimators=100, subsample=0.9; total time=   2.3s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.05, max_depth=4, min_child_weight=2, n_estimators=250, subsample=0.6; total time=   6.3s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.05, max_depth=4, min_child_weight=2, n_estimators=250, subsample=0.6; total time=   8.1s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.05, max_depth=4, min_child_weight=2, n_estimators=250, subsample=0.6; total time=   8.1s
[CV] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.1, max_de

In [30]:
best_clf

In [31]:
rf_prediction = best_clf.predict(X_test)
metrics.accuracy_score(y_test, rf_prediction)

0.5822943046500351

In [None]:
metrics.f