# Predict

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
ori_data = pd.read_csv('./data/v3.csv', index_col=False)

In [3]:
ori_data.columns

Index(['Twilight', 'Severity', 'Start_Lat', 'Start_Lng', 'Distance(mi)',
       'Street', 'City', 'County', 'State', 'Temperature(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Hour', 'Minute', 'Day', 'Month',
       'Weekday', 'Year', 'elapsed_time'],
      dtype='object')

In [4]:
ori_data.head(1)

Unnamed: 0,Twilight,Severity,Start_Lat,Start_Lng,Distance(mi),Street,City,County,State,Temperature(F),...,Stop,Traffic_Calming,Traffic_Signal,Hour,Minute,Day,Month,Weekday,Year,elapsed_time
0,0,4,40.630609,-75.470606,0.061,309187,11881,882,36,31.0,...,0,0,0,20,18,14,1,3,2016,616470.5


In [5]:
X = ori_data.drop(['Severity', 'Start_Lat', 'Start_Lng', 'Hour', 'Minute', 'Hour', 'Weekday', 'Year', 'Month'], axis=1)
y = ori_data['Severity']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=99)

In [6]:
print(f'Length of training data: {len(x_train)}, and its distribution among each severity {Counter(y_train)}')
# print(f'Length of validation data: {len(x_valid)}, and its distribution among each severity {Counter(y_valid)}')
print(f'Length of testing data: {len(x_test)}, and its distribution among each severity {Counter(y_test)}')

Length of training data: 5960350, and its distribution among each severity Counter({2: 4754033, 3: 1002162, 4: 151561, 1: 52594})
Length of testing data: 1490088, and its distribution among each severity Counter({2: 1188776, 3: 250247, 4: 38034, 1: 13031})


In [7]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

x_res, y_res = rus.fit_resample(x_train, y_train)
print('After under sampling:')
print(f'Length of training data: {len(x_res)}, and its distribution among each severity {Counter(y_res)}')

After under sampling:
Length of training data: 210376, and its distribution among each severity Counter({1: 52594, 2: 52594, 3: 52594, 4: 52594})


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

dt = RandomForestClassifier()

def custom_scoring(y_true, y_pred):
    weights = {
        1: 10,
        2: 1,
        3: 1,
        4: 10
    }
    weighted_f1 = f1_score(y_true, y_pred, average='weighted', labels=[1, 2, 3, 4], sample_weight=[weights[label] for label in y_true])
    return weighted_f1

# Define the hyperparameter grid to search
param_grid = {
    'criterion': ['gini'],
    'n_estimators': [100, 200, 300],
    'max_depth': [15, 25, None],
    # 'min_samples_split': [2, 5],
    # 'min_samples_leaf': [1, 2, 4]
}

score = make_scorer(custom_scoring)

# Use GridSearchCV to perform the search
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring=score)
grid_search.fit(x_res, y_res)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
dt = DecisionTreeClassifier(max_depth=15)
dt.fit(x_res, y_res)
y_pred = dt.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.07      0.84      0.12     13031
           2       0.95      0.57      0.71   1188776
           3       0.47      0.72      0.57    250247
           4       0.12      0.76      0.21     38034

    accuracy                           0.60   1490088
   macro avg       0.40      0.72      0.40   1490088
weighted avg       0.84      0.60      0.67   1490088



In [None]:
clf = RandomForestClassifier()
clf.fit(x_res, y_res)
y_pred = clf.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.25      0.96      0.40     13031
           2       0.97      0.68      0.80   1188776
           3       0.52      0.80      0.63    250247
           4       0.15      0.86      0.25     38034

    accuracy                           0.71   1490088
   macro avg       0.47      0.83      0.52   1490088
weighted avg       0.87      0.71      0.76   1490088



In [None]:
abc = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=5), n_estimators=100)
abc.fit(x_res, y_res)
y_pred = abc.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.21      0.93      0.35     13031
           2       0.93      0.52      0.66   1188776
           3       0.49      0.69      0.57    250247
           4       0.07      0.77      0.13     38034

    accuracy                           0.55   1490088
   macro avg       0.43      0.73      0.43   1490088
weighted avg       0.83      0.55      0.63   1490088



In [None]:
estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('svr', make_pipeline(StandardScaler(), LinearSVC(dual="auto", random_state=42)))
]
sclf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
sclf.fit(x_res, y_res)
y_pred = abc.predict(x_test)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           1       0.21      0.93      0.35     13031
           2       0.93      0.52      0.66   1188776
           3       0.49      0.69      0.57    250247
           4       0.07      0.77      0.13     38034

    accuracy                           0.55   1490088
   macro avg       0.43      0.73      0.43   1490088
weighted avg       0.83      0.55      0.63   1490088



In [None]:
svm = make_pipeline(StandardScaler(), LinearSVC(dual="auto", random_state=42, max_iter=50000))
svm.fit(x_res, y_res)
y_pred = svm.predict(x_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.02      0.58      0.03     13031
           2       0.84      0.04      0.08   1188776
           3       0.23      0.49      0.31    250247
           4       0.04      0.51      0.08     38034

    accuracy                           0.13   1490088
   macro avg       0.28      0.40      0.13   1490088
weighted avg       0.71      0.13      0.12   1490088



In [None]:
print(y_pred)

[2 1 3 ... 4 3 3]
