In [1]:
#Klassifikation Meht3 LGBM-Classifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pandas as pd


In [2]:
df_accidents = pd.read_csv('data/MA3_finished.csv', index_col='ID')

In [3]:
df_accidents.dtypes

Unnamed: 0                 int64
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Temperature(F)           float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Weather_Condition         object
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way                    bool
Junction                    bool
No_Exit                     bool
Railway                     bool
Roundabout

In [4]:
#featureselection
#features to encode
obj_bool_features = df_accidents[['Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
                         'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
                         'Sunrise_Sunset']]

In [5]:
#encode features above
le = LabelEncoder()
#create copy of df
df_encoded = pd.DataFrame()
#LabelEncoder can only encode one column at a time --> forloop
#obj_bool_feat = le.fit_transform(obj_bool_features)
for feature in obj_bool_features:
    df_encoded[feature] = le.fit_transform(df_accidents[feature])

In [6]:
#float/int features
float_int_features = df_accidents[['Severity','Start_Lng', 'Start_Lat', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',]]

for feature in float_int_features:
    df_encoded[feature] = df_accidents[feature]

#check if combination worked 
df_encoded.columns

Index(['Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset',
       'Severity', 'Start_Lng', 'Start_Lat', 'Temperature(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)'],
      dtype='object')

In [7]:
#check if encoding worked 
print(df_encoded)

         Weather_Condition  Amenity  Bump  Crossing  Give_Way  Junction  \
0                       86        0     0         0         0         0   
1                       83        0     0         0         0         0   
2                       83        0     0         0         0         0   
3                       60        0     0         0         0         0   
4                       86        0     0         0         0         0   
...                    ...      ...   ...       ...       ...       ...   
7051551                 15        0     0         0         0         0   
7051552                 15        0     0         0         0         0   
7051553                 89        0     0         0         0         1   
7051554                 15        0     0         0         0         0   
7051555                 15        0     0         0         0         0   

         No_Exit  Railway  Roundabout  Station  ...  Turning_Loop  \
0              0        0     

In [8]:
#check amount of severity for each level
print(df_encoded['Severity'].value_counts())

Severity
2    5671502
3    1136465
4     178538
1      65051
Name: count, dtype: int64


In [9]:
#set Target
target = df_encoded['Severity']
#set features
features = df_encoded.drop(columns=['Severity'])
#scale features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [10]:
#Split Data 80/20
features_train, features_test, target_train, target_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

In [11]:
#lgbm-classifier
lgbm_classifier = LGBMClassifier(n_estimator = 50, random_state=42)

#train classifier
lgbm_classifier.fit(features_train, target_train)

#predict
target_predict = lgbm_classifier.predict(features_test)

found 0 physical cores < 1
  File "C:\Users\forte\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.171328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train set: 5641244, number of used features: 21
[LightGBM] [Info] Start training from score -4.687059
[LightGBM] [Info] Start training from score -0.217710
[LightGBM] [Info] Start training from score -1.825832
[LightGBM] [Info] Start training from score -3.675211


In [12]:
accuracy = accuracy_score(target_test, target_predict)
print('Accuracy= ', {accuracy})
print(classification_report(target_test, target_predict))
print(confusion_matrix(target_test, target_predict))

Accuracy=  {0.8168547101634248}
              precision    recall  f1-score   support

           1       0.40      0.03      0.05     13074
           2       0.83      0.98      0.90   1133919
           3       0.62      0.18      0.28    227753
           4       0.66      0.01      0.01     35566

    accuracy                           0.82   1410312
   macro avg       0.63      0.30      0.31   1410312
weighted avg       0.79      0.82      0.77   1410312

[[    337   12670      67       0]
 [    445 1110730   22668      76]
 [     39  186958   40704      52]
 [     18   32920    2379     249]]
