In [1]:
from sklearn import svm, metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lightgbm import LGBMClassifier
import pandas as pd

In [2]:
accidents = pd.read_csv('../../data/MA3_finished.csv', index_col='ID')

In [3]:
accidents.dtypes

Unnamed: 0                 int64
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Temperature(F)           float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Weather_Condition         object
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way                    bool
Junction                    bool
No_Exit                     bool
Railway                     bool
Roundabout

In [4]:
#featureselection
#features to encode
obj_bool_features = accidents[['Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
                         'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
                         'Sunrise_Sunset']]

#encode features above
le = LabelEncoder()
#create copy of df
accidents_encoded = pd.DataFrame()
#LabelEncoder can only encode one column at a time --> forloop
#obj_bool_feat = le.fit_transform(obj_bool_features)
for X in obj_bool_features:
    accidents_encoded[X] = le.fit_transform(accidents[X])

#float/int features
float_int_features = accidents[['Severity','Start_Lng', 'Start_Lat', 'Temperature(F)', 'Humidity(%)', 
                                'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',]]

for X in float_int_features:
    accidents_encoded[X] = accidents[X]

#check if combination worked 
accidents_encoded.columns

Index(['Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset',
       'Severity', 'Start_Lng', 'Start_Lat', 'Temperature(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)'],
      dtype='object')

In [5]:
#check if encoding worked 
print(accidents_encoded)

         Weather_Condition  Amenity  Bump  Crossing  Give_Way  Junction  \
0                       86        0     0         0         0         0   
1                       83        0     0         0         0         0   
2                       83        0     0         0         0         0   
3                       60        0     0         0         0         0   
4                       86        0     0         0         0         0   
...                    ...      ...   ...       ...       ...       ...   
7051551                 15        0     0         0         0         0   
7051552                 15        0     0         0         0         0   
7051553                 89        0     0         0         0         1   
7051554                 15        0     0         0         0         0   
7051555                 15        0     0         0         0         0   

         No_Exit  Railway  Roundabout  Station  ...  Turning_Loop  \
0              0        0     

In [6]:
#check amount of severity for each level
print(accidents_encoded['Severity'].value_counts())

Severity
2    5671502
3    1136465
4     178538
1      65051
Name: count, dtype: int64


In [7]:

# Zählen Sie NaN-Werte in jedem Feature
nan_counts = accidents_encoded.isna().sum()

# Ergebnis anzeigen
print(nan_counts)


Weather_Condition    0
Amenity              0
Bump                 0
Crossing             0
Give_Way             0
Junction             0
No_Exit              0
Railway              0
Roundabout           0
Station              0
Stop                 0
Traffic_Calming      0
Traffic_Signal       0
Turning_Loop         0
Sunrise_Sunset       0
Severity             0
Start_Lng            0
Start_Lat            0
Temperature(F)       0
Humidity(%)          0
Pressure(in)         0
Visibility(mi)       0
Wind_Speed(mph)      0
dtype: int64


In [8]:
#Features alles außer Severity
X = accidents_encoded.drop(columns=['Severity'])
#target         
y = accidents_encoded['Severity']


#scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Feature-Selection und LGBM Classifier
dim_reduction = VarianceThreshold(0.1)
lgbm_classifier = LGBMClassifier(n_estimator = 50, random_state=42)

# Trainings- und Testdaten aufteilen
X_train, X_test1, y_train, y_test1 = train_test_split(X_scaled, y, test_size=0.5, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_test1, y_test1, test_size=0.5, random_state=42)

print("X_train Shape:", X_train.shape)
print("y_train Shape:", y_train.shape)

X_train Shape: (3525778, 22)
y_train Shape: (3525778,)


In [10]:
# Pipeline erstellen
pipeline = Pipeline([
    ('dim_reduction', dim_reduction),
    ('classifier', lgbm_classifier)
])

# Trainieren der Pipeline
pipeline.fit(X_train, y_train)

# Vorhersagen machen
predicted_dev = pipeline.predict(X_dev)

# Varianz der einzelnen Features ausgeben
print("Varianzen der Features nach VarianceThreshold:")
print(dim_reduction.variances_)
print("-----------------------------------------------------------------------")
# Bericht für die Pipeline ausgeben
print("Classification Report für die Pipeline (VarianceThreshold + SVC):")
print(metrics.classification_report(y_dev, predicted_dev))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1266
[LightGBM] [Info] Number of data points in the train set: 3525778, number of used features: 21
[LightGBM] [Info] Start training from score -4.684127
[LightGBM] [Info] Start training from score -0.217535
[LightGBM] [Info] Start training from score -1.826797
[LightGBM] [Info] Start training from score -3.675698
Varianzen der Features nach VarianceThreshold:
[1.00039496 1.00046077 1.01980163 1.00113702 1.00100431 0.99898987
 1.00159693 0.99970538 1.02564015 1.00198833 1.00105405 1.01087735
 1.00113847 0.         0.99998773 0.99998511 1.00103585 1.00006534
 1.00022519 0.99975995 1.00407491 0.99257392]
-----------------------------------------------------------------------
Classification Report für die Pipeline (VarianceThreshold + SVC

In [11]:
#Identifizierung von Features mit unter 0.1 Variance:

X = accidents_encoded.drop(columns=['Severity'])
                           
y = accidents_encoded['Severity']
#set features

#scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
#Ausselektiert 'Amenity' (Varianz: 0.0123) 'Bump' (Varianz: 0.000474) 
#'Give_Way' (Varianz: 0.101) 'Junction' (Varianz: 0.00462) 'No_Exit' (Varianz: 0.0676) 
#'Railway' (Varianz: 0.00257) 'Roundabout' (Varianz: 0.00851) 'Stop' (Varianz: 3.40e-05) 
#'Traffic_Calming' (Varianz: 0.0258) 'Traffic_Signal' (Varianz: 0.0269) 
#'Turning_Loop' (Varianz: 0.001) 'Sunrise_Sunset' (Varianz: 0.126)

X_selected = accidents_encoded.drop(columns=['Severity', 'Amenity', 'Bump', 'Give_Way','Junction', 'No_Exit' , 'Railway',
                                   'Roundabout','Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset'])

y_selected = accidents['Severity']


#scale features
scaler = StandardScaler()
X_selected_scaled = scaler.fit_transform(X_selected)

In [13]:
# Trainings- und Testdaten aufteilen
X_selected_train, X_selected_test1, y_selected_train, y_selected_test1 = train_test_split(X_selected_scaled, y_selected, test_size=0.5, random_state=42)
X_selected_dev, X_selected_test, y_selected_dev, y_selected_test = train_test_split(X_selected_test1, y_selected_test1, test_size=0.5, random_state=42)

print("X_selected_train Shape:", X_selected_train.shape)
print("Y_selected_train Shape:", y_selected_train.shape)


X_selected_train Shape: (3525778, 10)
Y_selected_train Shape: (3525778,)


In [14]:
# Pipeline erstellen
pipeline_selected = Pipeline([
    ('dim_reduction', dim_reduction),
    ('classifier', lgbm_classifier)
])

# Trainieren der Pipeline
pipeline_selected.fit(X_selected_train, y_selected_train)

# Vorhersagen machen
selected_predicted_dev = pipeline_selected.predict(X_selected_dev)

# Varianz der einzelnen Features ausgeben
print("Varianzen der Features nach VarianceThreshold:")
print(dim_reduction.variances_)
print("-----------------------------------------------------------------------")
# Bericht für die Pipeline ausgeben
print("Classification Report für die Pipeline (VarianceThreshold + SVC):")
print(metrics.classification_report(y_selected_dev, selected_predicted_dev))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069272 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1233
[LightGBM] [Info] Number of data points in the train set: 3525778, number of used features: 10
[LightGBM] [Info] Start training from score -4.684127
[LightGBM] [Info] Start training from score -0.217535
[LightGBM] [Info] Start training from score -1.826797
[LightGBM] [Info] Start training from score -3.675698
Varianzen der Features nach VarianceThreshold:
[1.00039496 1.00113702 1.00198833 0.99998511 1.00103585 1.00006534
 1.00022519 0.99975995 1.00407491 0.99257392]
-----------------------------------------------------------------------
Classification Report für die Pipeline (VarianceThreshold + SVC):
              precision    recall  f1-score   support

           1       0.14      0.00      0.00     16366
           2       0.8

In [None]:
#Das Ziel wurde nicht erreicht. vorallem die recall-werte und der f1-score haben sich verschlechtert