In [1]:
from sklearn import svm, metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lightgbm import LGBMClassifier
import pandas as pd

In [2]:
# Load the prepared accident records; the 'ID' column becomes the row index.
accidents_csv_path = '../../data/MA3_finished.csv'
accidents = pd.read_csv(accidents_csv_path, index_col='ID')

In [3]:
# Show the dtype pandas assigned to each column of the loaded frame.
accidents.dtypes

Unnamed: 0                 int64
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Temperature(F)           float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Weather_Condition         object
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way                    bool
Junction                    bool
No_Exit                     bool
Railway                     bool
Roundabout

In [4]:
# Feature selection: name the columns that go into the model, grouped by how
# they are prepared. Each column is listed exactly once so that no column is
# label-encoded more than once.

# Object/bool columns that are converted to integer codes.
categorical_columns = [
    'Division', 'Region', 'State Name', 'Astronomical_Twilight', 'Nautical_Twilight', 'Civil_Twilight',
    'Sunrise_Sunset', 'Weather_Condition', 'Airport_Code', 'Timezone', 'Country', 'Zipcode',
    'State', 'County', 'City', 'Street', 'Description', 'Start_Time', 'End_Time',
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
    'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
]

# Float/int columns that are copied over unchanged (includes the target 'Severity').
numeric_columns = [
    'Severity', 'Start_Lng', 'Start_Lat', 'Temperature(F)', 'Humidity(%)',
    'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
]

# Kept under their original names as column subsets of the source frame.
obj_bool_features = accidents[categorical_columns]
float_int_features = accidents[numeric_columns]

# LabelEncoder handles one column per call, hence the loop; it is re-fitted for
# every column, so after the loop it only holds the mapping of the last column.
# NOTE(review): LabelEncoder assigns arbitrary integer codes; for free-text or
# timestamp columns such as 'Description', 'Start_Time' and 'End_Time' these
# codes presumably carry little meaning - confirm they should be features.
le = LabelEncoder()

# Start from the source index so the encoded arrays (assigned by position) and
# the numeric Series (assigned by index alignment) refer to the same rows,
# whatever values the 'ID' index holds.
accidents_encoded = pd.DataFrame(index=accidents.index)

for column in categorical_columns:
    accidents_encoded[column] = le.fit_transform(accidents[column])

for column in numeric_columns:
    accidents_encoded[column] = accidents[column]

# Check that both groups of columns ended up in the combined frame.
accidents_encoded.columns

Index(['Division', 'Region', 'State Name', 'Astronomical_Twilight',
       'Nautical_Twilight', 'Civil_Twilight', 'Sunrise_Sunset',
       'Weather_Condition', 'Airport_Code', 'Timezone', 'Country', 'Zipcode',
       'State', 'County', 'City', 'Street', 'Description', 'Start_Time',
       'End_Time', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction',
       'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Severity',
       'Start_Lng', 'Start_Lat', 'Temperature(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)'],
      dtype='object')

In [5]:
# Check that the encoding worked: a bare last expression lets the notebook
# render the frame with its rich (truncated) display instead of plain text.
accidents_encoded

         Division  Region  State Name  Astronomical_Twilight  \
0               0       0          33                      0   
1               0       0          33                      0   
2               0       0          33                      0   
3               0       0          33                      0   
4               0       0          33                      0   
...           ...     ...         ...                    ...   
7051551         5       3           3                      0   
7051552         5       3           3                      0   
7051553         5       3           3                      0   
7051554         5       3           3                      0   
7051555         5       3           3                      0   

         Nautical_Twilight  Civil_Twilight  Sunrise_Sunset  Weather_Condition  \
0                        0               1               1                 86   
1                        0               0               1           

In [6]:
# Count how many records fall into each severity level.
severity_counts = accidents_encoded['Severity'].value_counts()
print(severity_counts)

Severity
2    5671502
3    1136465
4     178538
1      65051
Name: count, dtype: int64


In [7]:
# Count the missing values in every feature column ...
nan_counts = accidents_encoded.isnull().sum()
# ... and show the per-column totals.
print(nan_counts)

Division                 0
Region                   0
State Name               0
Astronomical_Twilight    0
Nautical_Twilight        0
Civil_Twilight           0
Sunrise_Sunset           0
Weather_Condition        0
Airport_Code             0
Timezone                 0
Country                  0
Zipcode                  0
State                    0
County                   0
City                     0
Street                   0
Description              0
Start_Time               0
End_Time                 0
Amenity                  0
Bump                     0
Crossing                 0
Give_Way                 0
Junction                 0
No_Exit                  0
Railway                  0
Roundabout               0
Station                  0
Stop                     0
Traffic_Calming          0
Traffic_Signal           0
Turning_Loop             0
Severity                 0
Start_Lng                0
Start_Lat                0
Temperature(F)           0
Humidity(%)              0
P

In [8]:
# Target: the severity level of each accident.
y = accidents_encoded['Severity']
# Features: every column except the target.
X = accidents_encoded.drop('Severity', axis=1)

# Standardise the features.
# NOTE(review): the scaler is fitted on all rows here, i.e. before any
# train/dev/test split is made - confirm this is intended.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [19]:
# Feature-selection step and LightGBM classifier used in the following cells.
dim_reduction = VarianceThreshold(threshold=0.1)
lgbm_classifier = LGBMClassifier(random_state=42)

# Split into training data (50%) and a held-out half, stratified by severity.
# Author's note: changing test_size from 0.5 to 0.2 made no difference to the result.
X_train, X_test1, y_train, y_test1 = train_test_split(
    X_scaled, y, test_size=0.5, stratify=y, random_state=42
)
# Split the held-out half evenly into a dev set and a test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_test1, y_test1, test_size=0.5, stratify=y_test1, random_state=42
)

# Report the shapes of the training and dev partitions.
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_dev", X_dev),
    ("y_dev", y_dev),
):
    print(f"{split_name} Shape:", split_data.shape)

X_train Shape: (3525778, 39)
y_train Shape: (3525778,)
X_dev Shape: (1762889, 39)
y_dev Shape: (1762889,)


In [20]:
# Fit the classifier on the training split.
lgbm_classifier.fit(X_train, y_train)

# Predict on the dev split and report per-class metrics.
predicted = lgbm_classifier.predict(X_dev)
baseline_report = metrics.classification_report(y_dev, predicted)

print(f"Classification report for classifier {lgbm_classifier}:\n{baseline_report}\n")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.631483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3425
[LightGBM] [Info] Number of data points in the train set: 3525778, number of used features: 37
[LightGBM] [Info] Start training from score -4.685847
[LightGBM] [Info] Start training from score -0.217794
[LightGBM] [Info] Start training from score -1.825325
[LightGBM] [Info] Start training from score -3.676202
Classification report for classifier LGBMClassifier(random_state=42):
              precision    recall  f1-score   support

           1       0.76      0.64      0.70     16263
           2       0.94      0.97      0.96   1417875
           3       0.86      0.78      0.82    284116
           4       0.96      0.47      0.63     44635

    accuracy                           0.93   1762889
   macro avg       0.88      0.72

In [21]:
# Build a pipeline that first drops low-variance features and then classifies.
# NOTE(review): the pipeline receives the same lgbm_classifier object that was
# fitted on all features in the previous cell; fitting the pipeline presumably
# refits that object in place - confirm if the baseline model is needed later.
pipeline = Pipeline([
    ('dim_reduction', dim_reduction),
    ('classifier', lgbm_classifier)
])

# Train the pipeline on the training split.
pipeline.fit(X_train, y_train)

# Predict on the dev split with the fitted pipeline.
predicted_dev = pipeline.predict(X_dev)

# Print the per-feature variances learned by the VarianceThreshold step.
# NOTE(review): the inputs were standardised beforehand, so a threshold of 0.1
# presumably only removes constant columns - confirm this is the intent.
print("Varianzen der Features nach VarianceThreshold:")
print(pipeline.named_steps['dim_reduction'].variances_)

print("-----------------------------------------------------------------------")

# Report the pipeline's own predictions (predicted_dev). The earlier version
# passed `predicted`, the baseline classifier's output, so this report merely
# repeated the baseline numbers.
print(f"Classification report for VarianceThreshold dimensionality reduction:\n"
      f"{metrics.classification_report(y_dev, predicted_dev)}\n")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.203714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3425
[LightGBM] [Info] Number of data points in the train set: 3525778, number of used features: 37
[LightGBM] [Info] Start training from score -4.685847
[LightGBM] [Info] Start training from score -0.217794
[LightGBM] [Info] Start training from score -1.825325
[LightGBM] [Info] Start training from score -3.676202
Varianzen der Features nach VarianceThreshold:
[0.99821092 1.0001091  0.99943914 1.00015535 0.99978423 0.99976302
 0.99973105 1.00045253 0.99974606 0.9996395  0.         1.0005977
 0.99935362 0.99965903 1.00040312 0.99972947 0.99996265 1.00053347
 1.00051734 1.00241615 0.99603963 1.00056802 1.01293362 1.00098845
 0.99763211 1.00422275 0.89743895 1.00205095 1.00070603 0.99169871
 0.99941941 0.         1.00037448 0.99969215 1.0

In [12]:
#Eine Varianz von 0. kennzeichnet die ausgeschlossenen Features; in unserem Fall sind es die Features 'Country' (Index 10) und 'Turning_Loop' (Index 31)
#Start mit 39 Columns - Ende mit 37 Columns
#Before VarianceThreshold :the overhead of testing was 0.631483 seconds.
#After Variance Threshold :the overhead of testing was 0.203714 seconds. 

#Unser Ziel war es, die Features zu identifizieren, die eine niedrige Varianz haben und ausgeschlossen werden können, um
#möglicherweise die Leistung zu steigern.


#Es konnten folgende Ziele erreicht werden:

#Analyse: Durch das Entfernen der Features 'Turning_Loop' und 'Country'
#konzentriert sich die Analyse auf relevantere und variablere Merkmale.

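#Effizienz: Die Reduzierung der Feature-Anzahl von 39 auf 37 und der kleinere geloggte Overhead könnten darauf hindeuten,
#dass das Training effizienter wurde, ohne die Aussagekraft der Features zu beeinträchtigen.
#Einschränkung: Der Wert "overhead of testing" stammt aus LightGBMs automatischer Wahl des Multi-Threading-Modus und ist keine
#Trainings- oder Testzeit; außerdem meldet LightGBM schon ohne VarianceThreshold "number of used features: 37".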

#Rauschen reduzieren: Rauschen zu minimieren, indem Features mit geringer Varianz entfernt werden. 
#Dies kann zu einem präziseren Modell beitragen.