In [1]:
from sklearn import svm, metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
from imblearn.over_sampling import SMOTE

In [2]:
accidents = pd.read_csv('data/MA3_finished.csv', index_col='ID')
accidents = accidents.dropna()
accidents = accidents.sample(n=70000, random_state=1)

In [3]:
print(accidents.describe())

#count = Anzahl der Daten
#mean = Mittelwert
#std = Standardabweichung
#Min & Max werte
#Quartile (25%, 50%, 75%):

         Unnamed: 0      Severity     Start_Lat     Start_Lng  Distance(mi)  \
count  7.000000e+04  70000.000000  70000.000000  70000.000000  70000.000000   
mean   3.511065e+06      2.203957     36.194397    -94.511348      0.567594   
std    2.042222e+06      0.482484      5.121751     17.269038      1.842401   
min    2.400000e+01      1.000000     24.599692   -124.471670      0.000000   
25%    1.740537e+06      2.000000     33.277239   -117.132985      0.000000   
50%    3.504510e+06      2.000000     35.834835    -87.653003      0.035000   
75%    5.291416e+06      2.000000     40.121849    -80.365944      0.483000   
max    7.051429e+06      4.000000     48.986462    -69.220414    176.279999   

       Temperature(F)   Humidity(%)  Pressure(in)  Visibility(mi)  \
count    70000.000000  70000.000000  70000.000000    70000.000000   
mean        61.824187     64.411800     29.514970        9.102158   
std         19.054524     22.805129      1.022298        2.571070   
min        -

In [4]:
#featureselection
X = accidents[['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
               'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
               'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']]

y = accidents['Severity']

#Encoding text/categorical variables
le = LabelEncoder()
X.loc[:,'Weather_Condition']= le.fit_transform(X['Weather_Condition'])

print(X.describe())

       Temperature(F)   Humidity(%)  Pressure(in)  Visibility(mi)  \
count    70000.000000  70000.000000  70000.000000    70000.000000   
mean        61.824187     64.411800     29.514970        9.102158   
std         19.054524     22.805129      1.022298        2.571070   
min        -27.000000      1.000000     19.860000        0.000000   
25%         49.000000     48.000000     29.340000       10.000000   
50%         64.000000     67.000000     29.840000       10.000000   
75%         76.000000     84.000000     30.020000       10.000000   
max        172.000000    100.000000     56.540000       80.000000   

       Wind_Speed(mph)  
count     70000.000000  
mean          7.710206  
std           6.212866  
min           0.000000  
25%           4.600000  
50%           7.000000  
75%          10.400000  
max         822.800000  


In [5]:
# Feature-Selection und SVC-Classifier
dim_reduction = VarianceThreshold(0.1)
classifier = svm.SVC()

# Trainings- und Testdaten aufteilen
X_train, X_test1, y_train, y_test1 = train_test_split(X, y, test_size=0.5, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_test1, y_test1, test_size=0.5, random_state=42)

print("X_train Shape:", X_train.shape)
print("y_train Shape:", y_train.shape)

X_train Shape: (35000, 19)
y_train Shape: (35000,)


In [6]:
#Balancing severity für recall, precision
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [7]:
# Pipeline erstellen
pipeline = Pipeline([
    ('dim_reduction', dim_reduction),
    ('classifier', classifier)
])

# Trainieren der Pipeline
pipeline.fit(X_train, y_train)

# Vorhersagen machen
predicted_dev = pipeline.predict(X_dev)

# Varianz der einzelnen Features ausgeben
print("Varianzen der Features nach VarianceThreshold:")
print(dim_reduction.variances_)
print("-----------------------------------------------------------------------")
# Bericht für die Pipeline ausgeben
print("Classification Report für die Pipeline (VarianceThreshold + SVC):")
print(metrics.classification_report(y_dev, predicted_dev))


Varianzen der Features nach VarianceThreshold:
[3.43301627e+02 5.55248945e+02 9.02122274e-01 4.85056658e+00
 2.34245874e+01 3.67549211e+02 1.61812563e-02 9.30875013e-04
 1.51572714e-01 1.29088154e-02 1.20649821e-01 2.08981798e-03
 1.20005783e-02 8.87366562e-06 2.98853866e-02 3.17631488e-02
 1.90421513e-03 1.87717218e-01 0.00000000e+00]
-----------------------------------------------------------------------
Classification Report für die Pipeline (VarianceThreshold + SVC):
              precision    recall  f1-score   support

           1       0.02      0.59      0.04       154
           2       0.84      0.24      0.38     14033
           3       0.24      0.44      0.31      2848
           4       0.04      0.31      0.07       465

    accuracy                           0.28     17500
   macro avg       0.28      0.40      0.20     17500
weighted avg       0.71      0.28      0.36     17500

