In [1]:
from sklearn import svm, metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd

In [2]:
accidents = pd.read_csv('data/MA3_finished.csv', index_col='ID')
accidents = accidents.dropna()
accidents = accidents.sample(n=35000, random_state=1)

In [3]:
print(accidents.describe())

#count = Anzahl der Daten
#mean = Mittelwert
#std = Standardabweichung
#Min & Max
#Quartile (25%, 50%, 75%):

         Unnamed: 0      Severity     Start_Lat     Start_Lng  Distance(mi)  \
count  3.500000e+04  35000.000000  35000.000000  35000.000000  35000.000000   
mean   3.531642e+06      2.202286     36.202748    -94.545803      0.568868   
std    2.042209e+06      0.481487      5.118009     17.298071      1.793030   
min    2.250000e+02      1.000000     24.599692   -124.471670      0.000000   
25%    1.767246e+06      2.000000     33.326624   -117.150100      0.000000   
50%    3.525615e+06      2.000000     35.838262    -87.706076      0.037000   
75%    5.318387e+06      2.000000     40.127564    -80.355472      0.486000   
max    7.051279e+06      4.000000     48.986462    -69.220414    112.968000   

       Temperature(F)   Humidity(%)  Pressure(in)  Visibility(mi)  \
count    35000.000000  35000.000000  35000.000000    35000.000000   
mean        61.848040     64.316743     29.511783        9.121617   
std         19.002648     22.779095      1.021440        2.618279   
min        -

In [4]:
#featureselection
X = accidents[['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
               'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
               'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']]

y = accidents['Severity']

#Encoding text/categorical variables
le = LabelEncoder()
X.loc[:,'Weather_Condition']= le.fit_transform(X['Weather_Condition'])

In [5]:
# Feature-Selection und SVC-Classifier
dim_reduction = VarianceThreshold(0.1)
classifier = svm.SVC()

# Trainings- und Testdaten aufteilen
X_train, X_test1, y_train, y_test1 = train_test_split(X, y, test_size=0.5, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_test1, y_test1, test_size=0.5, random_state=42)

print("X_train Shape:", X_train.shape)
print("y_train Shape:", y_train.shape)

X_train Shape: (17500, 19)
y_train Shape: (17500,)


In [9]:
# Pipeline erstellen
pipeline = Pipeline([
    ('dim_reduction', dim_reduction),
    ('classifier', classifier)
])

# Trainieren der Pipeline
pipeline.fit(X_train, y_train)

# Vorhersagen machen
predicted_dev = pipeline.predict(X_dev)

# Varianz der einzelnen Features ausgeben
print("Varianzen der Features nach VarianceThreshold:")
print(dim_reduction.variances_)
print("-----------------------------------------------------------------------")
# Bericht für die Pipeline ausgeben
print("Classification Report für die Pipeline (VarianceThreshold + SVC):")
print(metrics.classification_report(y_dev, predicted_dev))


Varianzen der Features nach VarianceThreshold:
[3.61291495e+02 5.16489817e+02 1.04559647e+00 6.67043743e+00
 3.12820863e+01 2.88165346e+02 1.10186939e-02 6.28176327e-04
 1.02368405e-01 4.55053061e-03 6.82804898e-02 2.16671347e-03
 8.27325388e-03 0.00000000e+00 2.63520784e-02 2.72699396e-02
 1.02751347e-03 1.25854511e-01 0.00000000e+00]
-----------------------------------------------------------------------
Classification Report für die Pipeline (VarianceThreshold + SVC):
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        89
           2       0.80      1.00      0.89      6960
           3       0.00      0.00      0.00      1454
           4       0.00      0.00      0.00       247

    accuracy                           0.80      8750
   macro avg       0.20      0.25      0.22      8750
weighted avg       0.63      0.80      0.70      8750



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
