# Data preparation


- Join CSV files
- brief overview about the data so it can be cleaned and preprocessed
- Cleaning and preprocessing
    - rename columns 
    - convert datatypes
    - Duplicates
    - Data Aggreagation
    - impute missing values


In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
import pandas as pd
import os


## Join Data
In this chapter we jon the CSV files that have data about car accidents in germany in a time span from 2016 to 2022. The source of the data is https://unfallatlas.statistikportal.de/

In [2]:

#merge Traffic Accident Data from 2016 to 2022
folder_path = "C:\Projekte\TDS\TDS2324-TrafficAccidents\Data\TrafficAccidentData"
file_list = os.listdir(folder_path)
merged_data = pd.DataFrame()

# Iterate over each file in the folder and add to merged_data
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    data = pd.read_csv(file_path,sep=';')
    merged_data = pd.concat([merged_data, data], ignore_index=True)

print(merged_data.columns)

  data = pd.read_csv(file_path,sep=';')
  data = pd.read_csv(file_path,sep=';')


Index(['FID', 'OBJECTID', 'ULAND', 'UREGBEZ', 'UKREIS', 'UGEMEINDE', 'UJAHR',
       'UMONAT', 'USTUNDE', 'UWOCHENTAG', 'UKATEGORIE', 'UART', 'UTYP1',
       'ULICHTVERH', 'STRZUSTAND', 'IstRad', 'IstPKW', 'IstFuss', 'IstKrad',
       'IstGkfz', 'IstSonstige', 'LINREFX', 'LINREFY', 'XGCSWGS84',
       'YGCSWGS84', 'UIDENTSTLA', 'OBJECTID_1', 'UIDENTSTLAE'],
      dtype='object')


## Introducing the data set

Before we start cleaning and pre-processing the data, we create a brief overview 

In [3]:
merged_data.head(10)

Unnamed: 0,FID,OBJECTID,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,...,IstKrad,IstGkfz,IstSonstige,LINREFX,LINREFY,XGCSWGS84,YGCSWGS84,UIDENTSTLA,OBJECTID_1,UIDENTSTLAE
0,0.0,1.0,1,0,53,120,2016,1,9,5,...,0,0.0,0,606982393999999970000,5954659924999999800000,10621659329000000,53729614888000000,,,
1,1.0,2.0,1,0,57,10,2016,1,17,3,...,0,0.0,0,574882533000000050000,6011440850999999800000,10149175783000000,54245452583999999,,,
2,2.0,3.0,1,0,62,8,2016,1,0,5,...,0,0.0,0,599934687500000000000,5964608596199999600000,10518094344000000,53820402504999997,,,
3,3.0,4.0,1,0,3,0,2016,1,15,5,...,0,0.0,1,610709348699999970000,5968284242300000000000,10683020702000000,53851243101000001,,,
4,4.0,5.0,1,0,55,28,2016,1,14,1,...,0,0.0,0,605690790400000060000,6009152214700000400000,10620986018000000,54219458582000001,,,
5,5.0,6.0,1,0,61,44,2016,1,17,6,...,0,0.0,0,542547328499999950000,5963921747000000400000,9646338030000001,53822093844999998,,,
6,6.0,7.0,1,0,60,54,2016,1,17,3,...,0,0.0,0,558412407499999970000,5969889909599999900000,9888452573000000,53874189672000000,,,
7,7.0,8.0,1,0,55,29,2016,1,11,1,...,0,0.0,0,623911704400000050000,6008211261900000300000,10899895740000000,54206924608999998,,,
8,8.0,9.0,1,0,60,85,2016,1,16,2,...,0,0.0,0,579664308100000020000,5965471277499999900000,10210478533000000,53831655902000001,,,
9,9.0,10.0,1,0,54,126,2016,1,10,4,...,0,0.0,1,494320527400000020000,6066809662200000100000,8911765602999999,54748506792999997,,,


In [4]:
merged_data.shape

(1554834, 28)

In [5]:
merged_data.describe()

Unnamed: 0,FID,OBJECTID,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,...,UTYP1,ULICHTVERH,STRZUSTAND,IstRad,IstPKW,IstFuss,IstKrad,IstGkfz,IstSonstige,OBJECTID_1
count,151673.0,1342966.0,1554834.0,1554834.0,1554834.0,1554834.0,1554834.0,1554834.0,1554834.0,1554834.0,...,1554834.0,1554834.0,1554834.0,1554834.0,1554834.0,1554834.0,1554834.0,1359605.0,1554834.0,211868.0
mean,75836.0,115400.6,7.431094,2.788656,41.91415,69.52885,2019.268,6.766878,13.31008,4.102703,...,3.819214,0.4294851,0.2764469,0.287814,0.7866357,0.08647933,0.1412794,0.04938934,0.1111977,105934.5
std,43784.368027,69824.92,3.588564,2.349308,25.8804,111.6782,1.918956,3.214572,4.776594,1.868558,...,2.140342,0.7880731,0.4945796,0.4527441,0.4096829,0.2810706,0.3483096,0.2166797,0.3143769,61161.167754
min,0.0,1.0,1.0,0.0,1.0,0.0,2016.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,37918.0,55957.25,5.0,1.0,15.0,0.0,2018.0,4.0,10.0,3.0,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,52967.75
50%,75836.0,111914.0,8.0,3.0,39.0,20.0,2019.0,7.0,14.0,4.0,...,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,105934.5
75%,113754.0,171111.0,9.0,4.0,63.0,113.0,2021.0,9.0,17.0,6.0,...,6.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,158901.25
max,151672.0,268370.0,16.0,9.0,91.0,718.0,2022.0,12.0,23.0,7.0,...,7.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,211868.0


In [6]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554834 entries, 0 to 1554833
Data columns (total 28 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   FID          151673 non-null   float64
 1   OBJECTID     1342966 non-null  float64
 2   ULAND        1554834 non-null  int64  
 3   UREGBEZ      1554834 non-null  int64  
 4   UKREIS       1554834 non-null  int64  
 5   UGEMEINDE    1554834 non-null  int64  
 6   UJAHR        1554834 non-null  int64  
 7   UMONAT       1554834 non-null  int64  
 8   USTUNDE      1554834 non-null  int64  
 9   UWOCHENTAG   1554834 non-null  int64  
 10  UKATEGORIE   1554834 non-null  int64  
 11  UART         1554834 non-null  int64  
 12  UTYP1        1554834 non-null  int64  
 13  ULICHTVERH   1554834 non-null  int64  
 14  STRZUSTAND   1554834 non-null  int64  
 15  IstRad       1554834 non-null  int64  
 16  IstPKW       1554834 non-null  int64  
 17  IstFuss      1554834 non-null  int64  
 18  Is

# Cleaning and preprocessing

### Drop Data
features being droped that are unnecessay for this project since they are partly foreign keys or ids that are not needed or named in the different datasets thta where joined differently

In [7]:
merged_data.drop(['FID', 'OBJECTID', 'OBJECTID_1', 'UIDENTSTLA', 'UIDENTSTLAE'], axis=1, inplace=True)
print(merged_data.columns)
#save the merged data to a csv file in folder TrafficAccidentData



Index(['ULAND', 'UREGBEZ', 'UKREIS', 'UGEMEINDE', 'UJAHR', 'UMONAT', 'USTUNDE',
       'UWOCHENTAG', 'UKATEGORIE', 'UART', 'UTYP1', 'ULICHTVERH', 'STRZUSTAND',
       'IstRad', 'IstPKW', 'IstFuss', 'IstKrad', 'IstGkfz', 'IstSonstige',
       'LINREFX', 'LINREFY', 'XGCSWGS84', 'YGCSWGS84'],
      dtype='object')


### Convert datatypes
Since the coordinates are saved as Strings we  have to convert them in a datatype we can later work with. 

In [8]:
merged_data['XGCSWGS84'] = merged_data['XGCSWGS84'].astype(str).str.replace(',', '.').astype(float)
merged_data['YGCSWGS84'] = merged_data['YGCSWGS84'].astype(str).str.replace(',', '.').astype(float)
merged_data['LINREFX'] = merged_data['LINREFX'].astype(str).str.replace(',', '.').astype(float)
merged_data['LINREFY'] = merged_data['LINREFY'].astype(str).str.replace(',', '.').astype(float)

merged_data.head(10)


Unnamed: 0,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,UKATEGORIE,UART,...,IstRad,IstPKW,IstFuss,IstKrad,IstGkfz,IstSonstige,LINREFX,LINREFY,XGCSWGS84,YGCSWGS84
0,1,0,53,120,2016,1,9,5,2,8,...,0,1,0,0,0.0,0,606982.394,5954660.0,10.621659,53.729615
1,1,0,57,10,2016,1,17,3,3,1,...,0,1,0,0,0.0,0,574882.533,6011441.0,10.149176,54.245453
2,1,0,62,8,2016,1,0,5,3,9,...,0,1,0,0,0.0,0,599934.6875,5964609.0,10.518094,53.820403
3,1,0,3,0,2016,1,15,5,3,5,...,1,0,0,0,0.0,1,610709.3487,5968284.0,10.683021,53.851243
4,1,0,55,28,2016,1,14,1,3,8,...,0,1,0,0,0.0,0,605690.7904,6009152.0,10.620986,54.219459
5,1,0,61,44,2016,1,17,6,3,9,...,0,1,0,0,0.0,0,542547.3285,5963922.0,9.646338,53.822094
6,1,0,60,54,2016,1,17,3,3,6,...,1,0,1,0,0.0,0,558412.4075,5969890.0,9.888453,53.87419
7,1,0,55,29,2016,1,11,1,2,2,...,0,1,0,0,0.0,0,623911.7044,6008211.0,10.899896,54.206925
8,1,0,60,85,2016,1,16,2,2,4,...,0,1,0,0,0.0,0,579664.3081,5965471.0,10.210479,53.831656
9,1,0,54,126,2016,1,10,4,2,5,...,0,1,0,0,0.0,1,494320.5274,6066810.0,8.911766,54.748507


In [9]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554834 entries, 0 to 1554833
Data columns (total 23 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   ULAND        1554834 non-null  int64  
 1   UREGBEZ      1554834 non-null  int64  
 2   UKREIS       1554834 non-null  int64  
 3   UGEMEINDE    1554834 non-null  int64  
 4   UJAHR        1554834 non-null  int64  
 5   UMONAT       1554834 non-null  int64  
 6   USTUNDE      1554834 non-null  int64  
 7   UWOCHENTAG   1554834 non-null  int64  
 8   UKATEGORIE   1554834 non-null  int64  
 9   UART         1554834 non-null  int64  
 10  UTYP1        1554834 non-null  int64  
 11  ULICHTVERH   1554834 non-null  int64  
 12  STRZUSTAND   1554834 non-null  int64  
 13  IstRad       1554834 non-null  int64  
 14  IstPKW       1554834 non-null  int64  
 15  IstFuss      1554834 non-null  int64  
 16  IstKrad      1554834 non-null  int64  
 17  IstGkfz      1359605 non-null  float64
 18  Is

### Checking for Duplicates
Checking for duplicates by making a copy and doubling our dataset. Afterwards the doubles are being dropped. Are there no douplictates in the copied dataset, the original doesnt have duplicates either.

In [10]:
temp_df = merged_data.copy()
temp_df.drop_duplicates(inplace=True)
num_rows_merged_data = merged_data.shape[0]
num_rows_temp_df = temp_df.shape[0]

print("number of rows in merged_data: ", num_rows_merged_data)
print("number of rows in temp_df: ", num_rows_temp_df)    
print("difference in rows: ", num_rows_temp_df - num_rows_merged_data)

number of rows in merged_data:  1554834
number of rows in temp_df:  1554694
difference in rows:  -140


Since there is a difference of 140 we it is probible to have duplicates in our original dataset.

In [11]:
# show duplicates 
duplicates = merged_data[merged_data.duplicated(keep=False)]
duplicates.head(14)

Unnamed: 0,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,UKATEGORIE,UART,...,IstRad,IstPKW,IstFuss,IstKrad,IstGkfz,IstSonstige,LINREFX,LINREFY,XGCSWGS84,YGCSWGS84
52335,7,3,39,43,2016,3,13,4,3,1,...,0,1,0,0,0.0,0,453653.9008,5523542.0,8.355149,49.862495
52339,7,3,39,43,2016,3,13,4,3,1,...,0,1,0,0,0.0,0,453653.9008,5523542.0,8.355149,49.862495
54140,7,1,37,69,2016,6,5,5,3,2,...,0,1,0,0,0.0,0,376200.4225,5583253.0,7.258458,50.388285
54478,7,1,37,69,2016,6,5,5,3,2,...,0,1,0,0,0.0,0,376200.4225,5583253.0,7.258458,50.388285
58019,7,1,43,59,2016,9,14,5,3,2,...,0,1,0,0,0.0,0,409948.5422,5595250.0,7.730177,50.502286
58247,7,1,43,59,2016,9,14,5,3,2,...,0,1,0,0,0.0,0,409948.5422,5595250.0,7.730177,50.502286
58314,7,3,12,0,2016,10,21,7,3,9,...,0,1,0,1,0.0,0,406861.151,5478158.0,7.715006,49.448939
58484,7,3,12,0,2016,10,21,7,3,9,...,0,1,0,1,0.0,0,406861.151,5478158.0,7.715006,49.448939
60015,7,3,38,6,2016,12,19,1,3,2,...,0,1,0,0,0.0,0,452163.5267,5473644.0,8.340503,49.413595
60256,7,3,38,6,2016,12,19,1,3,2,...,0,1,0,0,0.0,0,452163.5267,5473644.0,8.340503,49.413595


The table shows that there are indeed duplicates in the dataset. Since the coordinates are exremly detailed, it is very unlilkely that the accdients happend the same time with the same particitpantson the exact same spot. After doublechecking the accidents on the map we decided to drop them.

In [12]:
#drop duplicates
merged_data.drop_duplicates(inplace=True)
num_rows_merged_data = merged_data.shape[0]

print("number of rows in merged_data: ", num_rows_merged_data)
print("number of rows in temp_df: ", num_rows_temp_df)
print("difference in rows: ", num_rows_temp_df - num_rows_merged_data)


number of rows in merged_data:  1554694
number of rows in temp_df:  1554694
difference in rows:  0


Since they have now the same amount of row, there are no further duplicates in the dataset.


### Renaming columns

In [13]:
merged_data.columns

Index(['ULAND', 'UREGBEZ', 'UKREIS', 'UGEMEINDE', 'UJAHR', 'UMONAT', 'USTUNDE',
       'UWOCHENTAG', 'UKATEGORIE', 'UART', 'UTYP1', 'ULICHTVERH', 'STRZUSTAND',
       'IstRad', 'IstPKW', 'IstFuss', 'IstKrad', 'IstGkfz', 'IstSonstige',
       'LINREFX', 'LINREFY', 'XGCSWGS84', 'YGCSWGS84'],
      dtype='object')

In [14]:
# Cols in lower case   
merged_data.columns =  [col.lower() for col in merged_data.columns]
merged_data.columns



Index(['uland', 'uregbez', 'ukreis', 'ugemeinde', 'ujahr', 'umonat', 'ustunde',
       'uwochentag', 'ukategorie', 'uart', 'utyp1', 'ulichtverh', 'strzustand',
       'istrad', 'istpkw', 'istfuss', 'istkrad', 'istgkfz', 'istsonstige',
       'linrefx', 'linrefy', 'xgcswgs84', 'ygcswgs84'],
      dtype='object')

Since the U in the colnames stands for 'Unfall' and its obvious that we are having a dataset with data of accidents the U will be deleted to make things clearer. Furthermore the 1 in typ will be deleted and a _ will be added for the boolean categories.

In [15]:
merged_data.columns = merged_data.columns.str.lstrip('u')
merged_data.rename(columns=lambda x: x.replace('ist', 'ist_') if x.startswith('ist') else x, inplace=True)
merged_data.rename(columns={
    'typ1': 'typ',
    'lichtverh': 'licht' 
    }, inplace=True)

merged_data.columns


Index(['land', 'regbez', 'kreis', 'gemeinde', 'jahr', 'monat', 'stunde',
       'wochentag', 'kategorie', 'art', 'typ', 'licht', 'strzustand',
       'ist_rad', 'ist_pkw', 'ist_fuss', 'ist_krad', 'ist_gkfz',
       'ist_sonstige', 'linrefx', 'linrefy', 'xgcswgs84', 'ygcswgs84'],
      dtype='object')

### Data Aggregation
Since it is easier for the analysis later to search for cities with the Amtliche Gemeinde Schlüssel (AGS) we are aggregating this feature from the features land, regbez, kreis, gemeinde.

In [16]:
# Funktion zur Erstellung des AGS
def create_ags(row):
    land = f"{int(row['land']):02d}"  
    regbez = f"{int(row['regbez']):01d}"
    kreis = f"{int(row['kreis']):02d}"  
    gemeinde = f"{int(row['gemeinde']):03d}"  
    return land + regbez + kreis + gemeinde

# Anwenden der Funktion auf jede Zeile und Hinzufügen einer neuen Spalte "AGS"
merged_data['ags'] = merged_data.apply(create_ags, axis=1).astype(str)

# Ausgabe der ersten 5 Zeilen mit dem neuen AGS zur Überprüfung
print(merged_data[['land', 'regbez', 'kreis', 'gemeinde', 'ags']].sample(20))


         land  regbez  kreis  gemeinde       ags
2850        1       0     56        41  01056041
230364      9       2     78       134  09278134
1311856     3       3     60         1  03360001
1175278     9       6     76       141  09676141
442776      9       5     64         0  09564000
805742      5       3     82        28  05382028
866862      9       7     61         0  09761000
867921      9       1     87       168  09187168
1411277     9       5     64         0  09564000
223687      9       5     64         0  09564000
1000983     9       2     61         0  09261000
262644     12       0     73        97  12073097
434160      9       3     72       112  09372112
1187266    14       5     22       460  14522460
514444     14       6     12         0  14612000
110536      8       2     15       111  08215111
138276      9       5     62         0  09562000
961017     12       0     71       294  12071294
481102      3       2     56        22  03256022
1258959     9       

### Missing Values

In [17]:
# check for missing values
merged_data.isnull().sum()



land                 0
regbez               0
kreis                0
gemeinde             0
jahr                 0
monat                0
stunde               0
wochentag            0
kategorie            0
art                  0
typ                  0
licht                0
strzustand           0
ist_rad              0
ist_pkw              0
ist_fuss             0
ist_krad             0
ist_gkfz        195214
ist_sonstige         0
linrefx              0
linrefy              0
xgcswgs84            0
ygcswgs84            0
ags                  0
dtype: int64

As we can see the data set has almost no null values except for the binary variable 'ist_gkfz'. This will be investigated further and the data imputed where appropriate.

In [18]:
missing_data = merged_data[merged_data['ist_gkfz'].isnull()]
missing_data.sample(10)

Unnamed: 0,land,regbez,kreis,gemeinde,jahr,monat,stunde,wochentag,kategorie,art,...,ist_pkw,ist_fuss,ist_krad,ist_gkfz,ist_sonstige,linrefx,linrefy,xgcswgs84,ygcswgs84,ags
258162,7,1,37,226,2017,8,19,5,3,5,...,1,0,0,,0,401315.4359,5583928.0,7.61146,50.399107,7137226
279493,3,3,56,7,2017,2,13,7,3,0,...,1,0,0,,0,481078.36,5900033.0,8.716416,53.249225,3356007
210943,8,1,21,0,2017,12,7,2,3,2,...,1,0,0,,0,517785.5115,5444026.0,9.24389,49.148799,8121000
298446,8,2,11,0,2017,2,7,3,3,5,...,1,0,1,,0,441195.003,5405034.0,8.199291,48.795521,8211000
152033,1,0,3,0,2017,2,16,2,3,5,...,1,0,0,,0,609967.5148,5971478.0,10.672893,53.880094,1003000
218477,15,0,87,130,2017,1,14,3,3,2,...,1,0,0,,1,674138.0895,5708537.0,11.508975,51.501109,15087130
293283,6,4,12,0,2017,5,23,3,3,5,...,1,0,0,,0,477163.5219,5549988.0,8.680679,50.101707,6412000
238697,9,2,76,115,2017,7,16,6,2,9,...,0,0,1,,0,804066.709,5446333.0,13.165596,49.0946,9276115
307648,8,2,12,0,2017,9,17,7,3,2,...,1,0,0,,0,460827.8846,5427608.0,8.464439,49.000126,8212000
259535,7,1,31,25,2017,10,8,5,3,8,...,1,0,0,,1,378692.8345,5593526.0,7.290179,50.48115,7131025


In [19]:
missing_data_2017 = merged_data[merged_data['ist_gkfz'].isnull() & (merged_data['jahr'] == 2017)]
all_null_from_2017 = len(missing_data_2017) == len(missing_data)
not_null_from_2017 = merged_data[merged_data['ist_gkfz'].notnull() & merged_data['jahr'] == 2017].any().any()
print("All the null values for feature is_gkfz are from year 2017: ", all_null_from_2017)
print("There are notnull values for ist_gkfz in year 2017: ", not_null_from_2017)



All the null values for feature is_gkfz are from year 2017:  True
There are notnull values for ist_gkfz in year 2017:  False


Since all null values are only for the feature is_gkfz from 2017 and there are no notnull values for this feature in this year, it can be concluded that no data was collected for is_gkfz in this year.

In the following we will impute the missing values. We will check for patterns and correlation with other features. So we can train a logistic regression model, that imputes our missing values for the year 2017 

In [20]:
# Calculate the correlation matrix
correlation_matrix = merged_data.corr()

# Select the features with high correlation with is_gkfz
relevant_features = correlation_matrix['ist_gkfz'].abs().sort_values(ascending=False).index[1:]

# map correlation to each feature
correlation_to_ist_gkfz = {feature: correlation_matrix.loc['ist_gkfz', feature] for feature in relevant_features}


# Print the relevant features
print(correlation_to_ist_gkfz)


{'ist_rad': -0.10733862747940964, 'typ': 0.09456375198947359, 'stunde': -0.07848527764753568, 'ist_krad': -0.057456393171918876, 'ist_pkw': -0.057187447091856254, 'kategorie': -0.0519069042703419, 'ist_fuss': -0.046028361121884936, 'art': -0.0393140192823976, 'kreis': 0.02431316433109911, 'licht': -0.019836170951205146, 'gemeinde': 0.018754307961221503, 'wochentag': -0.013051362686227398, 'monat': -0.00970261731071049, 'jahr': -0.00842786096139455, 'linrefy': -0.0062591436233858395, 'ags': 0.0062353802525829845, 'ygcswgs84': -0.006115356146857279, 'land': 0.005696697403717505, 'regbez': 0.005381606881152162, 'ist_sonstige': -0.00532538672720455, 'linrefx': 0.003561234236690129, 'strzustand': 0.00337820718997781, 'xgcswgs84': 0.0032319654037303116}


In [21]:
imputation_data = merged_data.copy()

# delete data with missing values
imputation_data.dropna(inplace=True)

#defining the features and target
features = imputation_data.drop(['ist_gkfz', 'jahr', 'monat', 'land', 'regbez','xgcswgs84', 'ygcswgs84', 'linrefx', 'linrefy'],  axis=1)
target = imputation_data['ist_gkfz']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a scaler object
scaler = StandardScaler()

# Fit the scaler to the training data and transform
X_train = scaler.fit_transform(X_train)

# Transform the test data
X_test = scaler.transform(X_test)

# Create a logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Fit the model
log_reg.fit(X_train, y_train)

# Calculate accuracy score on test data
accuracy_score = accuracy_score(log_reg.predict(X_test), y_test)
print(accuracy_score)
precision_score = precision_score(log_reg.predict(X_test), y_test)
print(precision_score)




0.9556264159825816
0.11289368623391986


In [22]:
# Select the rows with missing values for ist_gkfz
missing_data = merged_data[merged_data['ist_gkfz'].isnull()]

# Extract the features for imputation
imputation_features = missing_data.drop(['ist_gkfz', 'jahr', 'monat', 'land', 'regbez', 'xgcswgs84', 'ygcswgs84', 'linrefx', 'linrefy'], axis=1)

# Scale the features using the previously created scaler
imputation_features_scaled = scaler.transform(imputation_features)

# Impute the missing values using the logistic regression model
imputed_values = log_reg.predict(imputation_features_scaled)

# Assign the imputed values to the missing rows in the merged_data dataframe
merged_data.loc[merged_data['ist_gkfz'].isnull(), 'ist_gkfz'] = imputed_values

# Check if there are any remaining missing values
merged_data.isnull().sum()


land            0
regbez          0
kreis           0
gemeinde        0
jahr            0
monat           0
stunde          0
wochentag       0
kategorie       0
art             0
typ             0
licht           0
strzustand      0
ist_rad         0
ist_pkw         0
ist_fuss        0
ist_krad        0
ist_gkfz        0
ist_sonstige    0
linrefx         0
linrefy         0
xgcswgs84       0
ygcswgs84       0
ags             0
dtype: int64

saving the data as csv  

In [23]:
merged_data.to_csv("C:\\Projekte\\TDS\\TDS2324-TrafficAccidents\\Data\\all_16_22.csv")

In [24]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1554694 entries, 0 to 1554833
Data columns (total 24 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   land          1554694 non-null  int64  
 1   regbez        1554694 non-null  int64  
 2   kreis         1554694 non-null  int64  
 3   gemeinde      1554694 non-null  int64  
 4   jahr          1554694 non-null  int64  
 5   monat         1554694 non-null  int64  
 6   stunde        1554694 non-null  int64  
 7   wochentag     1554694 non-null  int64  
 8   kategorie     1554694 non-null  int64  
 9   art           1554694 non-null  int64  
 10  typ           1554694 non-null  int64  
 11  licht         1554694 non-null  int64  
 12  strzustand    1554694 non-null  int64  
 13  ist_rad       1554694 non-null  int64  
 14  ist_pkw       1554694 non-null  int64  
 15  ist_fuss      1554694 non-null  int64  
 16  ist_krad      1554694 non-null  int64  
 17  ist_gkfz      1554694 non-null  