In [1]:
import pandas as pd
import numpy as np
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
#Importamos los datos
df = pd.read_csv("../data/NCDB_1999_to_2014.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Cambio de nombre de las variables
new_names = ["year", "month", "weekday", "hour", "fatality", "vehicles_involved", "crash_type", "crash_place", "crash_weather",
"surface_state", "road_slope", "traffic_state", "vehicle_id", "vehicle_type", "vehicle_year", "passenger_id", "passenger_sex",
"passenger_age", "passenger_role", "passenger_fatality", "passenger_safety", "passenger_type"]

df.columns = new_names

In [4]:
# El dataset contiene variables que no están disponible en el momento de la colisión y por tanto escapan al objetivo de nuestro ánalisis y
# otras que directamente no son de útilidad como IDs. También debemos prescindir de registros duplicados antes de eliminar los IDs
df = df.drop_duplicates().drop(["passenger_id","passenger_fatality"], axis=1)

In [5]:
# Recodeado de la variable objetivo
# Antes: 1-fatality 2-no fatality
# Después: 1-fatality 0-no fatality
df['fatality'] = df['fatality'].replace({2:0})

In [6]:
#Ponemos la variable objetivo a la derecha del dataframe
new_columns = list(df.columns[0:4]) + list(df.columns[5:]) + list(df.columns[4:5])
df = df[new_columns]

In [7]:
df = df.drop("vehicle_id", axis=1)

In [8]:
# Remplazamos U, UU y UUUU por NAs
df = df.replace({"U": np.nan, "UU": np.nan, "UUUU": np.nan})

In [9]:
#Eliminando los registros con >40% nulos
null_rows = df.isnull().sum(axis=1).sort_values(ascending=False)
nulos_filas = pd.DataFrame(null_rows, columns=['nulos_filas'])  
nulos_filas['target'] = df['fatality'].copy()
nulos_filas['porcentaje_filas']= nulos_filas['nulos_filas']/df.shape[1]
print(nulos_filas)
nulos_40 = list(nulos_filas.index[nulos_filas.porcentaje_filas>=0.40])
df = df.loc[set(df.index)-set(nulos_40)].reset_index()

         nulos_filas  target  porcentaje_filas
5245711           16       1          0.842105
5245712           16       1          0.842105
769190            13       0          0.684211
2884064           13       0          0.684211
2177439           12       0          0.631579
...              ...     ...               ...
2665912            0       0          0.000000
2665913            0       0          0.000000
2665914            0       0          0.000000
2665915            0       0          0.000000
2930693            0       0          0.000000

[5855336 rows x 3 columns]


In [10]:
# Cambio de la variable vehicle_year a años de antigüedad y eliminamos vehicle_year
# Cambiamos passenger_age y vehicles_involved a númericas ninguna de las dos tiene NAs. 
# Sin embargo, passenger_age tiene valores NN para transformar la variable los haremos NA con 'coerce'
df['vehicle_year'] = pd.to_numeric(df.year - pd.to_numeric(df.vehicle_year, errors= "coerce"))
df = df.rename(columns={"vehicle_year": 'vehicle_age'})
df['passenger_age'] = pd.to_numeric(df['passenger_age'],"coerce")
df['vehicles_involved'] = pd.to_numeric(df['vehicles_involved'],"ignore")
df.drop(inplace=True, axis=1, columns='index')

In [11]:
# Month a numerico. Limpieza de valores 0
df["month"] = np.int8(df["month"].replace({"01": 1, "02": 2, "11": 11, "12" : 12}))
df = df[df["month"] != 0]

In [12]:
# Weekday a numerico. Limpieza de valores 0
df["weekday"] = np.int8(df["weekday"].replace({"7": 7, "1": 1, "2": 2, "3" : 3, "4":4, "5":5, "6":6}))
df = df[df["weekday"] != 0]

In [13]:
# Hour a numerico. Limpieza de valores nulos
df = df[df["hour"].notnull()]
df["hour"] = df["hour"].astype("int8")

In [17]:
# Replace M/F to 1/0
df.passenger_sex.replace({"M":1, "F":0}, inplace=True)

# Remove NA values (15k rows, 0.3% of total sample)
df.passenger_sex.replace('[^0-9]+',np.nan,regex=True,inplace=True)
df.passenger_sex.dropna(inplace=True)

In [18]:
# Drop passenger_safety==11, very low sample
df = df.loc[df['passenger_safety'] != "11"]

In [15]:
df

Unnamed: 0,year,month,weekday,hour,vehicles_involved,crash_type,crash_place,crash_weather,surface_state,road_slope,traffic_state,vehicle_type,vehicle_age,passenger_sex,passenger_age,passenger_role,passenger_safety,passenger_type,fatality
0,1999,1,1,20,2.0,34,,1,5,3,03,06,9.0,M,41.0,11,,1,0
1,1999,1,1,20,2.0,34,,1,5,3,03,01,12.0,M,19.0,11,,1,0
2,1999,1,1,20,2.0,34,,1,5,3,03,01,12.0,F,20.0,13,02,2,0
3,1999,1,1,8,1.0,01,,5,3,6,18,01,13.0,M,46.0,11,,1,0
4,1999,1,1,8,1.0,01,,5,3,6,18,NN,,M,5.0,99,,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5850191,2014,12,7,14,2.0,41,01,1,5,1,18,05,6.0,M,44.0,11,02,1,0
5850192,2014,12,7,14,2.0,41,01,1,5,1,18,05,6.0,M,34.0,13,02,2,0
5850193,2014,12,7,19,1.0,03,01,1,3,4,18,01,13.0,F,35.0,11,02,1,0
5850194,2014,12,7,19,1.0,03,01,1,3,4,18,01,13.0,M,26.0,13,02,2,0


In [16]:
#FIXME: Pickle file size bigger than csv, pruebo con parquet
# Yo cargaría el dataframe, para no repetir pasos
#La unica diferencia es que al quitarte la columna vehicle_id antes de eliminar las filas con NAs se te eliminan mas, esto tambien se puede conseguir
#modificando el treshold de %NAs en el descriptivo (x ej a 35%)
df = pd.read_parquet('../data/full_data_initial.parquet')
df.drop('vehicle_id', axis=1, inplace=True)
df

Unnamed: 0,year,month,weekday,hour,vehicles_involved,crash_type,crash_place,crash_weather,surface_state,road_slope,traffic_state,vehicle_type,vehicle_age,passenger_sex,passenger_age,passenger_role,passenger_safety,passenger_type,fatality
0,1999,1,1,20,2.0,34,,1,5,3,03,06,9.0,1.0,41.0,11,,1,0
1,1999,1,1,20,2.0,34,,1,5,3,03,01,12.0,1.0,19.0,11,,1,0
2,1999,1,1,20,2.0,34,,1,5,3,03,01,12.0,0.0,20.0,13,02,2,0
3,1999,1,1,8,1.0,01,,5,3,6,18,01,13.0,1.0,46.0,11,,1,0
4,1999,1,1,8,1.0,01,,5,3,6,18,NN,,1.0,5.0,99,,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5850156,2014,12,7,14,2.0,41,01,1,5,1,18,05,6.0,1.0,44.0,11,02,1,0
5850157,2014,12,7,14,2.0,41,01,1,5,1,18,05,6.0,1.0,34.0,13,02,2,0
5850158,2014,12,7,19,1.0,03,01,1,3,4,18,01,13.0,0.0,35.0,11,02,1,0
5850159,2014,12,7,19,1.0,03,01,1,3,4,18,01,13.0,1.0,26.0,13,02,2,0


In [40]:
# crear columna size

In [22]:
### HAY QUE MIRAR ESTO. SI LO HACEMOS, CAMBIA LA MORTALIDAD GENERAL DEL DATASET

#Yo me quedaría también con los peatones (df['passenger_role'] == "99")

# Remove non-driver rows

df = df.loc[(df['passenger_role'] == "11") | (df['passenger_role'] == "99")]

# Drop the role column

df = df.drop('passenger_role', axis = 1)

# por qué hacemos esto? por que el número de personas que van en un coche antes de un accidente no es una variable predecible
# y por lo tanto hemos de tener en cuenta únicamente al conductor, que en un caso hipotético de ponernos en el lugar de una
# empresa aseguradora, será el individuo sobre el cual tendremos información

# Además, nos evita problemas a la hora de entrenar los modelos ya que sería posible que varios registros del mismo coche,
# que inevitablemente tienen una correlación muy alta respecto de la variable objetivo, sesgando nuestros modelos.

In [23]:
# Remove rows with an outlier vehicle_age
df = df.loc[df['vehicle_age'] < 30]

# Removes rows with outlier vehicles_involved
df = df.loc[df['vehicles_involved'] < 6]

In [24]:
#Transform month column to quarters
df['month'] = (df['month']-1)//3 + 1
df = df.rename({'month': 'quarter'}, axis= 1)

In [25]:
# Transform weekday column
df['weekday'] = df['weekday'].replace({6:3, 7:3, 1:2, 4:2, 5:2, 2:1, 3:1})

In [26]:
# Transform hour column
df['hour'] = df['hour'].replace({0:1, 1:1, 2:1, 3:1, 4:1, 5:1, 6:2, 7:2, 8:2, 9:3, 10:3, 11:3, 11:4, 12:4, 13:4, 14:4, 15:4, 16:4, 17:4, 18:4, 19:5, 20:5, 21:5, 22:6, 23:6})

In [27]:
# Mean encoding
encode_cols = ["crash_type", "crash_place", "crash_weather", "surface_state", "road_slope", "traffic_state", "vehicle_type", "passenger_safety", "passenger_type"]

for i in encode_cols:
    encoder = TargetEncoder()
    df[i] = encoder.fit_transform(df[i], df['fatality']) 

In [28]:
# One-hot-encoding
columnasfecha = ["hour", "weekday", "quarter", "passenger_sex"]

for i in columnasfecha:
    onehotencoder = OneHotEncoder(handle_unknown="ignore")
    array = df[i].values.reshape(-1,1)
    df_temp = onehotencoder.fit_transform(array)
    colnames = onehotencoder.get_feature_names([i])
    df_temp = pd.DataFrame(df_temp.toarray(), columns = colnames)
    df.index = df_temp.index
    df = pd.concat([df_temp, df], axis=1)

# We get rid of the not encoded columns
df = df.drop(columns = columnasfecha)

df = df.drop(columns = "passenger_sex_nan")

In [36]:
new_cols = ['year','quarter_1','quarter_2','quarter_3','quarter_4','weekday_1','weekday_2','weekday_3','hour_1','hour_2','hour_3','hour_4','hour_5','hour_6','vehicles_involved','crash_type','crash_place',
            'crash_weather','surface_state','road_slope','traffic_state','vehicle_type','vehicle_age','passenger_age','passenger_sex_0.F','passenger_sex_1.M','passenger_safety','passenger_type','fatality']
df = df.reindex(columns=new_cols)

In [38]:
df

Unnamed: 0,year,quarter_1,quarter_2,quarter_3,quarter_4,weekday_1,weekday_2,weekday_3,hour_1,hour_2,...,road_slope,traffic_state,vehicle_type,vehicle_age,passenger_age,passenger_sex_0.F,passenger_sex_1.M,passenger_safety,passenger_type,fatality
0,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.032669,0.012502,0.020390,9.0,41.0,,,0.016547,0.014018,0
1,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.032669,0.012502,0.012242,12.0,19.0,,,0.016547,0.014018,0
2,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.028321,0.020684,0.012242,13.0,46.0,,,0.016547,0.014018,0
3,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.012022,0.004459,0.012242,15.0,28.0,,,0.016547,0.014018,0
4,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.012022,0.004459,0.012242,8.0,21.0,,,0.016547,0.014018,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3554563,2014,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.018211,0.020684,0.012242,3.0,72.0,,,0.011504,0.014018,0
3554564,2014,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.012022,0.020684,0.012242,20.0,49.0,,,0.011504,0.014018,0
3554565,2014,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.012022,0.020684,0.012242,4.0,20.0,,,0.011504,0.014018,0
3554566,2014,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.012022,0.020684,0.014555,6.0,44.0,,,0.011504,0.014018,0


In [40]:
#Guardamos el df
df.to_parquet("../data/full_data_encoded.parquet")