In [25]:
import pandas as pd
import numpy as np
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder

In [26]:
# Load the raw collision records from the CSV file.
csv_path = "../data/NCDB_1999_to_2014.csv"
df = pd.read_csv(csv_path)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [27]:
# Rename the columns. The list is positional: the k-th entry becomes the
# name of the k-th column of the loaded frame.
new_names = [
    # crash-level fields
    "year", "month", "weekday", "hour", "fatality", "vehicles_involved",
    "crash_type", "crash_place", "crash_weather", "surface_state",
    "road_slope", "traffic_state",
    # vehicle-level fields
    "vehicle_id", "vehicle_type", "vehicle_year",
    # person-level fields
    "passenger_id", "passenger_sex", "passenger_age", "passenger_role",
    "passenger_fatality", "passenger_safety", "passenger_type",
]

df.columns = new_names

In [28]:
# Some columns are not known at the moment of the collision, so they fall
# outside the scope of this analysis; others (IDs) carry no useful signal.
# Duplicated records must be removed *before* the ID columns are dropped,
# otherwise distinct rows could collapse into false duplicates.
df = df.drop_duplicates()
df = df.drop(columns=["passenger_id", "passenger_fatality"])

In [29]:
# Recode the target variable.
#   before: 1 = fatality, 2 = no fatality
#   after:  1 = fatality, 0 = no fatality
df['fatality'] = df['fatality'].replace(to_replace=2, value=0)

In [30]:
# Move the target variable to the right-most position of the dataframe.
# The column is selected by name rather than by position, so the result does
# not depend on where 'fatality' currently sits and re-running the cell is a
# no-op instead of moving a different column to the end.
new_columns = [col for col in df.columns if col != "fatality"] + ["fatality"]
df = df[new_columns]

In [31]:
# Drop the vehicle identifier column.
df = df.drop(columns=["vehicle_id"])

In [32]:
# Replace the codes U, UU and UUUU with NaN.
unknown_codes = ["U", "UU", "UUUU"]
df = df.replace(unknown_codes, np.nan)

In [33]:
# Remove the rows in which 40% or more of the columns are null.
# Per-row null count, most incomplete rows first.
null_rows = df.isnull().sum(axis=1).sort_values(ascending=False)
nulos_filas = null_rows.to_frame('nulos_filas')
# Target value of each row (aligned on the index), kept for inspection.
nulos_filas['target'] = df['fatality'].copy()
# Fraction of the columns that are null in each row.
nulos_filas['porcentaje_filas'] = nulos_filas['nulos_filas'] / df.shape[1]
print(nulos_filas)
nulos_40 = list(nulos_filas.index[nulos_filas.porcentaje_filas >= 0.40])
# Drop by label instead of indexing `.loc` with a set: set indexers are
# unordered, deprecated in pandas 1.5 and rejected by later versions.
# `reset_index()` keeps the old labels in an 'index' column on purpose; that
# column is removed by a later cell.
df = df.drop(index=nulos_40).reset_index()

         nulos_filas  target  porcentaje_filas
5245711           16       1          0.842105
5245712           16       1          0.842105
769190            13       0          0.684211
2884064           13       0          0.684211
2177439           12       0          0.631579
...              ...     ...               ...
2665912            0       0          0.000000
2665913            0       0          0.000000
2665914            0       0          0.000000
2665915            0       0          0.000000
2930693            0       0          0.000000

[5855336 rows x 3 columns]


In [34]:
# Turn vehicle_year into the vehicle's age in years (collision year minus
# model year) and rename the column accordingly.
# passenger_age and vehicles_involved are converted to numeric. Codes that
# are not numbers (e.g. 'NN' in passenger_age) become NaN via errors="coerce".
df['vehicle_year'] = (
    pd.to_numeric(df['year'], errors="coerce")
    - pd.to_numeric(df['vehicle_year'], errors="coerce")
)
df = df.rename(columns={"vehicle_year": 'vehicle_age'})
df['passenger_age'] = pd.to_numeric(df['passenger_age'], errors="coerce")
# errors="coerce" rather than "ignore": "ignore" returns the column unchanged
# (still non-numeric) as soon as one value fails to parse, which would make
# the later numeric comparison on this column fail; it is also deprecated.
df['vehicles_involved'] = pd.to_numeric(df['vehicles_involved'], errors="coerce")
# Remove the 'index' column left behind by an earlier reset_index();
# errors="ignore" keeps the cell re-runnable once the column is gone.
df = df.drop(columns='index', errors="ignore")

In [35]:
# Convert month to a small integer and discard rows without a valid month.
# pd.to_numeric parses every numeric string ("01" ... "12"), not only a
# hand-picked subset, and turns anything unparsable into NaN.
month_num = pd.to_numeric(df["month"], errors="coerce")
# Missing months are filtered explicitly. The previous np.int8(...) cast
# presumably relied on NaN becoming 0, which is an undefined float-to-int
# conversion. Literal 0 values are still discarded, as before.
valid_month = month_num.notna() & month_num.ne(0)
# assign() returns a new frame and keeps 'month' in its original position.
df = df.loc[valid_month].assign(month=month_num.loc[valid_month].astype("int8"))

In [36]:
# Convert weekday to a small integer and discard rows without a valid weekday.
# pd.to_numeric parses every numeric string ("1" ... "7") and turns anything
# unparsable into NaN.
weekday_num = pd.to_numeric(df["weekday"], errors="coerce")
# Missing weekdays are filtered explicitly. The previous np.int8(...) cast
# presumably relied on NaN becoming 0, which is an undefined float-to-int
# conversion. Literal 0 values are still discarded, as before.
valid_weekday = weekday_num.notna() & weekday_num.ne(0)
# assign() returns a new frame and keeps 'weekday' in its original position.
df = df.loc[valid_weekday].assign(weekday=weekday_num.loc[valid_weekday].astype("int8"))

In [37]:
# Convert hour to an 8-bit integer after discarding the rows where it is null.
df = df.dropna(subset=["hour"])
df["hour"] = df["hour"].astype(np.int8)

In [38]:
# Encode passenger sex as 1 (M) / 0 (F).
# The result is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on `df.passenger_sex`: the in-place form
# mutates an intermediate Series and is not guaranteed to update `df`
# (it does not under pandas Copy-on-Write).
sex_codes = df["passenger_sex"].replace({"M": 1, "F": 0})

# Any remaining non-numeric string code becomes NaN.
df["passenger_sex"] = sex_codes.replace('[^0-9]+', np.nan, regex=True)

# NOTE(review): the former `df.passenger_sex.dropna(inplace=True)` only acted
# on the Series and never removed rows from `df`, so rows with unknown sex
# (about 15k, 0.3% of the sample according to the original comment) are kept.
# The one-hot cell later drops the resulting "passenger_sex_nan" column. If
# those rows should really be removed, use
# `df = df.dropna(subset=["passenger_sex"])` and adjust that cell accordingly.

In [39]:
# Discard the rows whose passenger_safety code is "11" (very small sample).
not_safety_11 = df['passenger_safety'].ne("11")
df = df.loc[not_safety_11]

In [40]:
# TODO: create the `size` column (placeholder cell, no code yet)

In [41]:
### TODO: THIS NEEDS TO BE REVIEWED. DOING IT CHANGES THE OVERALL FATALITY
### RATE OF THE DATASET.

# Why keep only drivers?
# The number of people travelling in a car before an accident is not something
# that can be predicted, so only the driver is considered: in the hypothetical
# scenario of an insurance company, the driver is the individual we would
# actually have information about.
#
# It also avoids trouble when training the models: several records could
# otherwise belong to the same car, and those records are inevitably highly
# correlated with respect to the target variable, which would bias the models.

# Keep the rows whose role code is "11" (the driver), then drop the role
# column, which is constant after the filter.
is_driver = df['passenger_role'] == "11"
df = df.loc[is_driver].drop(columns=['passenger_role'])

In [42]:
# Keep only the rows without outliers: vehicle_age below 30 and
# vehicles_involved below 6. Rows where either value is NaN fail the
# comparison and are discarded as well.
reasonable_age = df['vehicle_age'].lt(30)
reasonable_vehicle_count = df['vehicles_involved'].lt(6)
df = df.loc[reasonable_age & reasonable_vehicle_count]

In [43]:
# Collapse the month (1-12) into its quarter (1-4) and rename the column.
df['month'] = df['month'].sub(1).floordiv(3).add(1)
df = df.rename(columns={'month': 'quarter'})

In [44]:
# Collapse the seven weekday codes into three groups.
# group 3 <- days 6, 7 | group 2 <- days 1, 4, 5 | group 1 <- days 2, 3
weekday_groups = {3: (6, 7), 2: (1, 4, 5), 1: (2, 3)}
weekday_map = {
    day: group
    for group, days in weekday_groups.items()
    for day in days
}
df['weekday'] = df['weekday'].replace(weekday_map)

In [45]:
# Collapse the hour of day (0-23) into six bands.
# The bands are spelled out as ranges so that every hour appears exactly once.
# The former dict literal listed key 11 twice (`11:3, 11:4`); Python keeps the
# last value, so hour 11 has always ended up in band 4. That effective
# behaviour is preserved here.
# NOTE(review): if hour 11 was meant to belong to band 3 (9-11), change the
# ranges below to range(9, 12) and range(12, 19).
hour_bands = {
    1: range(0, 6),    # 00-05
    2: range(6, 9),    # 06-08
    3: range(9, 11),   # 09-10
    4: range(11, 19),  # 11-18
    5: range(19, 22),  # 19-21
    6: range(22, 24),  # 22-23
}
hour_map = {hour: band for band, hours in hour_bands.items() for hour in hours}
df['hour'] = df['hour'].replace(hour_map)

In [46]:
# Mean (target) encoding of the categorical columns.
# Each column gets its own freshly fitted TargetEncoder, using 'fatality' as
# the target.
# NOTE(review): the encoders are fitted on the whole frame as it stands in
# this cell; if a train/test split happens later, the encoded values carry
# target information from the test rows - confirm this is acceptable.
encode_cols = [
    "crash_type",
    "crash_place",
    "crash_weather",
    "surface_state",
    "road_slope",
    "traffic_state",
    "vehicle_type",
    "passenger_safety",
    "passenger_type",
]

for col in encode_cols:
    encoder = TargetEncoder()
    df[col] = encoder.fit_transform(df[col], df['fatality'])

  elif pd.api.types.is_categorical(cols):


In [47]:
# One-hot encoding of the grouped date/time columns and of passenger_sex.
columnasfecha = ["hour", "weekday", "quarter", "passenger_sex"]

# Use a positional index once, up front, so that the dummy frames (which carry
# a fresh RangeIndex) line up row-for-row with `df` when concatenated.
df = df.reset_index(drop=True)

dummy_frames = []
for col in columnasfecha:
    onehotencoder = OneHotEncoder(handle_unknown="ignore")
    encoded = onehotencoder.fit_transform(df[col].values.reshape(-1, 1))
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
    # `get_feature_names_out` when the installed version provides it.
    feature_names = (
        getattr(onehotencoder, "get_feature_names_out", None)
        or onehotencoder.get_feature_names
    )
    colnames = feature_names([col])
    dummy_frames.append(pd.DataFrame(encoded.toarray(), columns=colnames))

# Single concat instead of one per iteration. The dummy blocks are placed in
# reverse order in front of the original columns, which reproduces the column
# layout obtained by repeatedly prepending inside the loop.
df = pd.concat(dummy_frames[::-1] + [df], axis=1)

# Get rid of the original, not encoded columns.
df = df.drop(columns=columnasfecha)

# Drop the indicator for missing passenger_sex; errors="ignore" keeps the cell
# working when the column contains no missing values (no such dummy is made).
df = df.drop(columns="passenger_sex_nan", errors="ignore")



In [51]:
# Show the final encoded frame (the bare expression renders a truncated preview).
df

Unnamed: 0,passenger_sex_0,passenger_sex_1,quarter_1,quarter_2,quarter_3,quarter_4,weekday_1,weekday_2,weekday_3,hour_1,...,crash_weather,surface_state,road_slope,traffic_state,vehicle_type,vehicle_age,passenger_age,passenger_safety,passenger_type,fatality
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.013810,0.015720,0.032669,0.012502,0.020390,9.0,41.0,0.016547,0.014018,0
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.013810,0.015720,0.032669,0.012502,0.012242,12.0,19.0,0.016547,0.014018,0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.021311,0.015555,0.028321,0.020684,0.012242,13.0,46.0,0.016547,0.014018,0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.013810,0.012377,0.012022,0.004459,0.012242,15.0,28.0,0.016547,0.014018,0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.013810,0.012377,0.012022,0.004459,0.012242,8.0,21.0,0.016547,0.014018,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3554563,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.017696,0.014695,0.018211,0.020684,0.012242,3.0,72.0,0.011504,0.014018,0
3554564,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.017696,0.012377,0.012022,0.020684,0.012242,20.0,49.0,0.011504,0.014018,0
3554565,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.017696,0.015720,0.012022,0.020684,0.012242,4.0,20.0,0.011504,0.014018,0
3554566,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.013810,0.015720,0.012022,0.020684,0.014555,6.0,44.0,0.011504,0.014018,0
