In [62]:
import pandas as pd
import numpy as np
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, \
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve

from sklearn.metrics import accuracy_score, roc_auc_score, \
                            classification_report, confusion_matrix
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from boruta import BorutaPy

In [63]:
#Importamos los datos
df = pd.read_csv("../data/NCDB_1999_to_2014.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [64]:
# Cambio de nombre de las variables
new_names = ["year", "month", "weekday", "hour", "fatality", "vehicles_involved", "crash_type", "crash_place", "crash_weather",
"surface_state", "road_slope", "traffic_state", "vehicle_id", "vehicle_type", "vehicle_year", "passenger_id", "passenger_sex",
"passenger_age", "passenger_role", "passenger_fatality", "passenger_safety", "passenger_type"]

df.columns = new_names

In [65]:
# El dataset contiene variables que no están disponible en el momento de la colisión y por tanto escapan al objetivo de nuestro ánalisis y
# otras que directamente no son de útilidad como IDs. También debemos prescindir de registros duplicados antes de eliminar los IDs
df = df.drop_duplicates().drop(["passenger_id","passenger_fatality"], axis=1)

In [66]:
# Recodeado de la variable objetivo
# Antes: 1-fatality 2-no fatality
# Después: 1-fatality 0-no fatality
df['fatality'] = df['fatality'].replace({2:0})

In [67]:
#Ponemos la variable objetivo a la derecha del dataframe
new_columns = list(df.columns[0:4]) + list(df.columns[5:]) + list(df.columns[4:5])
df = df[new_columns]

In [68]:
df = df.drop("vehicle_id", axis=1)

In [69]:
# Remplazamos U, UU y UUUU por NAs
df = df.replace({"U": np.nan, "UU": np.nan, "UUUU": np.nan})

In [70]:
#Eliminando los registros con >40% nulos
null_rows = df.isnull().sum(axis=1).sort_values(ascending=False)
nulos_filas = pd.DataFrame(null_rows, columns=['nulos_filas'])  
nulos_filas['target'] = df['fatality'].copy()
nulos_filas['porcentaje_filas']= nulos_filas['nulos_filas']/df.shape[1]
print(nulos_filas)
nulos_40 = list(nulos_filas.index[nulos_filas.porcentaje_filas>=0.40])
df = df.loc[set(df.index)-set(nulos_40)].reset_index()

         nulos_filas  target  porcentaje_filas
5245711           16       1          0.842105
5245712           16       1          0.842105
769190            13       0          0.684211
2884064           13       0          0.684211
2177439           12       0          0.631579
...              ...     ...               ...
2665912            0       0          0.000000
2665913            0       0          0.000000
2665914            0       0          0.000000
2665915            0       0          0.000000
2930693            0       0          0.000000

[5855336 rows x 3 columns]


In [71]:
# Cambio de la variable vehicle_year a años de antigüedad y eliminamos vehicle_year
# Cambiamos passenger_age y vehicles_involved a númericas ninguna de las dos tiene NAs. 
# Sin embargo, passenger_age tiene valores NN para transformar la variable los haremos NA con 'coerce'
df['vehicle_year'] = pd.to_numeric(df.year - pd.to_numeric(df.vehicle_year, errors= "coerce"))
df = df.rename(columns={"vehicle_year": 'vehicle_age'})
df['passenger_age'] = pd.to_numeric(df['passenger_age'],"coerce")
df['vehicles_involved'] = pd.to_numeric(df['vehicles_involved'],"ignore")
df.drop(inplace=True, axis=1, columns='index')

In [72]:
# Month a numerico. Limpieza de valores 0
df["month"] = np.int8(df["month"].replace({"01": 1, "02": 2, "11": 11, "12" : 12}))
df = df[df["month"] != 0]

In [73]:
# Weekday a numerico. Limpieza de valores 0
df["weekday"] = np.int8(df["weekday"].replace({"7": 7, "1": 1, "2": 2, "3" : 3, "4":4, "5":5, "6":6}))
df = df[df["weekday"] != 0]

In [74]:
# Hour a numerico. Limpieza de valores nulos
df = df[df["hour"].notnull()]
df["hour"] = df["hour"].astype("int8")

In [75]:
# Replace M/F to 1/0
df.passenger_sex.replace({"M":1, "F":0}, inplace=True)

# Remove NA values (15k rows, 0.3% of total sample)
df.passenger_sex.replace('[^0-9]+',np.nan,regex=True,inplace=True)
df.passenger_sex.dropna(inplace=True)

In [76]:
# Drop passenger_safety==11, very low sample
df = df.loc[df['passenger_safety'] != "11"]

In [77]:
df

Unnamed: 0,year,month,weekday,hour,vehicles_involved,crash_type,crash_place,crash_weather,surface_state,road_slope,traffic_state,vehicle_type,vehicle_age,passenger_sex,passenger_age,passenger_role,passenger_safety,passenger_type,fatality
0,1999,1,1,20,2.0,34,,1,5,3,03,06,9.0,1.0,41.0,11,,1,0
1,1999,1,1,20,2.0,34,,1,5,3,03,01,12.0,1.0,19.0,11,,1,0
2,1999,1,1,20,2.0,34,,1,5,3,03,01,12.0,0.0,20.0,13,02,2,0
3,1999,1,1,8,1.0,01,,5,3,6,18,01,13.0,1.0,46.0,11,,1,0
4,1999,1,1,8,1.0,01,,5,3,6,18,NN,,1.0,5.0,99,,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5850191,2014,12,7,14,2.0,41,01,1,5,1,18,05,6.0,1.0,44.0,11,02,1,0
5850192,2014,12,7,14,2.0,41,01,1,5,1,18,05,6.0,1.0,34.0,13,02,2,0
5850193,2014,12,7,19,1.0,03,01,1,3,4,18,01,13.0,0.0,35.0,11,02,1,0
5850194,2014,12,7,19,1.0,03,01,1,3,4,18,01,13.0,1.0,26.0,13,02,2,0


In [102]:
#FIXME: Pickle file size bigger than csv, pruebo con parquet
# Yo cargaría el dataframe, para no repetir pasos
#La unica diferencia es que al quitarte la columna vehicle_id antes de eliminar las filas con NAs se te eliminan mas, esto tambien se puede conseguir
#modificando el treshold de %NAs en el descriptivo (x ej a 35%)
df = pd.read_parquet('../data/full_data_initial.parquet')
df

MemoryError: Unable to allocate 44.2 MiB for an array with shape (5790467,) and data type int64

In [92]:
### HAY QUE MIRAR ESTO. SI LO HACEMOS, CAMBIA LA MORTALIDAD GENERAL DEL DATASET

#Yo me quedaría también con los peatones (df['passenger_role'] == "99")

# Remove non-driver rows

df = df.loc[(df['passenger_role'] == "11") | (df['passenger_role'] == "99")]

# Drop the role column

df = df.drop('passenger_role', axis = 1)
# por qué hacemos esto? por que el número de personas que van en un coche antes de un accidente no es una variable predecible
# y por lo tanto hemos de tener en cuenta únicamente al conductor, que en un caso hipotético de ponernos en el lugar de una
# empresa aseguradora, será el individuo sobre el cual tendremos información

# Además, nos evita problemas a la hora de entrenar los modelos ya que sería posible que varios registros del mismo coche,
# que inevitablemente tienen una correlación muy alta respecto de la variable objetivo, sesgando nuestros modelos.

In [93]:
df

Unnamed: 0_level_0,year,month,weekday,hour,vehicles_involved,crash_type,crash_place,crash_weather,surface_state,road_slope,traffic_state,vehicle_type,vehicle_age,passenger_sex,passenger_age,passenger_safety,passenger_type,fatality
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1999,1,1,20,2.0,34,,1,5,3,03,06,9.0,1.0,41.0,,1,0
1,1999,1,1,20,2.0,34,,1,5,3,03,01,12.0,1.0,19.0,,1,0
3,1999,1,1,8,1.0,01,,5,3,6,18,01,13.0,1.0,46.0,,1,0
4,1999,1,1,8,1.0,01,,5,3,6,18,NN,,1.0,5.0,,3,0
5,1999,1,1,17,3.0,QQ,QQ,1,2,1,01,01,15.0,1.0,28.0,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5850152,2014,12,7,0,1.0,04,01,2,2,1,18,01,20.0,1.0,49.0,02,1,0
5850153,2014,12,7,18,1.0,02,01,2,5,1,18,01,4.0,1.0,20.0,02,1,0
5850154,2014,12,7,18,1.0,02,01,2,5,1,18,NN,,0.0,81.0,01,3,0
5850156,2014,12,7,14,2.0,41,01,1,5,1,18,05,6.0,1.0,44.0,02,1,0


In [94]:
# crear columna passenger count
#Una forma sería unir tablas por el indice en df está intacto en df2 está reseteado
#Esto solo se podria hacer si partimos del mismo df guardado del descrptive
df2 = pd.read_parquet('../data/vehicle_data_initial.parquet')
df2

Unnamed: 0,year,month,weekday,hour,vehicles_involved,crash_type,crash_place,crash_weather,surface_state,road_slope,traffic_state,vehicle_type,vehicle_age,fatality,passenger_count
0,1999,1,1,0,1.0,01,01,1,4,1,18,06,8.0,0,1
1,1999,1,1,0,1.0,01,01,1,5,1,18,01,18.0,0,1
2,1999,1,1,0,1.0,01,01,2,1,1,18,01,9.0,0,1
3,1999,1,1,0,1.0,06,01,2,5,3,18,QQ,2.0,0,2
4,1999,1,1,0,2.0,21,02,1,5,1,01,06,10.0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3043170,2014,12,7,23,2.0,35,01,1,2,1,01,01,1.0,0,2
3043171,2014,12,7,23,2.0,35,01,1,2,1,01,01,6.0,0,1
3043172,2014,12,7,23,3.0,21,02,1,5,1,01,01,15.0,0,4
3043173,2014,12,7,23,3.0,21,02,1,5,1,01,01,16.0,0,2


### Outliers

In [95]:
# Remove rows with an outlier vehicle_age
df = df.loc[df['vehicle_age'] < 30]

# Removes rows with outlier vehicles_involved
df = df.loc[df['vehicles_involved'] < 6]

In [96]:
#Transform month column to quarters
df['month'] = (df['month']-1)//3 + 1
df = df.rename({'month': 'quarter'}, axis= 1)

In [97]:
# Transform weekday column
df['weekday'] = df['weekday'].replace({6:3, 7:3, 1:2, 4:2, 5:2, 2:1, 3:1})

In [98]:
# Transform hour column
df['hour'] = df['hour'].replace({0:1, 1:1, 2:1, 3:1, 4:1, 5:1, 6:2, 7:2, 8:2, 9:3, 10:3, 11:3, 11:4, 12:4, 13:4, 14:4, 15:4, 16:4, 17:4, 18:4, 19:5, 20:5, 21:5, 22:6, 23:6})

In [99]:
# Mean encoding
encode_cols = ["crash_type", "crash_place", "crash_weather", "surface_state", "road_slope", "traffic_state", "vehicle_type", "passenger_safety", "passenger_type"]

for i in encode_cols:
    encoder = TargetEncoder()
    df[i] = encoder.fit_transform(df[i], df['fatality']) 

In [101]:
# One-hot-encoding
columnasfecha = ["hour", "weekday", "quarter", "passenger_sex"]

for i in columnasfecha:
    onehotencoder = OneHotEncoder(handle_unknown="ignore")
    array = df[i].values.reshape(-1,1)
    df_temp = onehotencoder.fit_transform(array)
    colnames = onehotencoder.get_feature_names([i])
    df_temp = pd.DataFrame(df_temp.toarray(), columns = colnames)
    df.index = df_temp.index
    df = pd.concat([df_temp, df], axis=1)

# We get rid of the not encoded columns
df = df.drop(columns = columnasfecha)

df = df.drop(columns = "passenger_sex_nan")

MemoryError: Unable to allocate 786. MiB for an array with shape (29, 3554568) and data type float64

In [None]:
df

In [88]:
new_cols = ['year','quarter_1','quarter_2','quarter_3','quarter_4','weekday_1','weekday_2','weekday_3','hour_1','hour_2','hour_3','hour_4','hour_5','hour_6','vehicles_involved','crash_type','crash_place',
            'crash_weather','surface_state','road_slope','traffic_state','vehicle_type','vehicle_age','passenger_age','passenger_sex_0.F','passenger_sex_1.M','passenger_safety','passenger_type','fatality']
df = df.reindex(columns=new_cols)

In [90]:
df

Unnamed: 0,year,quarter_1,quarter_2,quarter_3,quarter_4,weekday_1,weekday_2,weekday_3,hour_1,hour_2,...,road_slope,traffic_state,vehicle_type,vehicle_age,passenger_age,passenger_sex_0.F,passenger_sex_1.M,passenger_safety,passenger_type,fatality
0,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.032669,0.012502,0.020390,9.0,41.0,,,0.016547,0.014018,0
1,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.032669,0.012502,0.012242,12.0,19.0,,,0.016547,0.014018,0
2,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.028321,0.020684,0.012242,13.0,46.0,,,0.016547,0.014018,0
3,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.012022,0.004459,0.012242,15.0,28.0,,,0.016547,0.014018,0
4,1999,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.012022,0.004459,0.012242,8.0,21.0,,,0.016547,0.014018,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3554563,2014,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.018211,0.020684,0.012242,3.0,72.0,,,0.011504,0.014018,0
3554564,2014,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.012022,0.020684,0.012242,20.0,49.0,,,0.011504,0.014018,0
3554565,2014,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.012022,0.020684,0.012242,4.0,20.0,,,0.011504,0.014018,0
3554566,2014,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.012022,0.020684,0.014555,6.0,44.0,,,0.011504,0.014018,0


In [None]:
#Guardamos el df
#df.to_parquet("../data/full_data_encoded.parquet")

# Selección de Variables

In [None]:
X = df.drop('fatality', axis = 1)
Y = df.fatality

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)

In [None]:
scaler = StandardScaler()
model_scaled = scaler.fit(xtrain)
xtrain_scaled = pd.DataFrame(scaler.transform(xtrain), columns=xtrain.columns, index=xtrain.index)
xtest_scaled = pd.DataFrame(scaler.transform(xtest), columns=xtest.columns, index=xtest.index)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


### Regulariación  Ridge
Debemos transformar los NAs primero

In [None]:
sel_ridge = SelectFromModel(LogisticRegression(C=1, penalty='l2'), threshold = 0.02)
sel_ridge.fit(xtrain_scaled, ytrain)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Coeficientes del modelo
# ==============================================================================
df_coeficientes = pd.DataFrame(
                        {'predictor': xtrain_scaled.columns,
                         'coef': sel_ridge.estimator_.coef_.flatten()}
                  )

fig, ax = plt.subplots(figsize=(16, 3.84))
ax.stem(df_coeficientes.predictor, df_coeficientes.coef, markerfmt=' ')
plt.xticks(rotation=90, ha='right', size=10)
ax.set_xlabel('variable')
ax.set_ylabel('coeficientes')
ax.set_title('Coeficientes del modelo ridge');

### Boruta

In [None]:
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, max_depth=6) # cambiar los hiperparametros

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=0)

# find all relevant features - 5 features should be selected
feat_selector.fit(xtrain.values, ytrain) # Importante poner el .values al dataframe, sino dará error

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').