# Feature engineering

## Cargando librerías

In [1]:
import pandas as pd
import pickle
from datetime import date, timedelta
import numpy as np

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer

### Funciones extra, en utils

In [3]:
def generate_label(df):
    """
    Add a binary ``label`` column derived from the inspection result.

    The label is 1 when ``results`` is 'Pass' or 'Pass w/ Conditions' and 0
    for any other value (e.g. 'Fail'). The column is written into the
    dataframe that was passed in.

    :param df: dataframe with a ``results`` column
    :return: the same dataframe, now carrying ``label``
    """
    passing_results = ['Pass', 'Pass w/ Conditions']
    passed = df['results'].isin(passing_results)
    df['label'] = np.where(passed, 1, 0)
    return df

## Funciones de ingesta y transformación

In [4]:
# Funciones solicitadas

def load_ingestion(path='output/ingest_df.pkl'):
    """
    Load the pickle that was written during the ingestion step.

    :param path: location of the pickle file
    :return: the unpickled object (expected to be the ingestion dataframe)
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed after reading.
    with open(path, "rb") as pkl_file:
        return pickle.load(pkl_file)
    # TODO: could be replaced by a shared utils helper, e.g. load_df(path)

def date_transformation(col, df):
    """
    Parse one column of a dataframe as dates.

    :param col: name of the column to convert
    :param df: dataframe that holds the column
    :return: the column converted with ``pd.to_datetime``; ``df`` is not modified
    """
    raw_dates = df[col]
    return pd.to_datetime(raw_dates)

def numeric_transformation(col, df):
    """
    Cast one column of a dataframe to floating point.

    :param col: name of the column to convert
    :param df: dataframe that holds the column
    :return: the column as floats; ``df`` is not modified
    """
    as_float = df[col].astype(float)
    return as_float

def int_transformation(col, df):
    """
    Cast one column of a dataframe to integers.

    :param col: name of the column to convert
    :param df: dataframe that holds the column
    :return: the column as integers; ``df`` is not modified
    """
    as_int = df[col].astype(int)
    return as_int

def categoric_trasformation(col, df):
    """
    Cast one column of a dataframe to strings so it can be treated as categorical.

    :param col: name of the column to convert
    :param df: dataframe that holds the column
    :return: the column as strings; ``df`` is not modified
    """
    as_text = df[col].astype(str)
    return as_text

def hours_trasformation(col, df):
    """
    Normalise an hour column so that decimal values become hh:mm:ss strings.

    Entries that contain a literal '.' are passed through
    ``decimal_hr_to_hhmmss``; every other entry is kept as it is. The column is
    rewritten inside ``df``.

    :param col: name of the (string) column with the hour values
    :param df: dataframe that holds the column
    :return: the dataframe with the column updated
    """
    # NOTE(review): decimal_hr_to_hhmmss is not defined in this file; it is
    # expected to be provided elsewhere — confirm before using this function.
    # regex=False matches a literal dot (the original '\.' was an invalid escape
    # sequence in a non-raw string); na=False leaves missing values untouched.
    is_decimal = df[col].str.contains('.', regex=False, na=False)
    # Convert only the decimal rows so the helper never receives values that
    # are already in hh:mm:ss format.
    df.loc[is_decimal, col] = df.loc[is_decimal, col].apply(decimal_hr_to_hhmmss)
    return df


def split_fecha(col, df):
    """
    Add the month and the day of a datetime column as two integer columns.

    The new columns are named ``<col>_mes`` (month) and ``<col>_dia`` (day of
    month) and are written into ``df``. No year column is created.

    :param col: name of a datetime column
    :param df: dataframe that holds the column
    :return: the same dataframe with the two extra columns
    """
    date_parts = df[col].dt
    # Cast straight to int; going through str first added nothing.
    df[col + '_mes'] = date_parts.month.astype(int)
    df[col + '_dia'] = date_parts.day.astype(int)
    return df
  
def save_transformation(df, path='output/transformation_df.pkl'):
    """
    Save the transformed dataframe as a pickle.

    By default the file is written to ``output/transformation_df.pkl``.

    :param df: dataframe (or any picklable object) with the transformed data
    :param path: destination of the pickle file
    :return: None
    """
    # The context manager flushes and closes the file even if pickling fails.
    with open(path, "wb") as pkl_file:
        pickle.dump(df, pkl_file)
    # TODO: could be replaced by a shared utils helper, e.g. save_df(df, path)

## Funciones de feature engineering

In [5]:
# Funciones solicitadas

def load_transformation(path='output/transformation_df.pkl'): 
    """
    Load the pickle that was written during the transformation step.

    :param path: location of the pickle file
    :return: the unpickled object (expected to be the transformed dataframe)
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed after reading.
    with open(path, "rb") as pkl_file:
        return pickle.load(pkl_file)
    # TODO: could be replaced by a shared utils helper, e.g. load_df(path)
    
def feature_generation(data): 
    """
    Build the model input matrix from the transformed inspections dataframe.

    Cyclic (sin/cos) encodings of day of week, week, month and day of month
    are added to ``data`` itself. The categorical columns are then one-hot
    encoded and the numeric columns are min-max scaled.

    :param data: dataframe with the columns risk, zip, results, latitude,
        longitude, day_of_week, week, inspection_date_mes,
        inspection_date_dia, inspection_date and label
    :return: tuple ``(input_vars, df_final)``: the matrix produced by the
        column transformer, and a dataframe holding the same values under
        readable column names plus ``inspection_date`` and ``label``
    """
    # Cyclic variables (ciclic_variables adds its columns to `data`)
    for source_col in ('day_of_week', 'week',
                       'inspection_date_mes', 'inspection_date_dia'):
        data = ciclic_variables(source_col, data)

    # 'dba_name' and 'address' are left out; 'inspection_date' and 'label' are
    # carried over untransformed at the end.
    # NOTE(review): 'results' is the column the label is derived from, so
    # feeding it to a model leaks the target — confirm this is intended.
    categorical_cols = ['risk', 'zip', 'results']
    numeric_cols = ['latitude', 'longitude',
                    'sin_day_no', 'cos_day_no', 'sin_week', 'cos_week',
                    'sin_month', 'cos_month', 'sin_days', 'cos_days']

    data_input = pd.DataFrame(data, columns=categorical_cols + numeric_cols)

    # Transformations
    transformers_2 = [('one_hot', OneHotEncoder(), categorical_cols),
                      ('min_max', MinMaxScaler(), numeric_cols)]

    col_trans_2 = ColumnTransformer(transformers_2, remainder="drop", n_jobs=-1, verbose=True)
    col_trans_2.fit(data_input)

    input_vars = col_trans_2.transform(data_input)

    # Take the one-hot column names from the fitted encoder instead of a
    # hard-coded list, so they always match the categories present in the data.
    one_hot = col_trans_2.named_transformers_['one_hot']
    cols = []
    for categories in one_hot.categories_:
        cols.extend(str(category) for category in categories)
    cols.extend(numeric_cols)

    # The transformer returns a sparse matrix only when the result is sparse
    # enough; handle the dense case as well.
    dense_vars = input_vars.toarray() if hasattr(input_vars, 'toarray') else input_vars
    df_final = pd.DataFrame(dense_vars, columns=cols)

    # Assign by position (.values): df_final has a fresh 0..n-1 index, and
    # index alignment would misplace rows if `data` carries any other index.
    df_final['inspection_date'] = data['inspection_date'].values
    df_final['label'] = data['label'].values

    return input_vars, df_final
   
# PENDING: feature_selection is not implemented yet.
# The draft below is disabled with '#' comments rather than a triple-quoted
# string: its own docstring closed the enclosing string early, which left the
# docstring text as bare code (a syntax error).
# Known problems to solve before enabling it:
#   - `data`, `final_df` and `time` are not defined/imported in its scope
#   - `str(print(...))` prints the score and then formats None
#   - a comma is missing after 'longitud', so it is concatenated with 'RADIO'
#   - the selected column names do not match the columns produced by
#     feature_generation
#
# def feature_selection(input_vars):
#     """
#     Receive the data frame that holds the variables from which a
#     selection will be made.
#     """
#     X = input_vars
#     y = data.label.values.reshape(input_vars.shape[0], )
#
#     np.random.seed(20201124)
#
#     # we will use a random forest
#     classifier = RandomForestClassifier(oob_score=True, random_state=1234)
#     # split into train and test
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#
#     # hyperparameters to try; takes roughly 3 hours to run
#     hyper_param_grid = {'n_estimators': [100, 300],
#                         'max_depth': [1, 10, 15],
#                         'min_samples_split': [2, 5]}
#
#     # use grid search
#     gs = GridSearchCV(classifier,
#                       hyper_param_grid,
#                       scoring='precision',
#                       cv=3,
#                       n_jobs=2)
#
#     # fit the random forest
#     start_time = time.time()
#     gs.fit(X_train, y_train)
#     print("El proceso en segundos duro: ", time.time() - start_time)
#     print("Mejores parámetros: " + str(gs.best_params_))
#     print("Score:" + str(print(gs.best_score_)))
#     best_e = gs.best_estimator_
#     print("Mejor estimador: " + str(best_e))
#     print("Mejor estimador observado: " + str(gs.best_estimator_.oob_score_))
#
#     # Feature importances
#     feature_importance = pd.DataFrame({'importance': best_e.feature_importances_,
#                                        'feature': list(final_df.columns)})
#     print("Importancia de los parámetros")
#     print(feature_importance.sort_values(by="importance", ascending=False))
#
#     # Save the best model found
#     save_fe(best_e, path='../output/feature_selection_model.pkl')
#
#     # Return a dataframe with the features we will use.
#     # In this case the variables that contribute more than 7% of the
#     # information are: incidentes_c4, tipo_entrada, latitud and longitud
#     # (the first two go through one-hot encoding)
#     final_df = final_df[['accidente-ciclista',
#                          'BOTÓN DE AUXILIO',
#                          'latitud', 
#                          'longitud'
#                          'RADIO', 
#                          'accidente-choque con prensados',
#                          'lesionado-atropellado',
#                          'LLAMADA DEL 911',
#                          'LLAMADA DEL 066',
#                          'cos_hr_creacion', 'sin_hr_creacion',
#                          'sin_week_no', 'cos_week_no',
#                           'sin_day_no', 'cos_day_no'
#                          ]]    
#
#     return final_df
    
def save_fe(df, path='output/fe_df.pkl'):
    """
    Save the object holding the selected features as a pickle.

    By default the file is written to ``output/fe_df.pkl``.

    :param df: dataframe (or any picklable object) to store
    :param path: destination of the pickle file
    :return: None
    """
    # save_df is neither defined nor imported in this file, so calling it
    # raised NameError; write the pickle directly, as save_transformation does.
    with open(path, "wb") as pkl_file:
        pickle.dump(df, pkl_file)
    

# --------------------- Funciones Auxiliares ---------------------------    
def ciclic_variables(col, df):
    """
    Add sine/cosine (cyclic) encodings for a calendar column.

    Supported values of ``col`` and the columns they create:
      - 'day_of_week' (English day names): day_no, sin_day_no, cos_day_no (period 7)
      - 'week': sin_week, cos_week (period 53)
      - 'inspection_date_mes': sin_month, cos_month (period 12)
      - 'inspection_date_dia': sin_days, cos_days (period 31)
    Any other column name leaves the dataframe untouched.

    :param col: name of the column to encode
    :param df: dataframe that holds the column; the new columns are added to it
    :return: the same dataframe
    """
    full_turn = 2*np.pi

    if col == 'day_of_week':
        # Sunday is day 1, Saturday is day 7
        day_numbers = {'Sunday':1, 'Monday':2, 'Tuesday':3, 'Wednesday':4,
                       'Thursday':5, 'Friday':6, 'Saturday':7}
        days_in_week = 7
        df['day_no'] = df[col].apply(lambda day_name: day_numbers[day_name])
        df['sin_day_no'] = np.sin(full_turn*df['day_no']/days_in_week)
        df['cos_day_no'] = np.cos(full_turn*df['day_no']/days_in_week)

    # (source column, sine column, cosine column, period)
    numeric_encodings = (
        ('week', 'sin_week', 'cos_week', 53),
        ('inspection_date_mes', 'sin_month', 'cos_month', 12),
        ('inspection_date_dia', 'sin_days', 'cos_days', 31),
    )
    for source, sin_name, cos_name, period in numeric_encodings:
        if col == source:
            df[sin_name] = np.sin(full_turn*df[col]/period)
            df[cos_name] = np.cos(full_turn*df[col]/period)

    return df
    


# Carga de datos

In [6]:
# Load the ingested inspections; the context manager closes the file handle
# that the bare open() call used to leave open.
with open('output/2020-02-02_.pkl', "rb") as pkl_file:
    data = pickle.load(pkl_file)
data.head()

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2484780,RED STAR LIQUORS,RED STAR LIQUORS,2432531,Grocery Store,Risk 2 (Medium),2725-2727 N MILWAUKEE AVE,CHICAGO,IL,60647,2021-02-11T00:00:00.000,Complaint Re-Inspection,Pass,57. ALL FOOD EMPLOYEES HAVE FOOD HANDLER TRAIN...,41.93078708970233,-87.70998753402094,"{'latitude': '-87.70998753402094', 'longitude'..."
1,2484768,MARZ COMMINITY BREWING CO.,MARZ COMMUNITY BREWING CO.,2770949,Liquor,Risk 3 (Low),1950 N WESTERN AVE,CHICAGO,IL,60647,2021-02-11T00:00:00.000,License,Pass,"53. TOILET FACILITIES: PROPERLY CONSTRUCTED, S...",41.917134801370274,-87.68754385201615,"{'latitude': '-87.68754385201615', 'longitude'..."
2,2484766,LA FORTALEZA DE ARCELIA GUERRERO,LA FORTALEZA DE ARCELIA GUERRERO,2575370,Restaurant,Risk 1 (High),5958 W DIVERSEY AVE,CHICAGO,IL,60639,2021-02-11T00:00:00.000,Complaint Re-Inspection,Pass w/ Conditions,37. FOOD PROPERLY LABELED; ORIGINAL CONTAINER ...,41.93124981809584,-87.7759069982422,"{'latitude': '-87.7759069982422', 'longitude':..."
3,2484759,Jimmy Johns,Jimmy Johns,1898470,Restaurant,Risk 1 (High),51 W DIVISION ST,CHICAGO,IL,60610,2021-02-11T00:00:00.000,Canvass Re-Inspection,Pass,,41.903806014540685,-87.63058113117621,"{'latitude': '-87.63058113117621', 'longitude'..."
4,2484775,CAFE TOLA #4,CAFE TOLA,2627325,Restaurant,Risk 1 (High),2489 N MILWAUKEE AVE,CHICAGO,IL,60647,2021-02-11T00:00:00.000,Canvass Re-Inspection,Pass,,41.92663502691596,-87.70319925259288,"{'latitude': '-87.70319925259288', 'longitude'..."


In [7]:
# Keep only the inspections whose result is 'Pass', 'Pass w/ Conditions' or 'Fail'
df = data.loc[data['results'].isin(['Pass','Pass w/ Conditions', 'Fail'])]

In [8]:
# Optionally restrict the data to inspections of type 'Canvass'
df = df.loc[df['inspection_type'].isin(['Canvass'])]

In [9]:
# Generate the label. `df` is a filtered selection of `data`, so work on an
# explicit copy: generate_label writes a new column into the frame it receives,
# and writing into a slice triggers pandas' SettingWithCopyWarning.
df = generate_label(df.copy())

In [10]:
# Drop the rows that have a missing value in any of the required columns
df1 = df.dropna(subset=['facility_type', 'risk', 'city', 'state', 'zip',
                        'latitude', 'longitude', 'location'])

In [11]:
# Missing values left per column
df1.isnull().sum()

inspection_id         0
dba_name              0
aka_name            306
license_             10
facility_type         0
risk                  0
address               0
city                  0
state                 0
zip                   0
inspection_date       0
inspection_type       0
results               0
violations         3664
latitude              0
longitude             0
location              0
label                 0
dtype: int64

In [12]:
# Keep only the columns used for prediction
df_final = df1.loc[:, ['facility_type','risk','address','zip',
                       'inspection_date','results','latitude','longitude','label']]
df_final.head()


Unnamed: 0,facility_type,risk,address,zip,inspection_date,results,latitude,longitude,label
12,School,Risk 1 (High),5165 S State ST,60609,2021-02-10T00:00:00.000,Pass,41.80001174659164,-87.62569446789126,1
13,Restaurant,Risk 3 (Low),3012 W CERMAK RD,60623,2021-02-10T00:00:00.000,Pass,41.85193888133392,-87.70089765429343,1
15,Restaurant,Risk 1 (High),3323 W NORTH AVE,60647,2021-02-10T00:00:00.000,Fail,41.90995972342911,-87.71042949751607,0
24,Restaurant,Risk 1 (High),1760 W CHICAGO AVE,60622,2021-02-10T00:00:00.000,Fail,41.89610267284695,-87.67211594992528,0
26,Restaurant,Risk 1 (High),3018 W CERMAK RD,60623,2021-02-10T00:00:00.000,Pass,41.85193447241584,-87.70113231549328,1


In [13]:
# Confirm that the selected columns have no missing values
df_final.isnull().sum()

facility_type      0
risk               0
address            0
zip                0
inspection_date    0
results            0
latitude           0
longitude          0
label              0
dtype: int64

In [14]:
# Inspect the rows that repeat an earlier row exactly
duplicate = df_final.loc[df_final.duplicated()]
duplicate

Unnamed: 0,facility_type,risk,address,zip,inspection_date,results,latitude,longitude,label
62,Restaurant,Risk 2 (Medium),11601 W TOUHY AVE,60666,2021-02-08T00:00:00.000,Pass,42.008536400868735,-87.91442843927047,1
199,Restaurant,Risk 1 (High),7435 W TALCOTT AVE,60631,2021-02-02T00:00:00.000,Pass,41.98857518279947,-87.81297791219501,1
273,Grocery Store,Risk 1 (High),424 W DIVISION ST,60610,2021-02-01T00:00:00.000,Pass,41.90388396114024,-87.63943720155201,1
410,Restaurant,Risk 1 (High),108 E SUPERIOR ST,60611,2021-01-27T00:00:00.000,Pass,41.89583963820686,-87.62498814131581,1
439,Restaurant,Risk 1 (High),108 E SUPERIOR ST,60611,2021-01-27T00:00:00.000,Pass,41.89583963820686,-87.62498814131581,1
...,...,...,...,...,...,...,...,...,...
216073,Restaurant,Risk 1 (High),222 W MERCHANDISE MART PLZ,60654,2010-01-07T00:00:00.000,Pass,41.8880742810662,-87.63495520292739,1
216080,Restaurant,Risk 1 (High),222 W MERCHANDISE MART PLZ,60654,2010-01-07T00:00:00.000,Pass,41.8880742810662,-87.63495520292739,1
216089,Restaurant,Risk 1 (High),2800 W IRVING PARK RD,60618,2010-01-07T00:00:00.000,Pass,41.954083317714684,-87.69830277007019,1
216101,Restaurant,Risk 1 (High),900 N NORTH BRANCH ST,60642,2010-01-07T00:00:00.000,Pass,41.89896533484934,-87.64809026360946,1


In [15]:
# Remove duplicated rows. The explicit copy detaches the result from the
# frame it was selected from, so the column assignments in the following
# cells do not raise SettingWithCopyWarning.
df_final = df_final.drop_duplicates().copy()
df_final

Unnamed: 0,facility_type,risk,address,zip,inspection_date,results,latitude,longitude,label
12,School,Risk 1 (High),5165 S State ST,60609,2021-02-10T00:00:00.000,Pass,41.80001174659164,-87.62569446789126,1
13,Restaurant,Risk 3 (Low),3012 W CERMAK RD,60623,2021-02-10T00:00:00.000,Pass,41.85193888133392,-87.70089765429343,1
15,Restaurant,Risk 1 (High),3323 W NORTH AVE,60647,2021-02-10T00:00:00.000,Fail,41.90995972342911,-87.71042949751607,0
24,Restaurant,Risk 1 (High),1760 W CHICAGO AVE,60622,2021-02-10T00:00:00.000,Fail,41.89610267284695,-87.67211594992528,0
26,Restaurant,Risk 1 (High),3018 W CERMAK RD,60623,2021-02-10T00:00:00.000,Pass,41.85193447241584,-87.70113231549328,1
...,...,...,...,...,...,...,...,...,...
216251,School,Risk 1 (High),2010 N CENTRAL PARK AVE BLDG,60647,2010-01-05T00:00:00.000,Pass,41.91762127382847,-87.71692852023781,1
216255,GAS STATION/RESTAURANT,Risk 2 (Medium),970 W PERSHING RD,60609,2010-01-05T00:00:00.000,Pass w/ Conditions,41.82350359266238,-87.65066637652201,1
216256,School,Risk 1 (High),4219 N LINCOLN AVE,60618,2010-01-05T00:00:00.000,Pass,41.95838817038278,-87.68155929632958,1
216258,Restaurant,Risk 1 (High),3050-3052 W 26TH ST,60623,2010-01-05T00:00:00.000,Pass,41.844686619118384,-87.70218762267689,1


In [16]:
# Column dtypes and non-null counts before the type transformations
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87508 entries, 12 to 216264
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   facility_type    87508 non-null  object
 1   risk             87508 non-null  object
 2   address          87508 non-null  object
 3   zip              87508 non-null  object
 4   inspection_date  87508 non-null  object
 5   results          87508 non-null  object
 6   latitude         87508 non-null  object
 7   longitude        87508 non-null  object
 8   label            87508 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 6.7+ MB


### Iniciamos transformaciones de datos

In [17]:
# Categorical columns are stored as strings
# (dba_name, city and state are not transformed)
for column in ('facility_type', 'risk', 'address', 'zip'):
    df_final[column] = categoric_trasformation(column, df_final)
# Numeric columns are stored as floats
for column in ('latitude', 'longitude'):
    df_final[column] = numeric_transformation(column, df_final)

In [18]:
# Parse the inspection date
df_final['inspection_date'] = date_transformation('inspection_date', df_final)
# Split the date into month and day columns
df_final = split_fecha("inspection_date", df_final)
# Series.dt.week is deprecated since pandas 1.1 and removed in 2.0; use
# isocalendar() when it is available. It returns UInt32, so cast to plain
# integers before the value is used in the cyclic encodings.
inspection_dt = df_final['inspection_date'].dt
if hasattr(inspection_dt, 'isocalendar'):
    df_final['week'] = inspection_dt.isocalendar().week.astype(int)
else:
    df_final['week'] = inspection_dt.week
df_final['day_of_week'] = inspection_dt.day_name()
#df_final.info()


In [19]:
# Order the rows by inspection date (oldest first)
df_final = df_final.sort_values(by='inspection_date')
df_final

Unnamed: 0,facility_type,risk,address,zip,inspection_date,results,latitude,longitude,label,inspection_date_mes,inspection_date_dia,week,day_of_week
216264,Restaurant,Risk 1 (High),6 E CHESTNUT ST,60611,2010-01-04,Fail,41.898431,-87.628009,0,1,4,1,Monday
216194,Restaurant,Risk 1 (High),4852 S PULASKI RD,60632,2010-01-05,Pass,41.804468,-87.723633,1,1,5,1,Tuesday
216200,Restaurant,Risk 1 (High),7148 N HARLEM AVE,60631,2010-01-05,Pass,42.011357,-87.806788,1,1,5,1,Tuesday
216210,School,Risk 1 (High),3616 W ARMITAGE AVE,60647,2010-01-05,Pass,41.917384,-87.717464,1,1,5,1,Tuesday
216216,Restaurant,Risk 2 (Medium),30 W ERIE ST,60654,2010-01-05,Pass,41.894165,-87.629389,1,1,5,1,Tuesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,Restaurant,Risk 1 (High),3018 W CERMAK RD,60623,2021-02-10,Pass,41.851934,-87.701132,1,2,10,6,Wednesday
24,Restaurant,Risk 1 (High),1760 W CHICAGO AVE,60622,2021-02-10,Fail,41.896103,-87.672116,0,2,10,6,Wednesday
15,Restaurant,Risk 1 (High),3323 W NORTH AVE,60647,2021-02-10,Fail,41.909960,-87.710429,0,2,10,6,Wednesday
13,Restaurant,Risk 3 (Low),3012 W CERMAK RD,60623,2021-02-10,Pass,41.851939,-87.700898,1,2,10,6,Wednesday


In [20]:
# Replace the index with a fresh 0..n-1 range (the old index is discarded).
# feature_generation copies inspection_date and label into a newly built
# dataframe by index, so the rows must be numbered consecutively here.
df_final = df_final.reset_index(drop=True)
df_final

Unnamed: 0,facility_type,risk,address,zip,inspection_date,results,latitude,longitude,label,inspection_date_mes,inspection_date_dia,week,day_of_week
0,Restaurant,Risk 1 (High),6 E CHESTNUT ST,60611,2010-01-04,Fail,41.898431,-87.628009,0,1,4,1,Monday
1,Restaurant,Risk 1 (High),4852 S PULASKI RD,60632,2010-01-05,Pass,41.804468,-87.723633,1,1,5,1,Tuesday
2,Restaurant,Risk 1 (High),7148 N HARLEM AVE,60631,2010-01-05,Pass,42.011357,-87.806788,1,1,5,1,Tuesday
3,School,Risk 1 (High),3616 W ARMITAGE AVE,60647,2010-01-05,Pass,41.917384,-87.717464,1,1,5,1,Tuesday
4,Restaurant,Risk 2 (Medium),30 W ERIE ST,60654,2010-01-05,Pass,41.894165,-87.629389,1,1,5,1,Tuesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87503,Restaurant,Risk 1 (High),3018 W CERMAK RD,60623,2021-02-10,Pass,41.851934,-87.701132,1,2,10,6,Wednesday
87504,Restaurant,Risk 1 (High),1760 W CHICAGO AVE,60622,2021-02-10,Fail,41.896103,-87.672116,0,2,10,6,Wednesday
87505,Restaurant,Risk 1 (High),3323 W NORTH AVE,60647,2021-02-10,Fail,41.909960,-87.710429,0,2,10,6,Wednesday
87506,Restaurant,Risk 3 (Low),3012 W CERMAK RD,60623,2021-02-10,Pass,41.851939,-87.700898,1,2,10,6,Wednesday


In [21]:
# Column headers that one-hot encoding facility_type would produce
pd.get_dummies(df_final['facility_type']).columns

Index(['(convenience store)', '1023', '1023 CHILDERN'S SERVICE FACILITY',
       '1023 CHILDERN'S SERVICE S FACILITY',
       '1023 CHILDERN'S SERVICES FACILITY',
       '1023 CHILDREN'S SERVICES FACILITY',
       '1023-CHILDREN'S SERVICES FACILITY', '15 monts to 5 years old',
       'A-Not-For-Profit Chef Training Program', 'ADULT DAYCARE',
       ...
       'newsstand', 'night club', 'school cafeteria', 'smoothie bar',
       'snack shop', 'tavern', 'theater', 'video store', 'weight loss program',
       'youth housing'],
      dtype='object', length=309)

In [22]:
#df_final.facility_type.to_csv('facility_type.csv') # relative position

In [23]:
# Column headers that one-hot encoding risk would produce
pd.get_dummies(df_final['risk']).columns

Index(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)'], dtype='object')

In [24]:
#pd.get_dummies(df_final.city).columns #Podemos quitar esta variable

In [25]:
#pd.get_dummies(df_final.state).columns #Podemos quitar esta variable

In [26]:
# Column headers that one-hot encoding zip would produce
pd.get_dummies(df_final['zip']).columns

Index(['60601', '60602', '60603', '60604', '60605', '60606', '60607', '60608',
       '60609', '60610', '60611', '60612', '60613', '60614', '60615', '60616',
       '60617', '60618', '60619', '60620', '60621', '60622', '60623', '60624',
       '60625', '60626', '60627', '60628', '60629', '60630', '60631', '60632',
       '60633', '60634', '60636', '60637', '60638', '60639', '60640', '60641',
       '60642', '60643', '60644', '60645', '60646', '60647', '60649', '60651',
       '60652', '60653', '60654', '60655', '60656', '60657', '60659', '60660',
       '60661', '60666', '60707', '60827'],
      dtype='object')

In [27]:
# Column headers that one-hot encoding results would produce
pd.get_dummies(df_final['results']).columns

Index(['Fail', 'Pass', 'Pass w/ Conditions'], dtype='object')

In [28]:
# Distinct facility types after lower-casing and stripping whitespace.
# A notebook cell only displays its last expression, so the count has to be
# printed explicitly; the normalised values are computed once and reused.
facility_types = df_final["facility_type"].str.lower().str.strip().unique()
print(len(facility_types))
facility_types

array(['restaurant', 'school', 'grocery store', 'gas station/restaurant',
       'catering', 'bakery', 'daycare (2 - 6 years)', 'hospital',
       'daycare above and under 2 years', 'daycare (under 2 years)',
       'shelter', 'golden diner', 'liquor',
       "children's services facility", 'wholesale', 'long term care',
       'nursing home', 'after school program', 'daycare (2 years)',
       'shared kitchen', 'incubator', 'coffee shop', 'daycare combo 1586',
       'restaurant/bar', 'theater', 'tavern', 'assisted living',
       'long-term care facility', 'assissted living', 'cooking school',
       'pool', 'gas station', 'long term care facility', 'special event',
       'candy/gelato', 'convenience/drug store', 'kiosk', 'commissary',
       'culinary school', 'culinary arts school', 'grocery(sushi prep)',
       'banquet hall', 'public shcool', 'grocery/butcher',
       'private school', 'mobile food dispenser', 'grocery/restaurant',
       'bakery/restaurant', 'ice cream shop',
 

In [29]:
# Store facility_type lower-cased and without leading/trailing whitespace
df_final["facility_type"] = df_final["facility_type"].str.lower().str.strip()

In [30]:
# Number of inspections per facility type and their share of the total
prop_codigo = (
    df_final.groupby(['facility_type'], as_index=False)['inspection_date']
    .count()
    .rename(columns={'inspection_date': 'count'})
)
prop_codigo['prop'] = prop_codigo['count'] / prop_codigo['count'].sum()
# Show the 60 most frequent facility types
prop_codigo.sort_values(by='prop', ascending=False).head(60)

Unnamed: 0,facility_type,count,prop
219,restaurant,60200,0.687937
236,school,9611,0.10983
129,grocery store,8889,0.101579
16,bakery,1473,0.016833
54,children's services facility,1184,0.01353
89,daycare (2 - 6 years),835,0.009542
177,long term care,832,0.009508
93,daycare above and under 2 years,830,0.009485
45,catering,602,0.006879
124,golden diner,441,0.00504


In [31]:
# Build the features: in_v is the transformed matrix, fe_df the same values
# as a dataframe with named columns plus inspection_date and label
in_v, fe_df = feature_generation(df_final)
fe_df

Unnamed: 0,Risk 1 (High),Risk 2 (Medium),Risk 3 (Low),60601,60602,60603,60604,60605,60606,60607,...,sin_day_no,cos_day_no,sin_week,cos_week,sin_month,cos_month,sin_days,cos_days,inspection_date,label
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.000000,3.568959e-01,0.559163,0.996487,0.750000,0.933013,0.862862,0.844083,2010-01-04,0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.722521,5.551115e-17,0.559163,0.996487,0.750000,0.933013,0.924867,0.763876,2010-01-05,1
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.722521,5.551115e-17,0.559163,0.996487,0.750000,0.933013,0.924867,0.763876,2010-01-05,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.722521,5.551115e-17,0.559163,0.996487,0.750000,0.933013,0.924867,0.763876,2010-01-05,1
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.722521,5.551115e-17,0.559163,0.996487,0.750000,0.933013,0.924867,0.763876,2010-01-05,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87503,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.277479,0.000000e+00,0.826554,0.878649,0.933013,0.750000,0.949479,0.277951,2021-02-10,1
87504,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.277479,0.000000e+00,0.826554,0.878649,0.933013,0.750000,0.949479,0.277951,2021-02-10,0
87505,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.277479,0.000000e+00,0.826554,0.878649,0.933013,0.750000,0.949479,0.277951,2021-02-10,0
87506,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.277479,0.000000e+00,0.826554,0.878649,0.933013,0.750000,0.949479,0.277951,2021-02-10,1


In [35]:
# Display the transformed input matrix returned by feature_generation
in_v

<87508x76 sparse matrix of type '<class 'numpy.float64'>'
	with 1075594 stored elements in Compressed Sparse Row format>