In [1]:
def prev_comex_ncm(
    comex_ncm,cat_cols=['CO_MES','nick','SG_UF_NCM','CO_PAIS','CO_URF'],target_cols=['oc','KG_LIQUIDO'],md_c=15,md_r=1
                  ):
    '''Predict the first and the last date of a Comexstat-style table with one training session.

    Rows whose ``date`` lies strictly between the minimum and the maximum date of the table
    form the training set; the rows at the minimum date and at the maximum date are the two
    test sets.

    The work is done in two stages:

    1. Classification - the table is expanded with the feature combinations that already
       occur in the data (only already-seen combinations are multiplied, to keep the
       expansion small), every row is flagged with ``oc`` = 1 when the quantity column is
       greater than zero and ``oc`` = 0 otherwise, and a RandomForestClassifier predicts
       that flag.
    2. Regression - a RandomForestRegressor predicts the quantity column.

    Parameters
    ----------
    comex_ncm : pandas.DataFrame
        Input table. The code reads the columns ``date``, ``CO_ANO``, ``CO_MES``, every
        name in ``cat_cols`` and ``target_cols[1]``. The caller's frame is not modified:
        all later changes are applied to the merged copy built inside the function.
    cat_cols : list of str
        Categorical feature columns; they are one-hot encoded with ``'~'`` as separator.
    target_cols : list of str
        ``[flag column, quantity column]``.
        NOTE(review): the flag is also referred to literally as ``oc`` / ``oc_pred`` in the
        body, so a different name for ``target_cols[0]`` is unlikely to work - confirm.
    md_c : int
        ``max_depth`` of the RandomForestClassifier.
    md_r : int
        ``max_depth`` of the RandomForestRegressor.

    Returns
    -------
    pandas.DataFrame
        Classification and regression predictions for both test dates merged on their
        shared columns, with the one-hot columns decoded back into one column per feature.
        Predictions are stored in ``<target>_pred`` columns and ``date`` is in the integer
        form produced by ``astype(int)``.
    '''

    # Load ML algorithms (imported locally, so the aliases only exist inside this function)
    from sklearn.ensemble import RandomForestClassifier as RFC_
    from sklearn.ensemble import RandomForestRegressor as RFR_

    # Private copies: the defaults are shared list objects and must never be mutated,
    # and this also lets callers pass tuples.
    cat_cols = list(cat_cols)
    target_cols = list(target_cols)

    # Helpers ##################################################################################

    def train_test_max_test_min(df, pred_max, pred_min):
        '''Split df into [train, test_max, test_min] according to its (integer) date column.'''
        # Convert once instead of once per comparison
        dates = df.date.astype('datetime64[ns]')
        train = df[(dates > pred_min) & (dates < pred_max)]
        test_max = df[dates == pred_max]
        test_min = df[dates == pred_min]
        return [train, test_max, test_min]

    def Xy(train, test_max, test_min, col_target, col_del):
        '''Split each set into features X and target y; col_del is removed from the features.'''
        dropped = [col_target] + [col_del]
        X_train = train.drop(columns=dropped)
        y_train = train[col_target]
        X_test_max = test_max.drop(columns=dropped)
        y_test_max = test_max[col_target]
        X_test_min = test_min.drop(columns=dropped)
        y_test_min = test_min[col_target]
        return [X_train, y_train, X_test_max, y_test_max, X_test_min, y_test_min]

    def fit_pred(RF, md, X_train, y_train, X_test_max, y_test_max, X_test_min, y_test_min, col_target):
        '''Fit RF(max_depth=md) and return [test_max, test_min] with prediction and truth columns.'''
        model = RF(max_depth=md)
        model.fit(X_train, y_train)

        true_col = str(col_target)
        pred_col = true_col + '_pred'

        # Prediction column first, then the observed target, for each test set
        test_pred_max = X_test_max.assign(**{pred_col: model.predict(X_test_max)})
        test_pred_max = test_pred_max.assign(**{true_col: y_test_max})

        test_pred_min = X_test_min.assign(**{pred_col: model.predict(X_test_min)})
        test_pred_min = test_pred_min.assign(**{true_col: y_test_min})

        return [test_pred_max, test_pred_min]

    def decode(df):
        '''Replace the one-hot (bool) columns of df by one decoded column per feature.'''
        # Relies on the dummy columns having bool dtype and on '~' as the prefix separator
        return pd.concat([
            df.select_dtypes(exclude='bool'),
            pd.from_dummies(df.select_dtypes(include='bool'), sep='~')], axis=1)

    # Classification ###########################################################################

    # First and last date of the table: these are the two dates to be predicted
    pred_min = comex_ncm.date.min()
    pred_max = comex_ncm.date.max()

    # Matrix 1: the years to expand.
    # NOTE(review): np.arange(max, max + 1) contains only the latest year, so only that year
    # receives the non-occurrence rows - confirm this is intended and not (min, max + 1).
    last_year = comex_ncm.CO_ANO.max()
    anos = pd.DataFrame({'CO_ANO': np.arange(last_year, last_year + 1), 'key': 0})

    # Matrix 2: the feature combinations that already occur
    cat = comex_ncm[cat_cols].drop_duplicates().assign(key=0)

    # Multiply both matrices (the constant 'key' column turns the merge into a cross product)
    c_c = anos.merge(cat, how='outer').drop(columns='key')
    # Rows that exist only in the cross product get 0 in every other column
    comex_ncm = c_c.merge(comex_ncm, how='outer').fillna(0)

    # Rebuild the date column (the added rows have no valid date); uses CO_ANO and CO_MES
    comex_ncm['date'] = pd.to_datetime(
        comex_ncm.CO_ANO.astype(str) + '/' + comex_ncm.CO_MES.astype(str) + '/1')

    # Trim to the original date range
    comex_ncm = comex_ncm[(comex_ncm.date <= pred_max) & (comex_ncm.date >= pred_min)]

    # Flag occurrences: 1 when the quantity is positive, 0 otherwise
    comex_ncm = comex_ncm.assign(oc=0)
    comex_ncm.loc[comex_ncm[target_cols[1]] > 0, 'oc'] = 1

    # One-hot encode the features
    comex_ncm[cat_cols] = comex_ncm[cat_cols].astype('category')
    comex_ncm_dumm = pd.get_dummies(comex_ncm, prefix_sep='~')

    # The integer date replaces the year column; it keeps the months ordered for the algorithms
    comex_ncm_dumm.date = comex_ncm_dumm.date.astype(int)
    comex_ncm_dumm.drop(columns='CO_ANO', inplace=True)

    # Split, separate X / y (the quantity column is removed from the features), fit and predict
    [train_c, test_max_c, test_min_c] = train_test_max_test_min(comex_ncm_dumm, pred_max, pred_min)
    [X_train_c, y_train_c, X_test_max_c, y_test_max_c, X_test_min_c, y_test_min_c] = Xy(
        train_c, test_max_c, test_min_c, target_cols[0], target_cols[1])
    test_pred_max_c, test_pred_min_c = fit_pred(
        RFC_, md_c, X_train_c, y_train_c, X_test_max_c, y_test_max_c, X_test_min_c, y_test_min_c,
        target_cols[0])

    # Regression ###############################################################################

    # Keep only real occurrences (undoing the sparse expansion); for the tests keep the rows
    # that occurred AND were predicted to occur.
    # Fix: the min-date selection used to be stored in test_max_r, discarding the max-date one.
    train_r = train_c[train_c.oc == 1]
    test_max_r = test_pred_max_c[(test_pred_max_c.oc == 1) & (test_pred_max_c.oc_pred == 1)]
    test_min_r = test_pred_min_c[(test_pred_min_c.oc == 1) & (test_pred_min_c.oc_pred == 1)]

    # Decode the one-hot features of the train and test sets
    train_r = decode(train_r)
    test_max_r = decode(test_max_r)
    test_min_r = decode(test_min_r)

    # Feature combinations of both test sets that the classifier predicted to occur
    filtro = pd.concat([test_max_r, test_min_r])[cat_cols].drop_duplicates()
    # Keep only those combinations in the train set
    train_r = train_r.merge(filtro)

    # The test sets come from the classification frames and lack the quantity column, so it
    # is brought back by merging with the expanded table (date converted to match).
    comex_ncm.date = comex_ncm.date.astype('int')
    test_max_r = test_max_r.merge(comex_ncm)
    test_min_r = test_min_r.merge(comex_ncm)

    # Table restricted to the combinations predicted to occur
    comex_ncm_r = pd.concat([train_r, test_max_r, test_min_r])

    # NOTE(review): the encoding below uses the full `comex_ncm`, not `comex_ncm_r`, so the
    # filtered table built above does not reach the regressor and the regression is fitted on
    # every row (including the zero-quantity rows). Switching to `comex_ncm_r` would change
    # which rows are returned, and the decoded feature values in it presumably no longer have
    # the original dtypes - left unchanged until the intended behaviour is confirmed.
    comex_ncm[cat_cols] = comex_ncm[cat_cols].astype('category')
    comex_ncm_dumm = pd.get_dummies(comex_ncm, prefix_sep='~')

    # Date to integer format (again)
    comex_ncm_dumm.date = comex_ncm_dumm.date.astype(int)

    # Split, separate X / y (the flag column is removed from the features), fit and predict
    [train_r, test_max_r, test_min_r] = train_test_max_test_min(comex_ncm_dumm, pred_max, pred_min)
    [X_train_r, y_train_r, X_test_max_r, y_test_max_r, X_test_min_r, y_test_min_r] = Xy(
        train_r, test_max_r, test_min_r, target_cols[1], target_cols[0])
    test_pred_max_r, test_pred_min_r = fit_pred(
        RFR_, md_r, X_train_r, y_train_r, X_test_max_r, y_test_max_r, X_test_min_r, y_test_min_r,
        target_cols[1])

    # Results ##################################################################################

    # Stack the predictions of both test dates
    test_pred_c = pd.concat([test_pred_max_c, test_pred_min_c])
    test_pred_r = pd.concat([test_pred_max_r, test_pred_min_r])

    # Merge classification and regression results on all the columns they share
    test_pred = test_pred_c.merge(test_pred_r)

    # Decode the one-hot columns and return everything in a single frame
    return decode(test_pred)

In [3]:
# Announce that the function is available and show the inputs it accepts
print(
    "prev_comex_ncm(\n"
    "    comex_ncm,\n"
    "    cat_cols=['CO_MES','nick','SG_UF_NCM','CO_PAIS','CO_URF'],\n"
    "    target_cols=['oc','KG_LIQUIDO'],\n"
    "    md_c=15,\n"
    "    md_r=1\n"
    "                  )"
)

# Run it on the loaded table, spelling out the same values the signature uses as defaults;
# the result is left as the cell's last expression.
prev_comex_ncm(
    comex_ncm,
    cat_cols=['CO_MES', 'nick', 'SG_UF_NCM', 'CO_PAIS', 'CO_URF'],
    target_cols=['oc', 'KG_LIQUIDO'],
    md_c=15,
    md_r=1,
)


In [None]:
# Report the aliases under which the two random-forest estimators are imported
print(
    "RandomForestClassifier as RFC_\n\n"
    "RandomForestRegressor as RFR_"
)