Notebook created to test the data pipeline

In [1]:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

For performance reasons, you may prefer to download the parquet files and work from a local folder.

If you choose to do this, please create a folder called `local` inside `challenge_files` and place the parquet files there.

In [2]:
# Read the five parquet tables from the `local` folder in one pass;
# the names on the left line up one-to-one with the paths below.
itens, teste, treino, validacao, validacao_k = map(
    pd.read_parquet,
    [
        'local/itens.parquet',
        'local/teste.parquet',
        'local/treino.parquet',
        'local/validacao.parquet',
        'local/validacao_k.parquet',
    ],
)

In [3]:
# List the training-set columns; the pipeline settings further down refer to them by name.
print(treino.columns)

Index(['userId', 'userType', 'historySize', 'history', 'timestampHistory',
       'numberOfClicksHistory', 'timeOnPageHistory', 'scrollPercentageHistory',
       'pageVisitsCountHistory', 'timestampHistory_new'],
      dtype='object')


In [4]:

# TODO: move the functions below into a Python module of the project (maybe utils?)

def split_multivalued_df(df: pd.DataFrame, split_columns: list) -> pd.DataFrame:
    """Expand comma-separated columns so that each value gets its own row.

    Every column in ``split_columns`` is split on ``','`` and the resulting
    lists are exploded together, so the i-th value of each split column lands
    on the same output row. Columns that are not listed are repeated on every
    row produced from their original row.

    The input frame is not modified: the split is applied to a copy, so the
    function can be called again on the same frame (for example when a
    notebook cell is re-run) and return the same result. Whitespace around
    each value is stripped, so ``'a, b'`` yields ``'a'`` and ``'b'`` rather
    than ``'a'`` and ``' b'`` (the latter would not match in a key-based
    merge).

    Args:
        df: Frame whose ``split_columns`` hold comma-separated strings.
        split_columns: Names of the columns to split and explode. Within a
            row, all of them must hold the same number of values.

    Returns:
        A new frame with one row per value and a fresh ``0..n-1`` index.

    Raises:
        ValueError: Propagated from ``DataFrame.explode`` when the split
            columns of a row do not have matching element counts.
    """
    # Work on a copy so the caller's frame keeps its original string columns.
    expanded_df = df.copy()
    expanded_df[split_columns] = expanded_df[split_columns].apply(
        # Strip the ends, then let the separator swallow surrounding whitespace.
        lambda col: col.str.strip().str.split(r'\s*,\s*', regex=True)
    )
    return expanded_df.explode(split_columns, ignore_index=True)


def drop_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    """Return a new frame without the columns named in ``columns_to_drop``.

    The input frame is left as it is. pandas raises ``KeyError`` when one of
    the names is not a column of ``df``.
    """
    return df.drop(columns=columns_to_drop)


def merge_dfs(
        df_a: pd.DataFrame,
        df_b: pd.DataFrame,
        df_a_key: str,
        df_b_key: str) -> pd.DataFrame:
    """Inner-join ``df_a`` with ``df_b`` on key columns that may differ in name.

    Rows are kept only when the value in ``df_a[df_a_key]`` also appears in
    ``df_b[df_b_key]``. Both key columns are present in the result when their
    names differ.

    Args:
        df_a: Left frame of the join.
        df_b: Right frame of the join.
        df_a_key: Name of the key column in ``df_a``.
        df_b_key: Name of the key column in ``df_b``.

    Returns:
        The joined frame.
    """
    return df_a.merge(df_b, how='inner', left_on=df_a_key, right_on=df_b_key)

Notes:

    1 - I have confirmed that timestampHistory_new and timestampHistory always hold the same value, so there is no reason to keep timestampHistory_new.

Pipeline Test

**TO-DO:**

*We need to review how to integrate the pipeline below into our project.*

In [5]:
# Keyword arguments bound to the pipeline steps.
# Columns that are split on ',' and exploded together by `split_multivalued_df`.
split_columns = ['history', 'timestampHistory', 'numberOfClicksHistory', 'timeOnPageHistory', 'scrollPercentageHistory', 'pageVisitsCountHistory']
# Columns removed by the first step, before anything is split.
columns_to_drop = ['userType', 'historySize', 'timestampHistory_new']
# Join keys: `history` on the frame flowing through the pipeline, `page` on `itens`.
df_a_key = 'history'
df_b_key = 'page'

# Steps run in order: drop columns -> split/explode the multi-valued columns -> inner-join with `itens`.
# Each step wraps a plain function in a FunctionTransformer; the frame being transformed is passed
# as the function's first argument and the remaining arguments come from `kw_args`.
pipeline = Pipeline(
    steps=[
        ('drop_columns', FunctionTransformer(drop_columns, kw_args={'columns_to_drop': columns_to_drop})),
        ('split_multivalued_df', FunctionTransformer(split_multivalued_df, kw_args={'split_columns': split_columns})),
        ('merge', FunctionTransformer(merge_dfs, kw_args={'df_b': itens, 'df_a_key': df_a_key, 'df_b_key': df_b_key}))
    ]
)

In [6]:
# Run the three steps on the training set. Every step is a FunctionTransformer and
# `transform` is called directly here; no `fit` call precedes it in this notebook.
new_df = pipeline.transform(treino)
# Preview the first rows of the transformed frame.
new_df.head()

Unnamed: 0,userId,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,page,url,issued,modified,title,body,caption
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,1657146417045,76,20380,50.3,2,c8aab885-433d-4e46-8066-479f40ba7fb2,http://g1.globo.com/sc/santa-catarina/noticia/...,2022-03-19 21:03:21+00:00,2022-03-19 21:03:21+00:00,"Você viu? Musa das Estradas faz vídeo de pé,...",Caminhoneira Aline Füchter em pé em casa\nRepr...,Caminhoneira Aline Füchter ficou em pé em fren...
1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,3325b5a1-979a-4cb3-82b6-63905c9edbe8,1656684240278,7,6049,25.35,1,3325b5a1-979a-4cb3-82b6-63905c9edbe8,http://g1.globo.com/sp/itapetininga-regiao/not...,2022-08-14 20:17:10+00:00,2022-08-14 20:17:11+00:00,Agosto Lilás: Itapetininga promove palestras d...,Itapetininga promove palestras de conscientiza...,"Segunda prefeitura, durante mês de agosto, pal..."
2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,04756569-593e-4133-a95a-83d35d43dbbd,1656678946256,0,311274,67.58,1,04756569-593e-4133-a95a-83d35d43dbbd,http://g1.globo.com/mg/minas-gerais/noticia/20...,2022-08-05 13:36:54+00:00,2022-08-12 16:55:46+00:00,"Estou arrependido. Fui covarde , diz homem su...",Thales foi preso em motel de BH após matar ex\...,"Segundo polícia, Thales Thomás do Vale estava ..."
3,c1e8d644329a78ea1f994292db624c57980b2886cfbc2d...,1f2b9c2f-a2d2-4192-b009-09065da8ec23,1658333312180,8,182696,58.26,1,1f2b9c2f-a2d2-4192-b009-09065da8ec23,http://g1.globo.com/rj/rio-de-janeiro/noticia/...,2022-08-12 09:49:53+00:00,2022-08-12 13:07:13+00:00,"VÍDEO: ‘Me ajuda, por favor! Não! Socorro!’, s...","Me ajuda, por favor! , suplica influenciadora...","Namorado afirmava que, se ela fugisse, ele iri..."
4,e777d1f31d4d955b63d60acc13df336d3903f52ab8f8f4...,bebdeb3e-1699-43e0-a1b8-989f5a6ab679,1658766608801,579,801396,78.74,7,bebdeb3e-1699-43e0-a1b8-989f5a6ab679,http://g1.globo.com/economia/noticia/2022/08/1...,2022-08-12 08:30:35+00:00,2022-08-12 08:30:36+00:00,Como fazer o empréstimo do Auxílio Brasil?,Piso do Auxílio Brasil será de R$ 600 até o fi...,Governo aprovou Medida Provisória que permite ...


In [7]:
# Preview the first rows of the frame loaded from local/validacao.parquet.
validacao.head()

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8 \n 01c...,1660533136590 1660672113513
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9,1660556860253
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,1660561649242
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,b7b90e18-7613-4ca0-a8fc-fd69addfcd85 \n 835...,1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6 \n b8e...,1660548813953 1660572329731 1660594848200


In [8]:
# Preview the first rows of the frame loaded from local/validacao_k.parquet.
validacao_k.head()

Unnamed: 0,userId,history,relevance
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,2
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,01c59ff6-fb82-4258-918f-2910cb2d4c52,1
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,77901133-aee7-4f7b-afc0-652231d76fe9,1
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,1
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,5


In [9]:
# Preview the first rows of the frame loaded from local/teste.parquet.
teste.head()

Unnamed: 0,userId,acessos_futuros
0,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,e67c8cdf-3c55-4399-a864-3c1591225296
1,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,68ed45c8-71b7-4b88-bcde-1695d741aa42
2,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,6f81f339-dc75-4cb1-b12f-d20a344ae64d
3,1505326617b9465f6e13eb1d0d9782bff2af61822a7bc7...,61e07f64-cddf-46f2-b50c-ea0a39c22050
4,9ade38ffe62f55863100f505b9b9be170f7b50c36ca6b5...,esid:conteudo_editorial_g1#materia#https://esp...
