#### Lib Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from typing import List
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler

#### Data Import

In [2]:
treino = pd.read_parquet('local/treino.parquet')
treino.head()

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,"c8aab885-433d-4e46-8066-479f40ba7fb2, 68d2039c...","1657146417045, 1657146605778, 1657146698738","76, 38, 41","20380, 21184, 35438","50.3, 18.18, 16.46","2, 1, 1","1657146417045, 1657146605778, 1657146698738"
1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,"3325b5a1-979a-4cb3-82b6-63905c9edbe8, fe856057...","1656684240278, 1656761266729, 1656761528085, 1...","7, 80, 2, 1, 7, 62, 26, 44, 4, 4, 14, 45, 13, ...","6049, 210489, 8672, 10000, 30000, 123007, 9965...","25.35, 45.66, 35.3, 28.05, 36.53, 47.57, 55.33...","1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1...","1656684240278, 1656761266729, 1656761528085, 1..."
2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,Logged,107,"04756569-593e-4133-a95a-83d35d43dbbd, 29b6b142...","1656678946256, 1656701076495, 1656701882565, 1...","0, 0, 0, 0, 0, 44, 0, 0, 2, 1, 0, 0, 0, 44, 0,...","311274, 140000, 32515, 157018, 118689, 159243,...","67.58, 47.22, 41.52, 63.09, 51.38, 65.11, 71.9...","1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1...","1656678946256, 1656701076495, 1656701882565, 1..."
3,c1e8d644329a78ea1f994292db624c57980b2886cfbc2d...,Non-Logged,56,"1f2b9c2f-a2d2-4192-b009-09065da8ec23, 04756569...","1658333312180, 1658404553818, 1658408449062, 1...","8, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1, 1...","182696, 91925, 30000, 273655, 126409, 42980, 1...","58.26, 72.66, 22.57, 59.89, 40.36, 36.35, 14.7...","1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","1658333312180, 1658404553818, 1658408449062, 1..."
4,e777d1f31d4d955b63d60acc13df336d3903f52ab8f8f4...,Non-Logged,4,"bebdeb3e-1699-43e0-a1b8-989f5a6ab679, f4b484a7...","1658766608801, 1658766608801, 1660084035094, 1...","579, 579, 7, 2","801396, 801396, 10000, 10000","78.74, 78.74, 16.71, 9.34","7, 7, 1, 1","1658766608801, 1658766608801, 1660084035094, 1..."


#### Pipeline Functions

In [3]:
def split_multivalued_df(df: pd.DataFrame, split_columns: list) -> pd.DataFrame:
    df[split_columns] = df[split_columns].apply(lambda col: col.str.split(','))
    expanded_df = df.explode(split_columns, ignore_index=True)
    return expanded_df


def drop_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    dropped_df = df.drop(columns=columns_to_drop, axis=1)
    return dropped_df


def set_time_base_features(df:pd.DataFrame) -> pd.DataFrame:
    decay_rate = 0.0001
    df['timestamp'] = pd.to_datetime(df['timestampHistory'], unit='ms')
    max_date = df['timestamp'].max()
    df['date'] = df['timestamp'].dt.date
    df['timeOnPageHistory'] = pd.to_numeric(df['timeOnPageHistory'])
    df['days_since_click'] = (max_date - df['timestamp']).dt.days
    df['day_of_week'] = df['timestamp'].dt.day_name()
    df['hour_of_day'] = df['timestamp'].dt.hour
    df['year'] = df['timestamp'].dt.year
    df['week_of_year'] = df['timestamp'].dt.isocalendar().week
    df['time_normalized'] = df['days_since_click'] / df['days_since_click'].max()
    df['time_decay_weight'] = np.exp(-decay_rate * df['time_normalized'])
    df['time_on_page_minutes'] = df['timeOnPageHistory'] / 60000
    return df


def calculate_engagement_score(df:pd.DataFrame) -> pd.DataFrame:
    df['numberOfClicksHistory'] = pd.to_numeric(df['numberOfClicksHistory'])
    df['scrollPercentageHistory'] = pd.to_numeric(df['scrollPercentageHistory'])
    df['pageVisitsCountHistory'] = pd.to_numeric(df['pageVisitsCountHistory'])
    df['time_on_page_minutes'] = pd.to_numeric(df['time_on_page_minutes'])
    df['time_decay_weight'] = pd.to_numeric(df['time_decay_weight'])
    df['engagement_score'] = (
        df['numberOfClicksHistory'] * 0.4 +
        df['scrollPercentageHistory'] * 0.2 +
        df['pageVisitsCountHistory'] * 0.2 +
        df['time_on_page_minutes'] * 0.1 +
        df['time_decay_weight'] * 0.1
    )
    return df


def get_engagement_score_with_PCA(df:pd.DataFrame, interaction_features: list) -> pd.DataFrame:
    df['days_since_click'] = (df['days_since_click'] * -1)
    df['time_decay_weight'] = (df['time_decay_weight'] * -1)
    scaler = StandardScaler()
    X = df[interaction_features]
    scaled_X = scaler.fit_transform(X)
    pca = PCA(n_components=1)
    pca_result = pca.fit_transform(scaled_X)
    df['engagement_score_pca'] = pca_result
    return df


def initial_adjusts(df:pd.DataFrame, )->pd.DataFrame:
    df['userId'] = df['userId'].str.replace(r"\s+", "", regex=True)
    df['history'] = df['history'].str.replace(r"\s+", "", regex=True)
    df['timestampHistory'] = df['timestampHistory'].str.replace(r"\s+", "", regex=True)
    return df

# I suggest mantain final adjusted column without duplicate name, only 'history' because the same pipeline could provide users.parquet file too.
# Same observation for column timestampHistory.
def initial_validation_adjusts(df:pd.DataFrame)->pd.DataFrame:
    df['history'] = df['history'].str.strip()
    df['history_adjusted'] = df['history'].str.replace(' \n ', ',', regex=False)
    df['history_adjusted'] = df['history_adjusted'].str.replace(r"\s+", "", regex=True) 
    df['timestampHistory'] = df['timestampHistory'].str.strip()
    df['timestampHistory_adjusted'] = df['timestampHistory'].str.replace(' ', ',', regex=False)
    df['timestampHistory_adjusted'] = df['timestampHistory_adjusted'].str.replace(r"\s+", "", regex=True) 
    return df


def convert_to_timestamp(df: pd.DataFrame, field: str) -> pd.DataFrame:
    df[field] = pd.to_datetime(df[field], unit='ms')
    df['date'] = df[field].dt.date
    return df



#### Pipeline

In [4]:
## User Pipeline Variables
split_columns = [
    'history', 
    'timestampHistory', 
    'numberOfClicksHistory', 
    'timeOnPageHistory', 
    'scrollPercentageHistory', 
    'pageVisitsCountHistory', 
    'timestampHistory_new'
]

columns_to_drop = [
    'userType',
    'historySize', 
    'timestampHistory_new', 
    'timestampHistory', 
    'timeOnPageHistory', 
    'numberOfClicksHistory', 
    'timeOnPageHistory', 
    'scrollPercentageHistory', 
    'pageVisitsCountHistory',
    'week_of_year',
    'timestamp',
    'year',
    'days_since_click',
    'day_of_week',
    'hour_of_day',
    'time_normalized',
    'time_decay_weight',
    'time_on_page_minutes'
]

interaction_features = [
    'numberOfClicksHistory', 
    'timeOnPageHistory', 
    'scrollPercentageHistory', 
    'pageVisitsCountHistory', 
    'time_on_page_minutes', 
    'time_decay_weight', 
    'days_since_click'
]

validation_split_columns = [
    'history_adjusted',
    'timestampHistory_adjusted'
]

validation_drop_columns = [
    'timestampHistory',
    'history'
]


columns_to_drop_cluster = [
    'userType',
    'historySize', 
    'timestampHistory_new', 
    'timestampHistory', 
    'timeOnPageHistory', 
    'numberOfClicksHistory', 
    'timeOnPageHistory', 
    'scrollPercentageHistory', 
    'pageVisitsCountHistory',
    'timestamp',
    'days_since_click',
    'day_of_week',
    'hour_of_day',
    'time_normalized',
    'time_decay_weight',
    'time_on_page_minutes'
]


validacao_time_field = 'timestampHistory_adjusted'
train_time_field = 'timestampHistory'

In [5]:
users_pipeline = Pipeline(
    steps=[
        ('initial_adjustments_in_train', FunctionTransformer(initial_adjusts)),
        ('split_multivalued_df', FunctionTransformer(
            split_multivalued_df, 
            kw_args={'split_columns': split_columns})),
        ('create_time_features', FunctionTransformer(set_time_base_features)),
        ('set_engagement_score_with_pca', FunctionTransformer(
            get_engagement_score_with_PCA, 
            kw_args={'interaction_features': interaction_features})),
        ('set_engagement_score_with_formula', FunctionTransformer(calculate_engagement_score))
    ]
)

validation_pipeline = Pipeline(
    steps=[
        ('initial_adjustments_in_validation', FunctionTransformer(initial_validation_adjusts)),
        ('split_multivalued_columns', FunctionTransformer(
            split_multivalued_df,
            kw_args={'split_columns': validation_split_columns})),
        ('convert_timestamp_field', FunctionTransformer(
            convert_to_timestamp,
            kw_args={'field': validacao_time_field})),
        ('drop_columns', FunctionTransformer(drop_columns, kw_args={'columns_to_drop': validation_drop_columns}))
    ]
)

clustered_recomendation_features = Pipeline(
    steps=[
        ('initial_adjustments_in_train', FunctionTransformer(initial_adjusts)),
        ('split_multivalued_df', FunctionTransformer(
            split_multivalued_df, 
            kw_args={'split_columns': split_columns})),
        ('convert_timestamp_field', FunctionTransformer(
            convert_to_timestamp,
            kw_args={'field': train_time_field})),
        ('create_time_features', FunctionTransformer(set_time_base_features)),
        ('drop_columns', FunctionTransformer(drop_columns, kw_args={'columns_to_drop': columns_to_drop_cluster}))
        
    ]
)

#### Pipeline Run

##### Test Run

In [8]:
df_test_pipeline = treino.head()
users_df = users_pipeline.transform(df_test_pipeline)
users_df.head()

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new,...,days_since_click,day_of_week,hour_of_day,year,week_of_year,time_normalized,time_decay_weight,time_on_page_minutes,engagement_score_pca,engagement_score
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,c8aab885-433d-4e46-8066-479f40ba7fb2,1657146417045,76,20380,50.3,2,1657146417045,...,-39,Wednesday,22,2022,27,0.886364,-0.999911,0.339667,0.035902,40.793976
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,68d2039c-c9aa-456c-ac33-9b2e8677fba7,1657146605778,38,21184,18.18,1,1657146605778,...,-39,Wednesday,22,2022,27,0.886364,-0.999911,0.353067,-1.261832,18.971316
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,13e423ce-1d69-4c78-bc18-e8c8f7271964,1657146698738,41,35438,16.46,1,1657146698738,...,-39,Wednesday,22,2022,27,0.886364,-0.999911,0.590633,-1.129264,19.851072
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,3325b5a1-979a-4cb3-82b6-63905c9edbe8,1656684240278,7,6049,25.35,1,1656684240278,...,-44,Friday,14,2022,26,1.0,-0.9999,0.100817,-1.557986,7.980092
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,fe856057-f97d-419f-ab1c-97c5c3e0719c,1656761266729,80,210489,45.66,1,1656761266729,...,-43,Saturday,11,2022,26,0.977273,-0.999902,3.50815,1.295993,41.582825


##### PROD Run

In [9]:
df_run = treino.copy()
# users_df = users_pipeline.transform(treino)
# users_df.head()

users_df_cluster = users_pipeline.transform(df_run)
users_df_cluster.head()

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new,...,days_since_click,day_of_week,hour_of_day,year,week_of_year,time_normalized,time_decay_weight,time_on_page_minutes,engagement_score_pca,engagement_score
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,c8aab885-433d-4e46-8066-479f40ba7fb2,1657146417045,76,20380,50.3,2,1657146417045,...,-39,Wednesday,22,2022,27,0.886364,-0.999911,0.339667,0.417061,40.793976
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,68d2039c-c9aa-456c-ac33-9b2e8677fba7,1657146605778,38,21184,18.18,1,1657146605778,...,-39,Wednesday,22,2022,27,0.886364,-0.999911,0.353067,-0.344065,18.971316
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,13e423ce-1d69-4c78-bc18-e8c8f7271964,1657146698738,41,35438,16.46,1,1657146698738,...,-39,Wednesday,22,2022,27,0.886364,-0.999911,0.590633,-0.162837,19.851072
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,3325b5a1-979a-4cb3-82b6-63905c9edbe8,1656684240278,7,6049,25.35,1,1656684240278,...,-44,Friday,14,2022,26,1.0,-0.9999,0.100817,-0.835794,7.980092
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,fe856057-f97d-419f-ab1c-97c5c3e0719c,1656761266729,80,210489,45.66,1,1656761266729,...,-43,Saturday,11,2022,26,0.977273,-0.999902,3.50815,2.108754,41.582825


In [10]:
out_filepath = 'local/user_full_features.parquet'
users_df_cluster.to_parquet(out_filepath, index=False)

##### Users Validation

In [6]:
validacao = pd.read_parquet('local/validacao.parquet')
validacao.head()

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8 \n 01c...,1660533136590 1660672113513
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9,1660556860253
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,1660561649242
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,b7b90e18-7613-4ca0-a8fc-fd69addfcd85 \n 835...,1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6 \n b8e...,1660548813953 1660572329731 1660594848200


In [7]:
validacao_df = validation_pipeline.transform(validacao)
validacao_df.head()

Unnamed: 0,userId,userType,history_adjusted,timestampHistory_adjusted,date
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,2022-08-15 03:12:16.590,2022-08-15
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52,2022-08-16 17:48:33.513,2022-08-16
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9,2022-08-15 09:47:40.253,2022-08-15
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,2022-08-15 11:07:29.242,2022-08-15
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,2022-08-15 03:23:50.245,2022-08-15


In [9]:
validacao_df['date'].unique()

array([datetime.date(2022, 8, 15), datetime.date(2022, 8, 16),
       datetime.date(2022, 8, 17)], dtype=object)

In [7]:
out_filepath = 'local/validtion_cleanned.parquet'
validacao_df.to_parquet(out_filepath, index=False)

In [11]:
itens = pd.read_parquet('local/itens_text_db_scan.parquet')
itens.head()

Unnamed: 0,page,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title,classes
0,13db0ab1-eea2-4603-84c4-f40a876c7400,Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053,NEGATIVE,0.60757,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"[0.028708808, 0.07910229, -0.04915501, 0.02104...",0
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,As expressões santarenas não significam apenas...,NEUTRAL,0.450133,NEUTRAL,0.68394,Linguajar dos santarenos é diferenciado e chei...,"[0.05066423, 0.053637918, -0.06246449, -0.0591...",1
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,Ex-primeiro-ministro foi atingido por tiros de...,NEGATIVE,0.840971,NEGATIVE,0.845363,Ex-premiê Shinzo Abe morre após ser baleado no...,"[-0.04080393, 0.08898491, -0.041833814, -0.010...",2
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,Ministro defendeu que posse indígena é diferen...,NEGATIVE,0.721313,NEUTRAL,0.766454,"Relator no STF, Fachin vota contra marco tempo...","[-0.02016749, 0.13216536, 0.03784579, -0.01504...",0
4,9dff71eb-b681-40c7-ac8d-68017ac36675,"Pelo marco temporal, índios só podem reivindic...",NEGATIVE,0.844091,NEUTRAL,0.708964,"Após 2 votos, pedido de vista suspende julgame...","[0.031508345, 0.10205304, 0.03989872, 0.017708...",0


In [12]:
# strictly necessary before merge
itens['page'] = itens['page'].str.replace(r"\s+", "", regex=True)

In [13]:
merged_df = pd.merge(validacao_df, itens, how='left', left_on='history_adjusted', right_on='page')
merged_df.head()

Unnamed: 0,userId,userType,history_adjusted,timestampHistory_adjusted,page,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title,classes
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,2022-08-15 03:12:16.590,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,"\nApesar do susto, Bianca Macanoni contou ao g...",NEGATIVE,0.81462,NEGATIVE,0.461391,VÍDEO: vestido de aniversariante pega fogo em ...,"[-0.009848203, 0.042258054, 0.043710895, -0.06...",0
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52,2022-08-16 17:48:33.513,01c59ff6-fb82-4258-918f-2910cb2d4c52,Brasil ainda está construindo rede e aparatos ...,NEGATIVE,0.776603,NEUTRAL,0.690279,"Varíola dos macacos: 2,9 mil casos são investi...","[0.08205641, 0.005297369, -0.037946, 0.0132754...",5
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9,2022-08-15 09:47:40.253,77901133-aee7-4f7b-afc0-652231d76fe9,Sedema vai monitorar cumprimento de acordo e o...,NEGATIVE,0.80161,NEUTRAL,0.583618,Após reclamações de moradores e reunião com MP...,"[0.03565619, -0.017987702, -0.09888334, -0.039...",0
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,2022-08-15 11:07:29.242,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,Publicação ensina como selecionar as aves e fa...,NEUTRAL,0.590334,NEUTRAL,0.723503,Ovos caipiras: cartilha mostra boas práticas d...,"[0.010637219, 0.0516447, -0.054130264, 0.02671...",0
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,2022-08-15 03:23:50.245,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,"Segundo a Polícia Civil, policiais militares a...",NEGATIVE,0.838438,NEGATIVE,0.839168,Jovem é encontrado morto após ser filmado send...,"[-0.021044988, 0.007185048, -0.04042115, -0.05...",0


In [23]:
print(merged_df['userId'][2])

d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdeee870d49b897e1e99cd


In [28]:
user = 'd0afad7ea843d86597d822f0df1d39d31a3fea7c39fdeee870d49b897e1e99cd'
result = merged_df[['userId', 'page']][merged_df['userId']==user]

In [29]:
print(result['page'].head())

2    77901133-aee7-4f7b-afc0-652231d76fe9
Name: page, dtype: object


In [None]:
print(178868 - 112184)

In [None]:
merged_df.tail()

In [None]:
print(merged_df['userId'][178863])

In [None]:
users_df['history'] = users_df['history'].str.strip()
users_df.head()

In [None]:
merged_train_df = pd.merge(users_df, itens, how='left', left_on='history', right_on='page')
merged_train_df.head()

In [None]:
grouped_train = merged_train_df.groupby(['userId', 'classes'])['engagement_score'].mean().reset_index()
grouped_train.head()

In [None]:
filtered_df = grouped_train[grouped_train['userId'] == 'aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce51f7f66044a927bdbaa']
filtered_df.head()


In [None]:
filtered_validation_df = merged_df[['userId', 'page', 'classes']][merged_df['userId'] == 'aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce51f7f66044a927bdbaa']
filtered_validation_df