#### Lib Imports

In [14]:
import pandas as pd
import numpy as np
from typing import List
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler

#### Data Import

In [2]:
treino = pd.read_parquet('local/treino.parquet')

#### Pipeline Functions

In [15]:
def split_multivalued_df(df: pd.DataFrame, split_columns: list) -> pd.DataFrame:
    df[split_columns] = df[split_columns].apply(lambda col: col.str.split(','))
    expanded_df = df.explode(split_columns, ignore_index=True)
    return expanded_df


def drop_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    dropped_df = df.drop(columns=columns_to_drop, axis=1)
    return dropped_df



def set_time_base_features(df:pd.DataFrame) -> pd.DataFrame:
    decay_rate = 0.0001
    df['timestamp'] = pd.to_datetime(df['timestampHistory'], unit='ms')
    max_date = df['timestamp'].max()
    df['timeOnPageHistory'] = pd.to_numeric(df['timeOnPageHistory'])
    df['days_since_click'] = (max_date - df['timestamp']).dt.days
    df['day_of_week'] = df['timestamp'].dt.day_name()
    df['hour_of_day'] = df['timestamp'].dt.hour
    df['time_normalized'] = df['days_since_click'] / df['days_since_click'].max()
    df['time_decay_weight'] = np.exp(-decay_rate * df['time_normalized'])
    df['time_on_page_minutes'] = df['timeOnPageHistory'] / 60000
    return df


def calculate_engagement_score(df:pd.DataFrame) -> pd.DataFrame:
    df['engagement_score'] = (
        df['numberOfClicksHistory'] * 0.4 +
        df['scrollPercentageHistory'] * 0.2 +
        df['pageVisitsCountHistory'] * 0.2 +
        df['time_on_page_minutes'] * 0.1 +
        df['time_decay_weight'] * 0.1
    )
    return df

def get_engagement_score_with_PCA(df:pd.DataFrame, interaction_features: list) -> pd.DataFrame:
    df['days_since_click'] = (df['days_since_click'] * -1)
    df['time_decay_weight'] = (df['time_decay_weight'] * -1)
    scaler = StandardScaler()
    X = df[interaction_features]
    scaled_X = scaler.fit_transform(X)
    pca = PCA(n_components=1)
    pca_result = pca.fit_transform(scaled_X)
    df['engagement_score_pca'] = pca_result
    return df


#### Pipeline

In [22]:
## User Pipeline Variables
split_columns = [
    'history', 
    'timestampHistory', 
    'numberOfClicksHistory', 
    'timeOnPageHistory', 
    'scrollPercentageHistory', 
    'pageVisitsCountHistory', 
    'timestampHistory_new'
]

columns_to_drop = [
    'userType', 
    'historySize', 
    'timestampHistory_new', 
    'timestampHistory', 
    'timeOnPageHistory', 
    'numberOfClicksHistory', 
    'timeOnPageHistory', 
    'scrollPercentageHistory', 
    'pageVisitsCountHistory',
    'timestamp',
    'days_since_click',
    'day_of_week',
    'hour_of_day',
    'time_normalized',
    'time_decay_weight',
    'time_on_page_minutes'
]

interaction_features = [
    'numberOfClicksHistory', 
    'timeOnPageHistory', 
    'scrollPercentageHistory', 
    'pageVisitsCountHistory', 
    'time_on_page_minutes', 
    'time_decay_weight', 
    'days_since_click'
]

In [23]:
users_pipeline = Pipeline(
    steps=[
        ('split_multivalued_df', FunctionTransformer(
            split_multivalued_df, 
            kw_args={'split_columns': split_columns})),
        ('create_time_features', FunctionTransformer(set_time_base_features)),
        ('set_engagement_score_with_pca', FunctionTransformer(
            get_engagement_score_with_PCA, 
            kw_args={'interaction_features': interaction_features})),
        ('drop_columns', FunctionTransformer(drop_columns, kw_args={'columns_to_drop': columns_to_drop}))
    ]
)

#### Pipeline Run

##### Test Run

In [24]:
df_test_pipeline = treino.head()
users_df = users_pipeline.transform(df_test_pipeline)
users_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[split_columns] = df[split_columns].apply(lambda col: col.str.split(','))
  df['timestamp'] = pd.to_datetime(df['timestampHistory'], unit='ms')


Unnamed: 0,userId,history,engagement_score_pca
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,0.035902
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,-1.261832
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,13e423ce-1d69-4c78-bc18-e8c8f7271964,-1.129264
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,3325b5a1-979a-4cb3-82b6-63905c9edbe8,-1.557986
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,1.295993


##### PROD Run

In [25]:
users_df = users_pipeline.transform(treino)
users_df.head()

  df['timestamp'] = pd.to_datetime(df['timestampHistory'], unit='ms')


Unnamed: 0,userId,history,engagement_score_pca
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,0.417061
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,-0.344065
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,13e423ce-1d69-4c78-bc18-e8c8f7271964,-0.162837
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,3325b5a1-979a-4cb3-82b6-63905c9edbe8,-0.835794
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,2.108754
