Notebook Created to test the data pipeline

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import List
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm


For performance reasons you may prefer to download the parquet and add in your local folder to work.

If you prefer doing this, please create a folder called local inside 'challenge_files'

In [126]:
# itens = pd.read_parquet('local/itens.parquet')
# # teste = pd.read_parquet('local/teste.parquet')
# treino = pd.read_parquet('local/treino.parquet')
# # validacao = pd.read_parquet('local/validacao.parquet')
# # validacao_k = pd.read_parquet('local/validacao_k.parquet')

data = pd.read_parquet('local/user_colab_filter.parquet')



In [127]:
data.head()

Unnamed: 0,userId,history,engagement_score_pca,engagement_score
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,0.417061,40.793976
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,-0.344065,18.971316
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,13e423ce-1d69-4c78-bc18-e8c8f7271964,-0.162837,19.851072
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,3325b5a1-979a-4cb3-82b6-63905c9edbe8,-0.835794,7.980092
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,2.108754,41.582825


### Feature Store Functions

In [4]:

# Add the below functions to python (maybe utils?)

def split_multivalued_df(df: pd.DataFrame, split_columns: list) -> pd.DataFrame:
    df[split_columns] = df[split_columns].apply(lambda col: col.str.split(','))
    expanded_df = df.explode(split_columns, ignore_index=True)
    return expanded_df


def drop_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    dropped_df = df.drop(columns=columns_to_drop, axis=1)
    return dropped_df


def merge_dfs(
        df_a: pd.DataFrame,
        df_b: pd.DataFrame,
        df_a_key: str,
        df_b_key: str) -> pd.DataFrame:
    merged_df = pd.merge(df_a, df_b, left_on=df_a_key, right_on=df_b_key, how='inner')
    return merged_df


def adjust_date(df:pd.DataFrame, column_to_format:str):
    df[column_to_format] = pd.to_datetime(df[column_to_format]).dt.tz_localize(None)
    df['date_reference'] = pd.to_datetime(df[column_to_format], unit='ms')
    return df.sort_values(by=column_to_format)


def adjust_number_columns(df:pd.DataFrame, columns:List[str]):
    for column in columns:
        df[column] = pd.to_numeric(df[column])
    return df


def set_time_base_features(df:pd.DataFrame) -> pd.DataFrame:
    decay_rate = 0.0001
    df['timestamp'] = pd.to_datetime(df['timestampHistory'], unit='ms')
    max_date = df['timestamp'].max()
    df['timeOnPageHistory'] = pd.to_numeric(df['timeOnPageHistory'])
    df['days_since_click'] = (max_date - df['timestamp']).dt.days
    df['day_of_week'] = df['timestamp'].dt.day_name()
    df['hour_of_day'] = df['timestamp'].dt.hour
    df['time_normalized'] = df['days_since_click'] / df['days_since_click'].max()
    df['time_decay_weight'] = np.exp(-decay_rate * df['time_normalized'])
    df['time_on_page_minutes'] = df['timeOnPageHistory'] / 60000
    return df

def create_crud_categories(df:pd.DataFrame) -> pd.DataFrame:
    def extract_substring(url):
        try:
            start_index = url.index(".com/") + len(".com/")
            end_index = url.index("/20")
            return url[start_index:end_index]
        except ValueError:
            return ""  # Handle cases where ".com/" or "/20" is not found
    
    df['crud_categories'] = df['url'].apply(extract_substring)
    return df


# Function split crud categories
def split_crud_categories(df:pd.DataFrame) -> pd.DataFrame:
    df['grouped_categories'] = df['crud_categories'].str.split('/')
    return df


def extract_categories(df):
    # Add 'category' and 'sub_category' columns
    df['grouped_categories'] = df['grouped_categories'].to_list()
    df["category"] = df["grouped_categories"].apply(lambda x: x[-2] if len(x) > 1 else (x[0] if x else None))  # Second-to-last element
    df["sub_category"] = df["grouped_categories"].apply(lambda x: x[-1] if x else None)  # Last element
    return df


def calculate_engagement_score(df:pd.DataFrame) -> pd.DataFrame:
    df['engagement_score'] = (
        df['numberOfClicksHistory'] * 0.4 +
        df['scrollPercentageHistory'] * 0.2 +
        df['pageVisitsCountHistory'] * 0.2 +
        df['time_on_page_minutes'] * 0.1 +
        df['time_decay_weight'] * 0.1
    )
    return df


### Pipelines

**TO-DO:**

*We need to review how to add the below pipelines in our project*

In [1]:
## User Pipeline Variables
split_columns = ['history', 'timestampHistory', 'numberOfClicksHistory', 'timeOnPageHistory', 'scrollPercentageHistory', 'pageVisitsCountHistory']
columns_to_drop = ['userType', 'historySize', 'timestampHistory_new', 'timestampHistory', 'timeOnPageHistory']

#### Data Pipeline

In [6]:
users_pipeline = Pipeline(
    steps=[
        ('split_multivalued_df', FunctionTransformer(split_multivalued_df, kw_args={'split_columns': split_columns})),
        ('create_time_features', FunctionTransformer(set_time_base_features)),
        ('drop_columns', FunctionTransformer(drop_columns, kw_args={'columns_to_drop': columns_to_drop}))
    ]
)


#### Tests

In [9]:
df_test_pipeline = itens.head()

#### Pipeline Runs

In [6]:
users_df = users_pipeline.transform(treino)
users_df.head()

  df['timestamp'] = pd.to_datetime(df['timestampHistory'], unit='ms')


Unnamed: 0,userId,history,numberOfClicksHistory,scrollPercentageHistory,pageVisitsCountHistory,timestamp,days_since_click,day_of_week,hour_of_day,time_normalized,time_decay_weight,time_on_page_minutes
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,76,50.3,2,2022-07-06 22:26:57.045,177,Wednesday,22,0.972527,0.999903,0.339667
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,38,18.18,1,2022-07-06 22:30:05.778,177,Wednesday,22,0.972527,0.999903,0.353067
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,13e423ce-1d69-4c78-bc18-e8c8f7271964,41,16.46,1,2022-07-06 22:31:38.738,177,Wednesday,22,0.972527,0.999903,0.590633
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,3325b5a1-979a-4cb3-82b6-63905c9edbe8,7,25.35,1,2022-07-01 14:04:00.278,182,Friday,14,1.0,0.9999,0.100817
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,80,45.66,1,2022-07-02 11:27:46.729,181,Saturday,11,0.994505,0.999901,3.50815


#### ML Pipeline

In [10]:
df_a_key = 'history'
df_b_key = 'page'
als_drop_columns = ['history',
                    'numberOfClicksHistory',
                    'scrollPercentageHistory',
                    'pageVisitsCountHistory',
                    'time_on_page_minutes',
                    'time_decay_weight',
                    'timestamp',
                    'days_since_click',
                    'day_of_week',
                    'hour_of_day',
                    'time_normalized',
                    'time_decay_weight',
                    'time_on_page_minutes',
                    'issued',
                    'cleaned_title',
                    'embbed_title'
                    ]

fields_to_format = ['numberOfClicksHistory', 
                    'scrollPercentageHistory', 
                    'pageVisitsCountHistory', 
                    'time_on_page_minutes', 
                    'time_decay_weight']

In [11]:
recommender_pipeline = Pipeline(
    steps=[
        ('merge', FunctionTransformer(
            merge_dfs, kw_args={'df_b': itens_df, 'df_a_key': df_a_key, 'df_b_key': df_b_key})),
        ('convert_fields_to_numeric', FunctionTransformer(adjust_number_columns, kw_args={'columns': fields_to_format})),
        ('calculate_engagement_score', FunctionTransformer(calculate_engagement_score)),   
        ('drop_unused_columns', FunctionTransformer(
            drop_columns, kw_args={'columns_to_drop': als_drop_columns}
        ))
    ]
)

In [12]:
als_df = recommender_pipeline.transform(users_df)
als_df.head()

Unnamed: 0,userId,page,category,sub_category,engagement_score
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,santa-catarina,noticia,40.993957
1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,3325b5a1-979a-4cb3-82b6-63905c9edbe8,itapetininga-regiao,noticia,8.180072
2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,04756569-593e-4133-a95a-83d35d43dbbd,minas-gerais,noticia,14.33478
3,c1e8d644329a78ea1f994292db624c57980b2886cfbc2d...,1f2b9c2f-a2d2-4192-b009-09065da8ec23,rio-de-janeiro,noticia,15.456484
4,e777d1f31d4d955b63d60acc13df336d3903f52ab8f8f4...,bebdeb3e-1699-43e0-a1b8-989f5a6ab679,economia,noticia,250.183651


In [51]:
als_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 577942 entries, 0 to 577941
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   userId            577942 non-null  object 
 1   page              577942 non-null  object 
 2   category          577942 non-null  object 
 3   sub_category      577942 non-null  object 
 4   engagement_score  577942 non-null  float64
dtypes: float64(1), object(4)
memory usage: 22.0+ MB


## Collaborative Filtering

### KNN Test

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

In [6]:
knn_df = users_df.copy()
knn_drop = ['timestampHistory', 'numberOfClicksHistory','timeOnPageHistory','pageVisitsCountHistory', 'page', 'url', 'modified', 'body', 'caption']
date_to_format = 'issued'
number_to_format = 'scrollPercentageHistory'
knn_df = drop_columns(knn_df, knn_drop)
knn_df = adjust_date(knn_df, date_to_format)
knn_df = adjust_number_columns(knn_df, number_to_format)
knn_df.head()

Unnamed: 0,userId,history,scrollPercentageHistory,issued,title
538986,5bbe5d34f92fbbc8854cd5468ecc0db28a25e54388bec3...,esid:conteudo_editorial_g1#materia#https://esp...,24.48,2015-11-17 08:04:41,Conexões da Lava Jato
29796,d81b7e5879cbf74a447aa1e20f584d4eba30604ede2365...,esid:conteudo_editorial_g1#materia#http://espe...,37.38,2015-12-01 23:04:41,Calculadora de Combustível
341736,335fe8c2c4282d3ca8c33f01b412374f1d786b2703b385...,esid:conteudo_editorial_g1#materia#http://espe...,82.23,2015-12-01 23:04:41,Calculadora de Combustível
208894,9225d6cdd374256d25ee877b6c410b82446a73866074b8...,f1640c36-2c34-4cf4-bd5e-bdc73363b15d,29.32,2015-12-04 18:17:03,"Australiano flagra lagarto de 1,5 m escalando ..."
378396,55ef18f56791794a0e52eb6e4db69e62be29fa2884288e...,f1640c36-2c34-4cf4-bd5e-bdc73363b15d,54.27,2015-12-04 18:17:03,"Australiano flagra lagarto de 1,5 m escalando ..."


In [7]:
# Create a sparse matrix
row = knn_df['userId'].astype('category').cat.codes
col = knn_df['title'].astype('category').cat.codes
data = knn_df['scrollPercentageHistory'].astype('float32')
sparse_matrix = csr_matrix((data, (row, col)))
print(sparse_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 577942 stored elements and shape (577942, 78489)>
  Coords	Values
  (0, 4162)	46.22999954223633
  (1, 1130)	15.59000015258789
  (2, 33749)	10.479999542236328
  (3, 73553)	73.83000183105469
  (4, 3317)	23.459999084472656
  (5, 4298)	56.08000183105469
  (6, 75419)	61.41999816894531
  (7, 10454)	25.239999771118164
  (8, 15740)	53.369998931884766
  (9, 43863)	57.04999923706055
  (10, 16069)	26.459999084472656
  (11, 26714)	22.219999313354492
  (12, 47315)	33.209999084472656
  (13, 66299)	46.58000183105469
  (14, 53084)	40.630001068115234
  (15, 18313)	35.369998931884766
  (16, 77601)	38.41999816894531
  (17, 63154)	45.709999084472656
  (18, 13121)	32.45000076293945
  (19, 5853)	42.25
  (20, 32988)	16.8799991607666
  (21, 60373)	25.229999542236328
  (22, 56738)	16.469999313354492
  (23, 51319)	57.310001373291016
  (24, 18074)	13.829999923706055
  :	:
  (577917, 69496)	23.920000076293945
  (577918, 61434)	24.139999389648438
  (577

In [9]:
# Remove rows with no interactions
sparse_matrix = sparse_matrix[sparse_matrix.getnnz(axis=1) > 0]

# Normalize the sparse matrix row-wise
sparse_matrix_normalized = normalize(sparse_matrix, norm='l2', axis=1)

In [10]:
# Train a KNN model
model = NearestNeighbors(metric='cosine')
model.fit(sparse_matrix_normalized)

In [11]:
# Find neighbors for a user (e.g., the first user in the matrix)
distances, indices = model.kneighbors(sparse_matrix[0], n_neighbors=3000)

print("Distances:", distances)
print("Indices of neighbors:", indices)

Distances: [[0. 0. 0. ... 1. 1. 1.]]
Indices of neighbors: [[  5948   4616   3962 ... 577920 577905 577913]]


In [12]:
# Filter indices where distances are neither 0 nor 1
filtered_indices = [indices[0][i] for i in range(len(distances[0])) if distances[0][i] != 0 and distances[0][i] != 1]

print("Filtered User IDs (distances not 0 or 1):", filtered_indices)

Filtered User IDs (distances not 0 or 1): []


In [13]:
# Get the mapping from matrix index to userId
user_id_mapping = pd.Series(new_df['userId'].astype('category').cat.categories)

# Get the userId for index 500 and 577915
user_id_500 = user_id_mapping.iloc[500]
user_id_91104 = user_id_mapping.iloc[91104]

print("User ID for index 500:", user_id_500)
print("User ID for index 91104:", user_id_91104)


User ID for index 500: 003a7e970c4fa0a05483723b3139a68b86367eef3d03154fedb610119d41817b
User ID for index 91104: 28607d5a04b09f40921c1d6ee4e14e276f462a086a2c1e3b1a3e059daa16f403


In [14]:
# Get news consumed by each user
news_500 = knn_df[knn_df['userId'] == user_id_500]
news_91104 = knn_df[knn_df['userId'] == user_id_91104]

print("News consumed by User 500:\n", news_500[['title', 'scrollPercentageHistory']])
print("News consumed by User 91104:\n", news_91104[['title', 'scrollPercentageHistory']])


News consumed by User 500:
                                                     title  \
339684  Após Covid-19, professora de educação física l...   

       scrollPercentageHistory  
339684                   59.18  
News consumed by User 91104:
                                                     title  \
336454  Após Covid-19, professora de educação física l...   

       scrollPercentageHistory  
336454                   24.04  


In [15]:
# Find common news titles
common_titles = set(news_500['title']).intersection(set(news_91104['title']))

# Compare scroll percentage for common news
comparison = news_500[news_500['title'].isin(common_titles)].merge(
    news_91104[news_91104['title'].isin(common_titles)],
    on='title',
    suffixes=('_500', '_577915')
)

print("Comparison of common news:\n", comparison[['title', 'scrollPercentageHistory_500', 'scrollPercentageHistory_577915']])


Comparison of common news:
                                                title  \
0  Após Covid-19, professora de educação física l...   

  scrollPercentageHistory_500 scrollPercentageHistory_577915  
0                       59.18                          24.04  


In [16]:
# Get news consumed by each user
news_500 = knn_df[knn_df['userId'] == user_id_500]
news_91104 = knn_df[knn_df['userId'] == user_id_91104]

# Get the titles consumed by both users
titles_500 = set(news_500['title'])
titles_91104 = set(news_91104['title'])

print("Titles consumed by User 500:", titles_500)
print("Titles consumed by User 91104:", titles_91104)


Titles consumed by User 500: {'Após Covid-19, professora de educação física luta para recuperar movimento de perna:  Como se não tivesse esta parte '}
Titles consumed by User 91104: {'Após Covid-19, professora de educação física luta para recuperar movimento de perna:  Como se não tivesse esta parte '}


In [17]:
# Find titles consumed by user 577915 but not user 500
unique_titles_91104 = titles_91104 - titles_500

# Get the news rows for these unique titles
unique_titles_91104 = news_91104[news_91104['title'].isin(unique_titles_91104)]

print("News consumed by User 577915 but not User 500:\n", unique_titles_91104[['title', 'scrollPercentageHistory']])


News consumed by User 577915 but not User 500:
 Empty DataFrame
Columns: [title, scrollPercentageHistory]
Index: []


### ALS Test

#### Recommender by Category

In [13]:
category_data = als_df.groupby(['userId', 'category'])['engagement_score'].sum().reset_index()
category_data.head()

Unnamed: 0,userId,category,engagement_score
0,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,bahia,15.563662
1,00004868f064a8147619ca4d75eac9ccabfbe1169840e6...,santa-catarina,3.534658
2,00007a4e5949a3dba7c977503c53e0873643fe17d0802a...,,2.84599
3,000087b05ccb95dec5d55e968764285c5403747fc35da2...,turismo-e-viagem,15.564465
4,00011b1ced626112372206634e0e9b5ccb432da916e83f...,,5.015443


In [14]:
print(category_data['userId'][50])

0007ebffed6c21f1e0ddaafaa9555facd14eeeda57a6e3ffdde8fe35e0fa7ece


In [15]:
# Create mappings for user and category IDs
user_mapping = {user: i for i, user in enumerate(category_data['userId'].unique())}
category_mapping = {cat: i for i, cat in enumerate(category_data['category'].unique())}

# Map IDs
category_data['user_id_mapped'] = category_data['userId'].map(user_mapping)
category_data['category_id_mapped'] = category_data['category'].map(category_mapping)

# Create sparse matrix
user_category_matrix = csr_matrix((
    category_data['engagement_score'],
    (category_data['user_id_mapped'], category_data['category_id_mapped'])
))

In [16]:
category_model = AlternatingLeastSquares(factors=10, regularization=0.1, iterations=20)
category_model.fit(user_category_matrix.T)  # Transpose for the `implicit` library

  check_blas_config()
100%|██████████| 20/20 [00:09<00:00,  2.04it/s]


In [17]:
user_index = user_mapping['0007ebffed6c21f1e0ddaafaa9555facd14eeeda57a6e3ffdde8fe35e0fa7ece']
#user_id = 1  # Example user
recommendations = category_model.recommend(user_index, user_category_matrix[user_index], N=5)


In [18]:
for category_id, score in zip(recommendations[0], recommendations[1]):
    category_name = category_mapping.get(category_id, "Unknown Category")  # Get category name or fallback to "Unknown"
    print(f"Category: {category_id}")
    print(f"Category Name: {category_data['category'][category_id]}")
    print(f"Score: {score}")

Category: 16914
Category Name: vacina
Score: 2.548360824584961
Category: 551677
Category Name: caminhos-do-campo
Score: 1.9821887016296387
Category: 228013
Category Name: vacinas
Score: 1.915966510772705
Category: 403641
Category Name: transito
Score: 1.850183367729187
Category: 138932
Category Name: vacinas
Score: 1.6494094133377075


#### Recommender by Sub-Category

In [19]:
subcategory_data = als_df.groupby(['userId', 'sub_category'])['engagement_score'].sum().reset_index()

In [20]:
# Create mappings for user and category IDs
user_mapping = {user: i for i, user in enumerate(subcategory_data['userId'].unique())}
subcategory_mapping = {cat: i for i, cat in enumerate(subcategory_data['sub_category'].unique())}

# Map IDs
subcategory_data['user_id_mapped'] = subcategory_data['userId'].map(user_mapping)
subcategory_data['subcategory_id_mapped'] = subcategory_data['sub_category'].map(subcategory_mapping)

# Create sparse matrix
user_subcategory_matrix = csr_matrix((
    subcategory_data['engagement_score'],
    (subcategory_data['user_id_mapped'], subcategory_data['subcategory_id_mapped'])
))

In [21]:
subcategory_model = AlternatingLeastSquares(factors=10, regularization=0.1, iterations=20)
subcategory_model.fit(user_category_matrix.T)  # Transpose for the `implicit` library

100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


In [22]:
user_index = user_mapping['0007ebffed6c21f1e0ddaafaa9555facd14eeeda57a6e3ffdde8fe35e0fa7ece']
#user_id = 1  # Example user
sub_category_recommendations = subcategory_model.recommend(user_index, user_subcategory_matrix[user_index], N=5)

In [23]:
for subcategory_id, score in zip(sub_category_recommendations[0], sub_category_recommendations[1]):
    # sub_category_name = category_mapping.get(category_id, "Unknown Category")  # Get category name or fallback to "Unknown"
    print(f"Category: {subcategory_id}")
    print(f"Category Name: {subcategory_data['sub_category'][subcategory_id]}")
    print(f"Score: {score}")

Category: 228013
Category Name: noticia
Score: 2.0174736976623535
Category: 138932
Category Name: noticia
Score: 1.7368409633636475
Category: 439291
Category Name: post
Score: 1.6716045141220093
Category: 554196
Category Name: noticia
Score: 1.548399806022644
Category: 481136
Category Name: noticia
Score: 1.5142359733581543


#### News Recommender

In [143]:
data_smaller = data.head(200000)
train, test = train_test_split(data, test_size=0.01)
print(f'Train lengh = {len(train)}')
print(f'Test lengh = {len(test)}')

Train lengh = 8042711
Test lengh = 81240


In [144]:
user_map = {u: i for i, u in enumerate(data['userId'])}
item_map = {p: i for i, p in enumerate(data['history'])}
user_index_to_id = {v: k for k, v in user_map.items()}
item_index_to_id = {v: k for k, v in item_map.items()} 

##### Train

In [145]:
train['userId_idx'] = train['userId'].map(user_map)
train['history_idx'] = train['history'].map(item_map)
train_matrix = coo_matrix((
    train['engagement_score'],
    (train['userId_idx'], train['history_idx']),
)).tocsr()

##### Test

In [146]:
test['userId_idx'] = test['userId'].map(user_map)
test['history_idx'] = test['history'].map(item_map)
test.head()


Unnamed: 0,userId,history,engagement_score_pca,engagement_score,userId_idx,history_idx
5235276,0ad8012f64119ac212d11161efde953469473e11787f5e...,e650aae9-2b05-4564-9ce7-108a31c4de20,-0.985446,3.412672,5235681,8118172
4728645,d3d60a43a97d39c62433030bf81591a1333b88e2cac3dd...,945fe586-7082-4cb4-a915-c1023f56f7ae,-0.657178,3.348673,4728664,8119992
2206604,9414e77e9d59bb6ed6fe2b2760d1d5b49b5bd76b8da299...,d2593c3d-2347-40d9-948c-b6065e8459a9,-0.028997,32.061789,2206608,8123740
8047802,fe56cad55f42166e46b27d5a03de2a8847786920b74703...,e3ccb6c1-0a55-4cd3-b818-f98be256f193,3.339475,38.440877,8047811,8121653
6004793,306ef2b027e436c1f798f324e407296f1300e0eef0144b...,09c77143-7858-4d15-9dcd-b1c4b006024b,-0.872235,7.515337,6004793,7945677


In [122]:
users = test['userId'].unique()
users

array(['0985af1b52a6985dea793566f245818797fd801daf1e654f9797c2e94fb2a5c2',
       'b1249bbcde58b1758031e3a096d11be56a3b102ec88803d572e125b8364e0f4d',
       'fa2c28541390c4c0577b0e64cb98612299cc355011d24f864128f3691ab81807',
       ...,
       '978761548b120dede09209a8ed81fc68007118bda3006916fd991976731c1e40',
       'fd566fdc9dea6e4c5a5465c588dec68937a97d39ca052ea280b68b8b7f909b80',
       '9533dc04babb33e598fe1b6aaa35c9c9ceda25e71213a8199a04d424c14a7228'],
      dtype=object)

##### Model

In [140]:
news_model = AlternatingLeastSquares(factors=100, regularization=0.01, iterations=20)
news_model.fit(train_matrix)

100%|██████████| 20/20 [01:17<00:00,  3.86s/it]


In [70]:
# lembrar de olhar pois a logica parece fazer sentido
# test_interactions = test_matrix[62].toarray().flatten()
# relevant_items = set(np.where(test_interactions > 0)[0])
# relevant_items

{np.int64(34), np.int64(42), np.int64(98725)}

In [44]:
user_index = user_map['cd7c50c6019a830dfc4e083cdad118694cf22483a4a67c02c49d5ae7968796d5']
news_recommendations = news_model.recommend(user_index, train_matrix[user_index], N=5)

In [45]:
print(type(news_recommendations[0]))
#print(news_recommendations)
pred = [user_index, news_recommendations]
print(pred)

<class 'numpy.ndarray'>
[79763, (array([99869, 99614, 99820, 99647, 99496], dtype=int32), array([0.02885162, 0.01433471, 0.01398447, 0.01290909, 0.01269918],
      dtype=float32))]


In [46]:
print(news_recommendations[0])

[99869 99614 99820 99647 99496]


In [47]:
# user_news = (test[(test['userId_idx'] == user_index) & (test['engagement_score_pca'] > 0)]['history_idx']).to_numpy()
top_k_user_news = test[(test['userId_idx'] == user_index) & (test['engagement_score_pca'] > 0)].nlargest(5, 'engagement_score_pca')['history_idx'].to_numpy()
print(top_k_user_news)

[95033 95893 79482 95932 99463]


In [48]:
precision = len(news_recommendations[0] & top_k_user_news) / 5
print(precision)

1.0


In [50]:
users_idx = test['userId_idx'].unique()
len(users_idx)

3184

In [154]:
def calculate_top_k_precision(model, test_df, top_k, train_matrix):
    users_idx = test_df['userId_idx'].unique()
    precisions = []
    for user in tqdm(users_idx, desc="Processing Test", total=len(users_idx)):
        item, score = model.recommend(user, train_matrix[user], N=top_k)
        top_k_user_news = test[
            (test['userId_idx'] == user) & (test['engagement_score'] > 0)
        ].nlargest(top_k, 'engagement_score')['history_idx'].to_numpy()
        intersection = len(set(item).intersection(set(top_k_user_news)))
        precision = intersection / top_k
        precisions.append(precision)
    return np.round(np.mean(precision), 4)

In [155]:
test_df = test.head(2000)
test_df2 = test.copy()
score = calculate_top_k_precision(news_model, test_df, 10, train_matrix)
print(score)

Processing Test: 100%|██████████| 1970/1970 [02:37<00:00, 12.48it/s]

0.1





In [26]:
# # Get userId from user index
# user_index = 0 
# userId = user_index_to_id.get(user_index, None) 
# print(f"userId for index {user_index}: {userId}")  # Output: userId for index 0: 1

# Get productId from item index
item_index = 6498567
productId = item_index_to_id.get(item_index, None) 
print(f"news for index {item_index}: {productId}")  # Output: productId for index 1: B

news for index 6498567:  1c27cf97-b20c-4e40-b1f1-288b721517b3


## Cosine Similarity

##### Data Prep

In [2]:
users_history = pd.read_parquet('local/user_clustered_recommend.parquet')
itens = pd.read_parquet('local/itens_text_db_scan.parquet')

In [4]:
print('Users History')
users_history.head(2)


Users History


Unnamed: 0,userId,history,year,week_of_year
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,2022,27
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,2022,27


In [5]:
print('Itens List')
itens.head(2)

Itens List


Unnamed: 0,page,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title,classes
0,13db0ab1-eea2-4603-84c4-f40a876c7400,Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053,NEGATIVE,0.60757,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"[0.028708808, 0.07910229, -0.04915501, 0.02104...",0
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,As expressões santarenas não significam apenas...,NEUTRAL,0.450133,NEUTRAL,0.68394,Linguajar dos santarenos é diferenciado e chei...,"[0.05066423, 0.053637918, -0.06246449, -0.0591...",1


In [6]:
merged_df = pd.merge(users_history, itens, how='left', left_on='history', right_on='page')
merged_df.head(2)

Unnamed: 0,userId,history,year,week_of_year,page,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title,classes
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,2022,27,c8aab885-433d-4e46-8066-479f40ba7fb2,Caminhoneira Aline Füchter ficou em pé em fren...,NEUTRAL,0.680773,NEGATIVE,0.767988,"Você viu? Musa das Estradas faz vídeo de pé,...","[-0.035558525, 0.0103199985, -0.017475873, -0....",0
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,2022,27,68d2039c-c9aa-456c-ac33-9b2e8677fba7,"Luana Rabello, segundo a polícia, é muito famo...",NEGATIVE,0.829965,NEUTRAL,0.659312,Mulher-Gato foi proibida de entrar na Maré ap...,"[-0.048941117, 0.024516182, -0.026205998, -0.0...",23


In [15]:
user = 'ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee059c6cd15025a75833'
subset_df = merged_df['embbed_title'][(merged_df['userId'] == user) & (merged_df['week_of_year'] == 32) & (merged_df['classes'] == 0)]
print(subset_df)

7258612    [0.02752195, -0.03881834, -0.07810406, -0.0629...
7258613    [0.053475667, 0.011896103, -0.07728048, 0.0010...
Name: embbed_title, dtype: object


In [7]:
grouped_train = merged_df.groupby(['userId', 'classes', 'year', 'week_of_year'])['embbed_title'].mean().reset_index()
grouped_train.head()

Unnamed: 0,userId,classes,year,week_of_year,embbed_title
0,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,0,2022,27,"[0.004265833180397749, 0.09042374789714813, -0..."
1,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,0,2022,28,"[-0.022484811022877693, 0.11090528219938278, -..."
2,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,0,2022,30,"[-0.00840277411043644, -0.003639867063611746, ..."
3,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,0,2022,32,"[0.008118255995213985, 0.06738191843032837, -0..."
4,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,11,2022,28,"[0.05941460281610489, -0.017168257385492325, -..."


In [8]:
sort = ['userId', 'week_of_year']
df_sorted = grouped_train.copy()
df_sorted.sort_values(by=sort, ascending=False, inplace=True)
df_sorted.head()

Unnamed: 0,userId,classes,year,week_of_year,embbed_title
3055849,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,0,2022,32,"[0.04049880802631378, -0.013461118564009666, -..."
3055848,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,0,2022,31,"[-0.017473912487427395, 0.06039712826410929, -..."
3055847,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,0,2022,30,"[0.036560624837875366, 0.06149597465991974, -0..."
3055846,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,0,2022,28,"[-0.011680557392537594, 0.020306464284658432, ..."
3055850,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,63,2022,28,"[-0.04010026901960373, 0.07792015373706818, -0..."


In [93]:
id = '000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951668074de9c6a0faf15'
user = df_sorted[df_sorted['userId']== id]
user

Unnamed: 0,userId,classes,year,week_of_year,embbed_title
3,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,0,2022,32,"[0.008118255995213985, 0.06738191843032837, -0..."
2,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,0,2022,30,"[-0.00840277411043644, -0.003639867063611746, ..."
6,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,13,2022,29,"[-0.01422516256570816, 0.029288195073604584, -..."
1,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,0,2022,28,"[-0.022484811022877693, 0.11090528219938278, -..."
4,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,11,2022,28,"[0.05941460281610489, -0.017168257385492325, -..."
5,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,13,2022,28,"[0.023052755743265152, 0.029235471040010452, -..."
0,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,0,2022,27,"[0.004265833180397749, 0.09042374789714813, -0..."
7,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,19,2022,26,"[0.02536534145474434, 0.06137628108263016, -0...."


In [None]:
embeddings_array = np.array(subset_df.tolist())
mean_embedding = np.mean(embeddings_array, axis=0)
print(mean_embedding)

In [None]:
embbeding_group_by = df_sorted['embbed_title'][3]
print(embbeding_group_by == mean_embedding)
news_data = itens[['page', 'embbed_title']][itens['classes']==0]

In [96]:
news_map = {u: i for i, u in enumerate(news_data['page'])}
news_index_to_page = {v: k for k, v in news_map.items()}
news_data['news_idx'] = news_data['page'].map(news_map)
news_data['embbed_title'] = news_data['embbed_title'].to_numpy()
news_data.head()

Unnamed: 0,page,embbed_title,news_idx
0,13db0ab1-eea2-4603-84c4-f40a876c7400,"[0.028708808, 0.07910229, -0.04915501, 0.02104...",0
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,"[-0.02016749, 0.13216536, 0.03784579, -0.01504...",1
4,9dff71eb-b681-40c7-ac8d-68017ac36675,"[0.031508345, 0.10205304, 0.03989872, 0.017708...",2
5,a9fd6d34-6f40-4c90-849b-2ad36f04fd6f,"[0.020607894, 0.09861154, 0.016439063, 0.03443...",3
7,682da2fa-6f5b-4017-be35-7968990f62b9,"[-0.022007013, 0.077335976, -0.005180943, -0.0...",4


In [97]:
new_cosine_df = news_data[['news_idx', 'embbed_title']]
new_cosine_df['embbed_title'] = new_cosine_df['embbed_title'].to_numpy()
new_cosine_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_cosine_df['embbed_title'] = new_cosine_df['embbed_title'].to_numpy()


Unnamed: 0,news_idx,embbed_title
0,0,"[0.028708808, 0.07910229, -0.04915501, 0.02104..."
3,1,"[-0.02016749, 0.13216536, 0.03784579, -0.01504..."
4,2,"[0.031508345, 0.10205304, 0.03989872, 0.017708..."
5,3,"[0.020607894, 0.09861154, 0.016439063, 0.03443..."
7,4,"[-0.022007013, 0.077335976, -0.005180943, -0.0..."


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_with_mean_embeddings(user_id, user_mean_embeddings, news_dataset, top_n=10):
    # Step 1: Get the user's mean embedding
    user_embedding = user_mean_embeddings[user_mean_embeddings["userid"] == user_id]["mean_embedding"].values[0]

    # Step 2: Compute similarity with all articles
    news_dataset["similarity"] = cosine_similarity(
        np.vstack(news_dataset["embedding"]),  # All news embeddings
        user_embedding.reshape(1, -1)         # User's mean embedding
    ).flatten()

    # Step 3: Sort articles by similarity
    recommended_articles = news_dataset.sort_values(by="similarity", ascending=False)

    # Step 4: Return top N recommendations
    return recommended_articles.head(top_n)


In [86]:
def recomend_for_user(user_mean_embedding, news_dataset, top_n=20):
    user_mean_embedding = np.array(user_mean_embedding).reshape(1, -1)

    # Calculate cosine similarity
    cosine_similarities = cosine_similarity(
        np.vstack(news_dataset["embbed_title"]), 
        user_mean_embedding
    ).flatten()

    # Create a new column in the DataFrame
    news_dataset['cosine_similarity'] = cosine_similarities

    recommended_articles = news_dataset.sort_values(by="cosine_similarity", ascending=False)
    return recommended_articles.head(top_n)

In [106]:
user = 'd0afad7ea843d86597d822f0df1d39d31a3fea7c39fdeee870d49b897e1e99cd'
mean_embedding = df_sorted['embbed_title'][df_sorted['userId']==user].to_numpy()[0]
recommendations = recomend_for_user(user_mean_embedding=mean_embedding, news_dataset=new_cosine_df)

In [107]:
recommendations.head(10)

Unnamed: 0,news_idx,embbed_title,cosine_similarity
215100,150611,"[-0.0029798776, 0.04611043, -0.012109296, 0.00...",0.612313
225278,157829,"[0.009820029, 0.054551758, -0.059913233, 0.060...",0.599516
65736,46013,"[-0.0270126, 0.03152837, -0.09005802, -0.05341...",0.598899
28442,19836,"[0.07163248, 0.07477959, -0.056716375, 0.02677...",0.59242
27230,18983,"[0.023007842, 0.04212488, -0.005578505, 0.0591...",0.589199
9509,6584,"[0.017490722, 0.085587874, -0.0069175386, 0.02...",0.585967
25541,17820,"[-0.021641925, 0.07678638, 0.02513109, 0.04916...",0.582391
115162,80696,"[0.06261058, 0.1349357, 0.01617485, 0.00537230...",0.581189
4390,3037,"[0.05541889, 0.05236211, -0.014302052, 0.06342...",0.580517
37514,26225,"[0.04486997, 0.0019674639, -0.035819642, 0.004...",0.580397


In [105]:
recommendations['news_page'] = recommendations['news_idx'].map(news_index_to_page)
recommendations.head(20)

Unnamed: 0,news_idx,embbed_title,cosine_similarity,news_page
232249,162740,"[0.008118256, 0.06738192, -0.016155554, -0.055...",1.0,6be11f1b-fbcb-4bdc-9415-a52721d191fc
49012,34328,"[-0.002906414, 0.045118153, -0.0641471, 0.0063...",0.796211,ecb1ee44-43a5-46da-8611-1f6735a00bc8
48435,33928,"[0.0061627463, 0.010136838, -0.038152494, -0.0...",0.762711,dec1bfbc-aad9-439f-b5e0-e1949d531d14
54992,38466,"[-0.016498705, 0.009403606, 0.0048239175, -0.0...",0.752613,7fb38dc7-cc5b-4c08-a9bc-3a0b5765491b
207741,145537,"[0.059679277, 0.024589594, -0.02330569, -0.054...",0.747368,8255460c-90d6-489a-8055-5728faebe034
210795,147634,"[0.016739227, 0.02350609, -0.022927972, -0.034...",0.744175,7b8a4657-f7d2-44e6-9113-b499f023a5a5
101651,71249,"[0.012881902, 0.040906116, -0.067896694, -0.09...",0.738863,e81a190c-4cac-48c3-9f1b-c8c852962d2e
188736,132164,"[0.037167452, 0.021260876, -0.026327806, -0.00...",0.737503,13a79756-74e6-4a0f-83e6-81edb8b26740
243613,170639,"[-0.037924685, 0.07724636, -0.050583534, -0.05...",0.734887,a539b5a7-7e76-4475-882a-531e4695babf
164194,115019,"[0.041703068, 0.06591424, -0.026409764, 0.0070...",0.731658,c53a27a0-21b8-40a8-9a84-ca3d5b70261f


##### Test 2

In [2]:
users_history = pd.read_parquet('local/user_full_features.parquet')
itens = pd.read_parquet('local/itens_text_db_scan.parquet')

In [3]:
users_history.head(1)

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new,...,days_since_click,day_of_week,hour_of_day,year,week_of_year,time_normalized,time_decay_weight,time_on_page_minutes,engagement_score_pca,engagement_score
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,c8aab885-433d-4e46-8066-479f40ba7fb2,1657146417045,76,20380,50.3,2,1657146417045,...,-39,Wednesday,22,2022,27,0.886364,-0.999911,0.339667,0.417061,40.793976


In [4]:
itens.head(1)

Unnamed: 0,page,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title,classes
0,13db0ab1-eea2-4603-84c4-f40a876c7400,Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053,NEGATIVE,0.60757,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"[0.028708808, 0.07910229, -0.04915501, 0.02104...",0


In [5]:
user_df = users_history[['userId', 'history', 'date']]
user_df.head()

Unnamed: 0,userId,history,date
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,2022-07-06
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,2022-07-06
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,13e423ce-1d69-4c78-bc18-e8c8f7271964,2022-07-06
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,3325b5a1-979a-4cb3-82b6-63905c9edbe8,2022-07-01
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,2022-07-02


In [6]:
news_df = itens[['page', 'embbed_title']]
news_df.head()

Unnamed: 0,page,embbed_title
0,13db0ab1-eea2-4603-84c4-f40a876c7400,"[0.028708808, 0.07910229, -0.04915501, 0.02104..."
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,"[0.05066423, 0.053637918, -0.06246449, -0.0591..."
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,"[-0.04080393, 0.08898491, -0.041833814, -0.010..."
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,"[-0.02016749, 0.13216536, 0.03784579, -0.01504..."
4,9dff71eb-b681-40c7-ac8d-68017ac36675,"[0.031508345, 0.10205304, 0.03989872, 0.017708..."


In [7]:
user_news_df = pd.merge(user_df, news_df, how='left', left_on='history', right_on='page')
user_news_df.head(2)

Unnamed: 0,userId,history,date,page,embbed_title
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,2022-07-06,c8aab885-433d-4e46-8066-479f40ba7fb2,"[-0.035558525, 0.0103199985, -0.017475873, -0...."
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,2022-07-06,68d2039c-c9aa-456c-ac33-9b2e8677fba7,"[-0.048941117, 0.024516182, -0.026205998, -0.0..."


In [9]:
user_news_group_df = user_news_df.groupby(['userId', 'date'])['embbed_title'].mean().reset_index()
user_news_group_df.head()

Unnamed: 0,userId,date,embbed_title
0,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,2022-07-02,"[0.02536534145474434, 0.06137628108263016, -0...."
1,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,2022-07-05,"[0.004265833180397749, 0.09042374789714813, -0..."
2,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,2022-07-11,"[0.023052755743265152, 0.029235471040010452, -..."
3,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,2022-07-14,"[0.05941460281610489, -0.017168257385492325, -..."
4,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,2022-07-15,"[-0.022484811022877693, 0.11090528219938278, -..."


In [10]:
sort = ['userId', 'date']
user_news_group_df.sort_values(by=sort, ascending=False, inplace=True, ignore_index=True)
user_news_group_df.head()

Unnamed: 0,userId,date,embbed_title
0,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-08-08,"[0.04049880802631378, -0.013461118564009666, -..."
1,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-08-04,"[-0.03390497714281082, 0.07311740517616272, -0..."
2,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-08-02,"[0.010935629718005657, 0.04613644629716873, -0..."
3,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-08-01,"[-0.02945239096879959, 0.06193753704428673, -0..."
4,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-07-28,"[0.018855249509215355, 0.09517772495746613, 0...."


In [11]:
teste = pd.read_parquet('local/teste.parquet')
teste.head()

Unnamed: 0,userId,acessos_futuros
0,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,e67c8cdf-3c55-4399-a864-3c1591225296
1,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,68ed45c8-71b7-4b88-bcde-1695d741aa42
2,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,6f81f339-dc75-4cb1-b12f-d20a344ae64d
3,1505326617b9465f6e13eb1d0d9782bff2af61822a7bc7...,61e07f64-cddf-46f2-b50c-ea0a39c22050
4,9ade38ffe62f55863100f505b9b9be170f7b50c36ca6b5...,esid:conteudo_editorial_g1#materia#https://esp...


In [12]:
print(teste['userId'][0])

3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6ab4278280fdc1e7b9ad


In [14]:
# user = df_sorted[df_sorted['userId']== id]
user = '3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6ab4278280fdc1e7b9ad'
# user_news_group_df.head()
user_test_df = user_news_group_df[user_news_group_df['userId']== user]
user_test_df.head()

Unnamed: 0,userId,date,embbed_title
2206111,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,2022-07-22,"[0.04608006775379181, 0.11893259733915329, -0...."
2206112,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,2022-07-05,"[-0.08211548626422882, 0.045601118355989456, -..."
2206113,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,2022-07-01,"[-0.030455540865659714, 0.024764083325862885, ..."


In [15]:
# user = df_sorted[df_sorted['userId']== id]
#e67c8cdf-3c55-4399-a864-3c1591225296
news = 'e67c8cdf-3c55-4399-a864-3c1591225296'
test_search = itens[itens['page']== news]
test_search.head()

Unnamed: 0,page,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title,classes
158780,e67c8cdf-3c55-4399-a864-3c1591225296,Decreto com nomeação foi publicado nesta quint...,NEUTRAL,0.549095,NEGATIVE,0.847603,Bolsonaro nomeia para juiz do TRT no Piauí adv...,"[-0.03045554, 0.024764083, -0.021136409, -0.05...",0


### Recommendation System Tests

* Suggestion of workflow: https://chatgpt.com/c/67794a16-1e20-8008-bae7-2751a94583d7

* ALS: https://sophwats.github.io/2018-04-05-gentle-als.html