Notebook Created to test the data pipeline

In [1]:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from scipy.sparse import csr_matrix

For performance reasons you may prefer to download the parquet and add in your local folder to work.

If you prefer doing this, please create a folder called local inside 'challenfe_files'

In [2]:
itens = pd.read_parquet('local/itens.parquet')
# teste = pd.read_parquet('local/teste.parquet')
treino = pd.read_parquet('local/treino.parquet')
# validacao = pd.read_parquet('local/validacao.parquet')
# validacao_k = pd.read_parquet('local/validacao_k.parquet')

In [3]:

# Add the below functions to python (maybe utils?)

def split_multivalued_df(df: pd.DataFrame, split_columns: list) -> pd.DataFrame:
    df[split_columns] = df[split_columns].apply(lambda col: col.str.split(','))
    expanded_df = df.explode(split_columns, ignore_index=True)
    return expanded_df


def drop_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    dropped_df = df.drop(columns=columns_to_drop, axis=1)
    return dropped_df


def merge_dfs(
        df_a: pd.DataFrame,
        df_b: pd.DataFrame,
        df_a_key: str,
        df_b_key: str) -> pd.DataFrame:
    merged_df = pd.merge(df_a, df_b, left_on=df_a_key, right_on=df_b_key, how='inner')
    return merged_df


def adjust_date(df:pd.DataFrame, column_to_format:str):
    df[column_to_format] = pd.to_datetime(df[column_to_format]).dt.tz_localize(None)
    return df.sort_values(by=column_to_format)


def adjust_number_columns(df:pd.DataFrame, column_to_format:str):
    df[column_to_format] = pd.to_numeric(df[column_to_format])
    return df

Notes:

    1 - I have confirmed that timestampHistory_new and timestampHistory have the same value always so no reason to keep timestampHistory_new 

Pipeline Test

**TO-DO:**

*We need to review how to add the below pipeline in our project*

In [4]:
# pipeline kwargs
split_columns = ['history', 'timestampHistory', 'numberOfClicksHistory', 'timeOnPageHistory', 'scrollPercentageHistory', 'pageVisitsCountHistory']
columns_to_drop = ['userType', 'historySize', 'timestampHistory_new']
df_a_key = 'history'
df_b_key = 'page'

pipeline = Pipeline(
    steps=[
        ('drop_columns', FunctionTransformer(drop_columns, kw_args={'columns_to_drop': columns_to_drop})),
        ('split_multivalued_df', FunctionTransformer(split_multivalued_df, kw_args={'split_columns': split_columns})),
        ('merge', FunctionTransformer(merge_dfs, kw_args={'df_b': itens, 'df_a_key': df_a_key, 'df_b_key': df_b_key}))
    ]
)

In [5]:
new_df = pipeline.transform(treino)
knn_df = new_df.copy()

In [6]:
knn_drop = ['timestampHistory', 'numberOfClicksHistory','timeOnPageHistory','pageVisitsCountHistory', 'page', 'url', 'modified', 'body', 'caption']
date_to_format = 'issued'
number_to_format = 'scrollPercentageHistory'
knn_df = drop_columns(knn_df, knn_drop)
knn_df = adjust_date(knn_df, date_to_format)
knn_df = adjust_number_columns(knn_df, number_to_format)
knn_df.head()

Unnamed: 0,userId,history,scrollPercentageHistory,issued,title
538986,5bbe5d34f92fbbc8854cd5468ecc0db28a25e54388bec3...,esid:conteudo_editorial_g1#materia#https://esp...,24.48,2015-11-17 08:04:41,Conexões da Lava Jato
29796,d81b7e5879cbf74a447aa1e20f584d4eba30604ede2365...,esid:conteudo_editorial_g1#materia#http://espe...,37.38,2015-12-01 23:04:41,Calculadora de Combustível
341736,335fe8c2c4282d3ca8c33f01b412374f1d786b2703b385...,esid:conteudo_editorial_g1#materia#http://espe...,82.23,2015-12-01 23:04:41,Calculadora de Combustível
208894,9225d6cdd374256d25ee877b6c410b82446a73866074b8...,f1640c36-2c34-4cf4-bd5e-bdc73363b15d,29.32,2015-12-04 18:17:03,"Australiano flagra lagarto de 1,5 m escalando ..."
378396,55ef18f56791794a0e52eb6e4db69e62be29fa2884288e...,f1640c36-2c34-4cf4-bd5e-bdc73363b15d,54.27,2015-12-04 18:17:03,"Australiano flagra lagarto de 1,5 m escalando ..."


### Colaborative Filtering Test

In [8]:
# Create a sparse matrix
row = new_df['userId'].astype('category').cat.codes
col = new_df['title'].astype('category').cat.codes
data = new_df['scrollPercentageHistory'].astype('float32')

sparse_matrix = csr_matrix((data, (row, col)))

print(sparse_matrix)


<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 577942 stored elements and shape (577942, 78489)>
  Coords	Values
  (0, 4162)	46.22999954223633
  (1, 1130)	15.59000015258789
  (2, 33749)	10.479999542236328
  (3, 73553)	73.83000183105469
  (4, 3317)	23.459999084472656
  (5, 4298)	56.08000183105469
  (6, 75419)	61.41999816894531
  (7, 10454)	25.239999771118164
  (8, 15740)	53.369998931884766
  (9, 43863)	57.04999923706055
  (10, 16069)	26.459999084472656
  (11, 26714)	22.219999313354492
  (12, 47315)	33.209999084472656
  (13, 66299)	46.58000183105469
  (14, 53084)	40.630001068115234
  (15, 18313)	35.369998931884766
  (16, 77601)	38.41999816894531
  (17, 63154)	45.709999084472656
  (18, 13121)	32.45000076293945
  (19, 5853)	42.25
  (20, 32988)	16.8799991607666
  (21, 60373)	25.229999542236328
  (22, 56738)	16.469999313354492
  (23, 51319)	57.310001373291016
  (24, 18074)	13.829999923706055
  :	:
  (577917, 69496)	23.920000076293945
  (577918, 61434)	24.139999389648438
  (577

In [9]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

In [10]:
# Remove rows with no interactions
sparse_matrix = sparse_matrix[sparse_matrix.getnnz(axis=1) > 0]

# Normalize the sparse matrix row-wise
sparse_matrix_normalized = normalize(sparse_matrix, norm='l2', axis=1)

In [11]:
# Train a KNN model
model = NearestNeighbors(metric='cosine')
model.fit(sparse_matrix_normalized)

In [12]:
# Find neighbors for a user (e.g., the first user in the matrix)
distances, indices = model.kneighbors(sparse_matrix[0], n_neighbors=3000)

print("Distances:", distances)
print("Indices of neighbors:", indices)

Distances: [[0. 0. 0. ... 1. 1. 1.]]
Indices of neighbors: [[  5948   4616   3962 ... 577920 577905 577913]]


In [13]:
# Filter indices where distances are neither 0 nor 1
filtered_indices = [indices[0][i] for i in range(len(distances[0])) if distances[0][i] != 0 and distances[0][i] != 1]

print("Filtered User IDs (distances not 0 or 1):", filtered_indices)

Filtered User IDs (distances not 0 or 1): []


In [14]:
# Get the mapping from matrix index to userId
user_id_mapping = pd.Series(new_df['userId'].astype('category').cat.categories)

# Get the userId for index 500 and 577915
user_id_500 = user_id_mapping.iloc[500]
user_id_91104 = user_id_mapping.iloc[91104]

print("User ID for index 500:", user_id_500)
print("User ID for index 91104:", user_id_91104)


User ID for index 500: 003a7e970c4fa0a05483723b3139a68b86367eef3d03154fedb610119d41817b
User ID for index 91104: 28607d5a04b09f40921c1d6ee4e14e276f462a086a2c1e3b1a3e059daa16f403


In [15]:
# Get news consumed by each user
news_500 = new_df[new_df['userId'] == user_id_500]
news_91104 = new_df[new_df['userId'] == user_id_91104]

print("News consumed by User 500:\n", news_500[['title', 'scrollPercentageHistory']])
print("News consumed by User 91104:\n", news_91104[['title', 'scrollPercentageHistory']])


News consumed by User 500:
                                                     title  \
339684  Após Covid-19, professora de educação física l...   

       scrollPercentageHistory  
339684                   59.18  
News consumed by User 91104:
                                                     title  \
336454  Após Covid-19, professora de educação física l...   

       scrollPercentageHistory  
336454                   24.04  


In [16]:
# Find common news titles
common_titles = set(news_500['title']).intersection(set(news_91104['title']))

# Compare scroll percentage for common news
comparison = news_500[news_500['title'].isin(common_titles)].merge(
    news_91104[news_91104['title'].isin(common_titles)],
    on='title',
    suffixes=('_500', '_577915')
)

print("Comparison of common news:\n", comparison[['title', 'scrollPercentageHistory_500', 'scrollPercentageHistory_577915']])


Comparison of common news:
                                                title  \
0  Após Covid-19, professora de educação física l...   

  scrollPercentageHistory_500 scrollPercentageHistory_577915  
0                       59.18                          24.04  


In [17]:
# Get news consumed by each user
news_500 = new_df[new_df['userId'] == user_id_500]
news_91104 = new_df[new_df['userId'] == user_id_91104]

# Get the titles consumed by both users
titles_500 = set(news_500['title'])
titles_91104 = set(news_91104['title'])

print("Titles consumed by User 500:", titles_500)
print("Titles consumed by User 91104:", titles_91104)


Titles consumed by User 500: {'Após Covid-19, professora de educação física luta para recuperar movimento de perna:  Como se não tivesse esta parte '}
Titles consumed by User 91104: {'Após Covid-19, professora de educação física luta para recuperar movimento de perna:  Como se não tivesse esta parte '}


In [18]:
# Find titles consumed by user 577915 but not user 500
unique_titles_91104 = titles_91104 - titles_500

# Get the news rows for these unique titles
unique_titles_91104 = news_91104[news_91104['title'].isin(unique_titles_91104)]

print("News consumed by User 577915 but not User 500:\n", unique_titles_91104[['title', 'scrollPercentageHistory']])


News consumed by User 577915 but not User 500:
 Empty DataFrame
Columns: [title, scrollPercentageHistory]
Index: []


### Content Based recomendation system Test

In [7]:
## https://chatgpt.com/c/67704f84-b518-8008-840c-2a398638c9b5

In [9]:
embbegindg_df_crud = itens.copy()
embbegindg_df_crud.head()

Unnamed: 0,page,url,issued,modified,title,body,caption
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,2023-04-15 00:02:08+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"Após audiência de custódia, a Justiça do Amazo...",Jeferson da Silva Lima foi escoltado por agent...
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,2019-06-20 17:19:52+00:00,2023-06-16 20:19:15+00:00,Linguajar dos santarenos é diferenciado e chei...,Vista aérea de Santarém\nÁdrio Denner/ AD Prod...,As expressões santarenas não significam apenas...
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,2022-07-08 08:55:52+00:00,2023-04-15 04:25:39+00:00,Ex-premiê Shinzo Abe morre após ser baleado no...,Novo vídeo mostra que assassino de Shinzo Abe ...,Ex-primeiro-ministro foi atingido por tiros de...
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,2021-09-09 19:06:46+00:00,2023-06-07 17:44:54+00:00,"Relator no STF, Fachin vota contra marco tempo...","Relator no STF, Fachin vota contra marco tempo...",Ministro defendeu que posse indígena é diferen...
4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,2021-09-15 19:16:13+00:00,2023-06-07 17:43:39+00:00,"\nApós 2 votos, pedido de vista suspende julga...",Após um pedido de vista (mais tempo para análi...,"Pelo marco temporal, índios só podem reivindic..."


In [28]:
# creating base dataset
embbegindg_df = embbegindg_df_crud[['page', 'title', 'url']]
embbegindg_df.head()

Unnamed: 0,page,title,url
0,13db0ab1-eea2-4603-84c4-f40a876c7400,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,http://g1.globo.com/am/amazonas/noticia/2022/0...
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,Linguajar dos santarenos é diferenciado e chei...,http://g1.globo.com/pa/santarem-regiao/noticia...
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,Ex-premiê Shinzo Abe morre após ser baleado no...,http://g1.globo.com/mundo/noticia/2022/07/08/e...
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,"Relator no STF, Fachin vota contra marco tempo...",http://g1.globo.com/politica/noticia/2021/09/0...
4,9dff71eb-b681-40c7-ac8d-68017ac36675,"\nApós 2 votos, pedido de vista suspende julga...",http://g1.globo.com/politica/noticia/2021/09/1...


### Enriching base dataset with embbedings

In [30]:
embbegindg_df['cleaned_title'] = embbegindg_df['title'].str.strip()
embbegindg_df.drop(columns=['title'], axis=1, inplace=True)
embbegindg_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embbegindg_df['cleaned_title'] = embbegindg_df['title'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embbegindg_df.drop(columns=['title'], axis=1, inplace=True)


Unnamed: 0,page,url,cleaned_title
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,Caso Bruno e Dom: 3º suspeito tem prisão tempo...
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,Linguajar dos santarenos é diferenciado e chei...
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,Ex-premiê Shinzo Abe morre após ser baleado no...
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,"Relator no STF, Fachin vota contra marco tempo..."
4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,"Após 2 votos, pedido de vista suspende julgame..."


In [12]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Load a pre-trained sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [31]:
# Generate embeddings for each title
embbegindg_df['title_embedding'] = list(model.encode(embbegindg_df['cleaned_title'], convert_to_tensor=False))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embbegindg_df['title_embedding'] = list(model.encode(embbegindg_df['cleaned_title'], convert_to_tensor=False))


In [32]:
embbegindg_df.head()

Unnamed: 0,page,url,cleaned_title,title_embedding
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"[0.028708808, 0.07910229, -0.04915501, 0.02104..."
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,Linguajar dos santarenos é diferenciado e chei...,"[0.05066423, 0.053637918, -0.06246449, -0.0591..."
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,Ex-premiê Shinzo Abe morre após ser baleado no...,"[-0.04080393, 0.08898491, -0.041833814, -0.010..."
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,"Relator no STF, Fachin vota contra marco tempo...","[-0.02016749, 0.13216536, 0.03784579, -0.01504..."
4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,"Após 2 votos, pedido de vista suspende julgame...","[0.031508345, 0.10205304, 0.03989872, 0.017708..."


1 - Checking the G1 portal I can see that there are abou 20 categories for the news

2 - Unfortunately I am not fully able to confirm that these news are in these categories

3 - Checking th URL luckily I can see a categorization which can be realy usefull for the recomendation system

In [33]:
print(itens['url'][0])
print(itens['url'][2])
print(itens['url'][3])

http://g1.globo.com/am/amazonas/noticia/2022/06/18/caso-bruno-e-dom-3o-suspeito-tem-prisao-temporaria-decretada-pela-justica-do-am.ghtml
http://g1.globo.com/mundo/noticia/2022/07/08/ex-premie-shinzo-abe-morre-apos-ser-baleado-no-japao-diz-nhk.ghtml
http://g1.globo.com/politica/noticia/2021/09/09/relator-no-stf-fachin-vota-contra-marco-temporal-para-demarcacao-de-terras-indigenas.ghtml


In [113]:
def embbed_title(df: pd.DataFrame, model_to_embbed: SentenceTransformer, column_to_embbed: str, target_column: str) -> pd.DataFrame:
    # Clean and process the title column
    df['cleaned_title'] = df['title'].str.strip()
    df = df.drop(columns=['title'], axis=1)
    
    # Generate embeddings and add as a new column
    df[target_column] = list(model_to_embbed.encode(df['cleaned_title'].tolist(), convert_to_tensor=False))
    
    return df

def create_crud_categories(df:pd.DataFrame) -> pd.DataFrame:
    def extract_substring(url):
        try:
            start_index = url.index(".com/") + len(".com/")
            end_index = url.index("/20")
            return url[start_index:end_index]
        except ValueError:
            return ""  # Handle cases where ".com/" or "/20" is not found
    
    df['crud_categories'] = df['url'].apply(extract_substring)
    return df


# Function split crud categories
def split_crud_categories(df:pd.DataFrame) -> pd.DataFrame:
    df['grouped_categories'] = df['crud_categories'].str.split('/')
    return df


def extract_categories(df):
    # Add 'category' and 'sub_category' columns
    embbegindg_df['grouped_categories'] = embbegindg_df['grouped_categories'].to_list()
    df["category"] = df["grouped_categories"].apply(lambda x: x[1] if len(x) > 1 else None)
    df["sub_category"] = df["grouped_categories"].apply(lambda x: x[-1] if x else None)
    return df


#### Pipeline Test

For now, to make the test faster I am applying a subset of 200 records to my pipeline, to finish the tests is would be good to apply with the whole set (will take aroun 10 minutes to complete)

In [124]:
model_pipeline = SentenceTransformer('all-MiniLM-L6-v2')
itens_df = itens.copy()
column_to_embbed = 'cleaned_title'
target_column = 'embbed_title'
itens_columns_to_drop = ['modified', 'body', 'caption']
split_string='/', 
column_reference='crud_categories'

In [121]:
df_test_pipeline = itens_df.head(200)

In [122]:
itens_pipeline = Pipeline(
    steps=[
        ('create_embbeding', FunctionTransformer(
            embbed_title, 
            kw_args={
                'model_to_embbed': model_pipeline,
                'column_to_embbed': column_to_embbed,
                'target_column': target_column
            })),
        ('create_crud_categories', FunctionTransformer(create_crud_categories)),
        ('create_list_of_categories', FunctionTransformer(split_crud_categories)),
        ('create_category_and_subcategory', FunctionTransformer(extract_categories)),
        ('drop_unused_columns', FunctionTransformer(
            drop_columns, kw_args={'columns_to_drop': itens_columns_to_drop}
        ))

    ]
)

In [123]:
test_df = itens_pipeline.transform(df_test_pipeline)
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_title'] = df['title'].str.strip()


Unnamed: 0,page,url,issued,cleaned_title,embbed_title,crud_categories,grouped_categories,category,sub_category
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"[0.028708808, 0.07910229, -0.04915501, 0.02104...",am/amazonas/noticia,"[am, amazonas, noticia]",amazonas,noticia
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,2019-06-20 17:19:52+00:00,Linguajar dos santarenos é diferenciado e chei...,"[0.05066423, 0.053637918, -0.06246449, -0.0591...",pa/santarem-regiao/noticia,"[pa, santarem-regiao, noticia]",santarem-regiao,noticia
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,2022-07-08 08:55:52+00:00,Ex-premiê Shinzo Abe morre após ser baleado no...,"[-0.04080393, 0.08898491, -0.041833814, -0.010...",mundo/noticia,"[mundo, noticia]",noticia,noticia
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,2021-09-09 19:06:46+00:00,"Relator no STF, Fachin vota contra marco tempo...","[-0.02016749, 0.13216536, 0.03784579, -0.01504...",politica/noticia,"[politica, noticia]",noticia,noticia
4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,2021-09-15 19:16:13+00:00,"Após 2 votos, pedido de vista suspende julgame...","[0.031508345, 0.10205304, 0.03989872, 0.017708...",politica/noticia,"[politica, noticia]",noticia,noticia
