## Libs

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from typing import List
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

## Data Preparation

### Parquet Imports

In [4]:
# Per-user click history with engineered features (columns are printed a few cells below).
users_history = pd.read_parquet('local/user_full_features.parquet')
# News articles: metadata, sentiment fields and the title embedding column 'embbed_title'.
itens = pd.read_parquet('local/itens_text.parquet')
# Test set with columns 'userId' and 'acessos_futuros' (the page id accessed).
test_predictions = pd.read_parquet('local/teste.parquet')
# Article bodies plus a body-level embedding column 'content_embbeding'.
content_embbeding = pd.read_parquet('local/content_embedding.parquet')
# Presumably the interaction table for the collaborative-filtering model -- TODO confirm schema.
colab_filter_df = pd.read_parquet('local/acessos_filtrados.parquet')

### Train Data Preparation

In [3]:
# Peek at a single row to check the columns of the body-embedding table.
content_embbeding.head(1)

Unnamed: 0,page,body,cleaned_body,content_embbeding
0,13db0ab1-eea2-4603-84c4-f40a876c7400,"Após audiência de custódia, a Justiça do Amazo...","Após audiência de custódia, a Justiça do Amazo...","[0.020625813, 0.05944126, -0.052852374, -0.062..."


In [6]:
# Peek at a single row to check the columns of the news table.
itens.head(1)

Unnamed: 0,page,url,issued,modified,title,body,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,2023-04-15 00:02:08+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"Após audiência de custódia, a Justiça do Amazo...",Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053,NEGATIVE,0.60757,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"[0.028708808, 0.07910229, -0.04915501, 0.02104..."


In [7]:
# List the columns of both frames, with a blank line between the two listings.
print(
    'User Columns',
    users_history.columns,
    '',
    'News Columns',
    itens.columns,
    sep='\n',
)

User Columns
Index(['userId', 'userType', 'historySize', 'history', 'timestampHistory',
       'numberOfClicksHistory', 'timeOnPageHistory', 'scrollPercentageHistory',
       'pageVisitsCountHistory', 'timestampHistory_new', 'timestamp', 'date',
       'days_since_click', 'day_of_week', 'hour_of_day', 'year',
       'week_of_year', 'time_normalized', 'time_decay_weight',
       'time_on_page_minutes', 'engagement_score_pca', 'engagement_score'],
      dtype='object')

News Columns
Index(['page', 'url', 'issued', 'modified', 'title', 'body', 'caption',
       'title_sentiment_label', 'title_sentiment_score',
       'caption_sentiment_label', 'caption_sentiment_score', 'cleaned_title',
       'embbed_title'],
      dtype='object')


In [8]:
# The news frame needs a calendar-date column as well: parse `issued` as a
# timestamp and derive `date` from it.
itens = itens.assign(
    issued=lambda frame: pd.to_datetime(frame['issued']),
    date=lambda frame: frame['issued'].dt.date,
)
itens.head(1)

Unnamed: 0,page,url,issued,modified,title,body,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title,date
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,2023-04-15 00:02:08+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"Após audiência de custódia, a Justiça do Amazo...",Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053,NEGATIVE,0.60757,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"[0.028708808, 0.07910229, -0.04915501, 0.02104...",2022-06-18


In [9]:
# Attach the title embedding of every consumed article to its history row.
# Left join: history rows whose page is absent from `itens` are kept.
user_consumption = users_history.loc[:, ['userId', 'history', 'date']]
news_data = itens.loc[:, ['page', 'embbed_title']]
user_news_history = user_consumption.merge(
    news_data,
    how='left',
    left_on='history',
    right_on='page',
)
user_news_history.head(2)

Unnamed: 0,userId,history,date,page,embbed_title
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,2022-07-06,c8aab885-433d-4e46-8066-479f40ba7fb2,"[-0.035558525, 0.0103199985, -0.017475873, -0...."
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,2022-07-06,68d2039c-c9aa-456c-ac33-9b2e8677fba7,"[-0.048941117, 0.024516182, -0.026205998, -0.0..."


In [10]:
# One mean title embedding per (user, day), ordered descending by user id then date.
sort = ['userId', 'date']
cosine_base_df = (
    user_news_history
    .groupby(sort)['embbed_title']
    .mean()
    .reset_index()
    .sort_values(by=sort, ascending=False, ignore_index=True)
)
cosine_base_df.head()

Unnamed: 0,userId,date,embbed_title
0,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-08-08,"[0.04049880802631378, -0.013461118564009666, -..."
1,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-08-04,"[-0.03390497714281082, 0.07311740517616272, -0..."
2,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-08-02,"[0.010935629718005657, 0.04613644629716873, -0..."
3,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-08-01,"[-0.02945239096879959, 0.06193753704428673, -0..."
4,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,2022-07-28,"[0.018855249509215355, 0.09517772495746613, 0...."


In [12]:
# Take an explicit copy before adding columns: assigning onto a column subset of
# `itens` is what raises pandas' SettingWithCopyWarning (hidden here because
# warnings are globally silenced in the imports cell).
news_base = itens[['page', 'embbed_title', 'date']].copy()
# Integer id for every page, plus the reverse lookup from id back to page.
news_map = {page: idx for idx, page in enumerate(news_base['page'])}
news_index_to_page = {idx: page for page, idx in news_map.items()}
news_base['news_idx'] = news_base['page'].map(news_map)
# The former `news_base['embbed_title'] = news_base['embbed_title'].to_numpy()`
# re-assigned the column to its own values (a no-op) and has been dropped.
new_cosine_df = news_base[['news_idx', 'date', 'embbed_title']]
new_cosine_df.head()

Unnamed: 0,news_idx,date,embbed_title
0,0,2022-06-18,"[0.028708808, 0.07910229, -0.04915501, 0.02104..."
1,1,2019-06-20,"[0.05066423, 0.053637918, -0.06246449, -0.0591..."
2,2,2022-07-08,"[-0.04080393, 0.08898491, -0.041833814, -0.010..."
3,3,2021-09-09,"[-0.02016749, 0.13216536, 0.03784579, -0.01504..."
4,4,2021-09-15,"[0.031508345, 0.10205304, 0.03989872, 0.017708..."


### Test Data Preparation

In [3]:
# Peek at a single row of the test set.
test_predictions.head(1)

Unnamed: 0,userId,acessos_futuros
0,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,e67c8cdf-3c55-4399-a864-3c1591225296


In [13]:
# Look up the `date` of every future access in the news table; the left join
# keeps test rows even when the page is missing from `itens`.
test_df = (
    test_predictions
    .merge(itens, how='left', left_on='acessos_futuros', right_on='page')
    .loc[:, ['userId', 'acessos_futuros', 'date']]
)
test_df.head(10)

Unnamed: 0,userId,acessos_futuros,date
0,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,e67c8cdf-3c55-4399-a864-3c1591225296,2022-07-22
1,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,68ed45c8-71b7-4b88-bcde-1695d741aa42,2022-07-05
2,3f3491a8fc9ed10caad74f95d22efcff9537bcaa631e6a...,6f81f339-dc75-4cb1-b12f-d20a344ae64d,2022-05-03
3,1505326617b9465f6e13eb1d0d9782bff2af61822a7bc7...,61e07f64-cddf-46f2-b50c-ea0a39c22050,2022-07-08
4,9ade38ffe62f55863100f505b9b9be170f7b50c36ca6b5...,esid:conteudo_editorial_g1#materia#https://esp...,2021-01-07
5,2bf42dcfb112d488aa389ac1c2f088912d8fd16e1128ab...,ecc37a22-b730-4e3a-bc87-c3ba3403acbc,2022-08-07
6,2bf42dcfb112d488aa389ac1c2f088912d8fd16e1128ab...,9a4bb232-f1b6-4cd1-ba6c-c4696952153b,2022-08-07
7,2bf42dcfb112d488aa389ac1c2f088912d8fd16e1128ab...,a36c98b5-f159-48f8-9f5a-1fc6ea9956c8,2022-08-07
8,2bf42dcfb112d488aa389ac1c2f088912d8fd16e1128ab...,d031af1b-f939-47c1-a589-e6d691b66d91,2022-08-05
9,2bf42dcfb112d488aa389ac1c2f088912d8fd16e1128ab...,bf257382-74fb-4392-ad6a-143240e39f81,2022-08-05


In [34]:
# Row 4 holds a page id that is truncated in the table above; print it in full.
print(test_df.loc[4, 'acessos_futuros'])

esid:conteudo_editorial_g1#materia#https://especiais.g1.globo.com/bemestar/coronavirus/estados-brasil-mortes-casos-media-movel/


In [35]:
# Check that the non-UUID page id printed above exists in the news table.
news_id = 'esid:conteudo_editorial_g1#materia#https://especiais.g1.globo.com/bemestar/coronavirus/estados-brasil-mortes-casos-media-movel/'
itens_test = itens.loc[itens['page'].eq(news_id)]
itens_test.head()

Unnamed: 0,page,url,issued,modified,title,body,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title,date
148623,esid:conteudo_editorial_g1#materia#https://esp...,http://especiais.g1.globo.com/bemestar/coronav...,2021-01-07 17:29:00+00:00,2021-01-07 17:29:00+00:00,Mortes e casos conhecidos de coronavírus nos e...,"Onde as mortes estão subindo, em estabilidade ...","Onde as mortes estão subindo, em estabilidade ...",NEGATIVE,0.831829,NEGATIVE,0.817671,Mortes e casos conhecidos de coronavírus nos e...,"[0.02349417, 0.029535519, -0.017670218, -0.022...",2021-01-07


## Cosine Similarity Function 1

In [54]:
def recomend_for_user(user_mean_embedding, news_dataset, top_n=20):
    """Rank candidate news by cosine similarity to a user's mean embedding.

    Args:
        user_mean_embedding: 1-D array-like embedding of the user; it is
            reshaped to a single row before scoring.
        news_dataset: DataFrame with an ``embbed_title`` column holding one
            embedding vector per row. The frame is not modified.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with the rows of ``news_dataset`` plus a
        ``cosine_similarity`` column, sorted by that column in descending
        order and truncated to ``top_n`` rows. When ``news_dataset`` has no
        rows, an empty frame with the extra column is returned.
    """
    if len(news_dataset) == 0:
        # np.vstack cannot stack zero vectors; there is nothing to rank anyway.
        return news_dataset.assign(cosine_similarity=np.array([], dtype=float)).head(top_n)

    user_vector = np.asarray(user_mean_embedding).reshape(1, -1)
    candidate_matrix = np.vstack(list(news_dataset["embbed_title"]))

    # One similarity score per candidate row.
    scores = cosine_similarity(candidate_matrix, user_vector).ravel()

    # `assign` returns a new frame, so the caller's (often sliced) DataFrame is
    # left untouched and no SettingWithCopyWarning is raised.
    ranked = news_dataset.assign(cosine_similarity=scores).sort_values(
        by="cosine_similarity", ascending=False
    )
    return ranked.head(top_n)

In [69]:
# Pick one test row and recommend news published on that row's date.
user_index = 5
user = test_df.loc[user_index, 'userId']
date = test_df.loc[user_index, 'date']
# Mean title embedding of what this user read on that date. `[0]` raises
# IndexError when the user has no history on that exact day.
mean_embedding = cosine_base_df.loc[
    (cosine_base_df['userId'] == user) & (cosine_base_df['date'] == date),
    'embbed_title',
].to_numpy()[0]
# Alternative kept from the original: use the previous day's embedding instead
# (needs `from datetime import timedelta`).
# mean_embedding = cosine_base_df['embbed_title'][(cosine_base_df['userId']==user) & (cosine_base_df['date']==(date - timedelta(days=1)))].to_numpy()[0]
# Copy the filtered slice so that adding a similarity column downstream never
# targets a view of `new_cosine_df`.
news_data = new_cosine_df[new_cosine_df['date'] == date].copy()
recommendations = recomend_for_user(user_mean_embedding=mean_embedding, news_dataset=news_data)

In [70]:
# Translate the integer news index back to the original page id.
recommendations = recommendations.assign(
    news_page=recommendations['news_idx'].map(news_index_to_page)
)
recommendations.head(20)

Unnamed: 0,news_idx,date,embbed_title,cosine_similarity,news_page
223063,223063,2022-08-07,"[0.004996886, 0.09067107, -0.03569399, -0.0021...",0.728295,8a1a6d66-3f56-4912-bd7b-15799cf88ff6
165787,165787,2022-08-07,"[0.027235927, 0.111826554, -0.0331444, -0.0037...",0.706082,b45b4770-bea9-435b-b7e0-fd1c4aeea65d
177436,177436,2022-08-07,"[0.049923375, 0.05646476, -0.034790017, -0.039...",0.67821,5b9b55cc-3691-4fd4-b9a2-3d0980448306
2453,2453,2022-08-07,"[0.06949148, 0.042263545, -0.036132116, 0.0418...",0.676604,ee836140-8538-4d68-89bf-fd0e063f8526
97603,97603,2022-08-07,"[0.05719966, 0.025742006, -0.053409074, -0.001...",0.675968,da41f9cd-1a3e-4b3f-87b8-a1300bdb89ea
11491,11491,2022-08-07,"[-0.019753207, 0.079257116, -0.015629806, -0.0...",0.663495,ed9b4472-df91-43bd-97e4-b45bcef2c1d5
120902,120902,2022-08-07,"[0.016126223, 0.055767562, -0.07354401, 0.0056...",0.650003,8c41ef47-52f8-44b8-8bf3-e36cb5165913
146981,146981,2022-08-07,"[0.039399534, 0.041705105, -0.03561073, -0.006...",0.647363,e5bcc305-edcc-4aec-9e4e-2869ab32ad20
233415,233415,2022-08-07,"[0.09870035, 0.078076564, 0.0038179946, 0.0094...",0.644061,6c062e00-7cd2-456b-82a3-8618b15b9f62
18055,18055,2022-08-07,"[-0.01707877, 0.13595459, -0.018904634, -0.052...",0.643883,18ed1b5f-17b5-4c24-a6dd-0a95010df325


#e67c8cdf-3c55-4399-a864-3c1591225296 = 0

#68ed45c8-71b7-4b88-bcde-1695d741aa42 = 1

#6f81f339-dc75-4cb1-b12f-d20a344ae64d = 2

#61e07f64-cddf-46f2-b50c-ea0a39c22050 = 3

#ecc37a22-b730-4e3a-bc87-c3ba3403acbc = 5

## Class Content Recommender

In [78]:
import pandas as pd
from typing import List
from datetime import timedelta
from datetime import date

class ContentRecomender:
    """Content-based recommender that suggests recent news similar to articles a user has read."""

    # Parquet file holding the news base (columns: page, date, content_embbeding).
    # TODO: replace with the Azure connection that retrieves this parquet.
    NEWS_BASE_PATH = 'local/content_embedding_cleaned.parquet'

    # Columns of every frame returned by the similarity ranking.
    RESULT_COLUMNS = ['page', 'cosine_similarity']

    def filter_news_by_date(self, news:pd.DataFrame, date:date) -> pd.DataFrame:
        """
        Keep only recent news so that the similarity calculation runs on a
        small candidate set. Period: D-3 up to and including ``date``.

        Input:
            1 - news: a pandas dataframe with the whole news base; it must
        have a ``date`` column comparable with ``datetime.date`` values
            2 - date: the reference day (a ``datetime.date``)

        Output:
            1 - filtered_news: the rows of ``news`` whose ``date`` lies in
        the inclusive window [date - 3 days, date]
        """
        start_date = date - timedelta(days=3)
        in_window = (news['date'] >= start_date) & (news['date'] <= date)
        return news[in_window]

    def calculate_cosine_similarity(self, reference_news:np.ndarray, filtered_news:pd.DataFrame, top_k:int = 3) -> pd.DataFrame:
        """
        Calculate the cosine similarity between one reference article and
        every article of the filtered news dataset, using the embedding of the
        article body (384-dimensional according to the original author's notes).

        Input:
            1 - reference_news: embedding vector of the reference article we
        want to find similar content for
            2 - filtered_news: a pandas dataframe with the candidate news; it
        must have ``page`` and ``content_embbeding`` columns and is NOT modified
            3 - top_k: how many similar articles to return

        Output:
            1 - a dataframe with columns ``page`` and ``cosine_similarity``
        holding the top_k most similar candidates, best first. It is empty
        when ``filtered_news`` has no rows.
        """
        if filtered_news.empty:
            # np.vstack cannot stack zero vectors; nothing to rank.
            return pd.DataFrame(columns=self.RESULT_COLUMNS)

        reference_vector = np.asarray(reference_news).reshape(1, -1)
        candidate_matrix = np.vstack(list(filtered_news["content_embbeding"]))

        # One similarity score per candidate row.
        scores = cosine_similarity(candidate_matrix, reference_vector).ravel()

        # `assign` builds a new frame, so the caller's (usually sliced)
        # dataframe is not mutated and no SettingWithCopyWarning is raised.
        ranked = filtered_news.assign(cosine_similarity=scores).sort_values(
            by="cosine_similarity", ascending=False
        )
        return ranked[self.RESULT_COLUMNS].head(top_k)

    def recommend(self, user_id:str, reference_news:pd.DataFrame) -> pd.DataFrame:
        """
        Orchestrate the recommendations: for every reference article of the
        user, rank the news published in the D-3..D window of that row by
        content similarity and collect the best matches.

        Input:
            1 - user_id: id of the user; currently unused, kept so callers do
        not need to change
            2 - reference_news: dataframe with one row per reference article;
        it must have ``history`` (page id) and ``date`` columns

        Output:
            1 - recommendations: dataframe with columns ``page`` and
        ``cosine_similarity``; the per-article rankings are stacked in input
        order, so a page may appear more than once. Reference articles with no
        embedding in the news base are skipped. Empty when nothing could be
        recommended.
        """
        news_base = pd.read_parquet(self.NEWS_BASE_PATH)
        results: List[pd.DataFrame] = []
        for _, row in reference_news.iterrows():
            matches = news_base.loc[news_base['page'] == row['history'], 'content_embbeding']
            if matches.empty:
                # No embedding stored for this article: nothing to compare
                # against, so skip it instead of failing with an IndexError.
                continue
            filtered_news = self.filter_news_by_date(news_base, row['date'])
            similar = self.calculate_cosine_similarity(matches.iloc[0], filtered_news)
            if not similar.empty:
                results.append(similar)

        if not results:
            # pd.concat raises on an empty list; return an empty, well-formed frame.
            return pd.DataFrame(columns=self.RESULT_COLUMNS)
        return pd.concat(results)
            

### Test filter_news_by_date

In [25]:
# Load the user feature table again as the input for the date-filter check.
test = pd.read_parquet('local/user_full_features.parquet')

In [26]:
# Show the two columns relevant to the date filter.
test[['userId', 'date']].head()

Unnamed: 0,userId,date
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,2022-07-06
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,2022-07-06
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,2022-07-06
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,2022-07-01
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,2022-07-02


In [21]:
# Reference day 2022-07-06: the filter should keep rows dated 2022-07-03 through 2022-07-06.
rec = ContentRecomender()
specific_date = date(2022, 7, 6)
result = rec.filter_news_by_date(news=test, date=specific_date)

In [24]:
# The distinct dates left after filtering should all lie within D-3..D of the reference day.
result[ 'date'].unique()

array([datetime.date(2022, 7, 6), datetime.date(2022, 7, 3),
       datetime.date(2022, 7, 4), datetime.date(2022, 7, 5)], dtype=object)

### Test calculate_cosine_similarity

In [27]:
# Load the cleaned body-embedding table used by the similarity check.
test2 = pd.read_parquet('local/content_embedding_cleaned.parquet')

In [28]:
# Inspect the first rows of the embedding table.
test2.head()

Unnamed: 0,page,content_embbeding
0,13db0ab1-eea2-4603-84c4-f40a876c7400,"[0.020625813, 0.05944126, -0.052852374, -0.062..."
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,"[0.008854383, 0.008774249, 0.0036499822, -0.09..."
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,"[-0.009452097, 0.04437582, -0.12068652, -0.081..."
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,"[-0.034880683, 0.11428494, 0.007874168, -0.049..."
4,9dff71eb-b681-40c7-ac8d-68017ac36675,"[0.017652063, 0.1163496, -0.0071748984, -0.048..."


In [41]:
# Use the first article as the reference and rows 2..21 as the candidate pool.
# `.iloc[0]` is positional, so it does not depend on the index labels of `test2`.
reference_news = test2['content_embbeding'].iloc[0]
# Copy the slice so a column added during scoring can never target a view of `test2`.
filtered_news = test2.iloc[2:22].copy()
rec2 = ContentRecomender()
result = rec2.calculate_cosine_similarity(reference_news, filtered_news)

In [42]:
# Inspect the highest-ranked candidates.
result.head()

Unnamed: 0,page,cosine_similarity
21,6e9150e5-57f8-4762-b5ea-d82164c91788,0.571627
17,dc4aea98-84c9-47cf-a173-cb58820d5f28,0.5687
16,71cdc0aa-902a-4b82-a518-c6e4411b5fac,0.554769
14,e5323b08-013b-408d-91c5-9c03460b689d,0.548929
11,da15522f-f679-4ff4-842c-9a212d5af519,0.544155


### Test recommend

In [61]:
# Inputs for the end-to-end check: the user history table and the raw news table.
test3 = pd.read_parquet('local/user_full_features.parquet')
news = pd.read_parquet('local/itens_text.parquet')

In [67]:
# Restrict the history to a single user and to the columns `recommend` reads.
user_id = 'f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069'
test3 = test3.loc[test3['userId'] == user_id, ['userId', 'history', 'date']]
test3.head()

Unnamed: 0,userId,history,date
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,2022-07-06
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,2022-07-06
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,13e423ce-1d69-4c78-bc18-e8c8f7271964,2022-07-06


In [None]:
# Build the (page, date) lookup without overwriting `news`, so this cell can be
# re-run: the previous version replaced `news` by a frame without `issued`,
# which made a second execution fail with a KeyError.
news_dates = news[['page']].assign(date=pd.to_datetime(news['issued']).dt.date)
# Merge only the embedding columns of `test2`. If `test2` is reloaded from the
# parquet written in the next cell it already carries a `date` column, which
# would otherwise turn into `date_x` / `date_y` after the merge.
news_base = news_dates.merge(test2[['page', 'content_embbeding']], on='page', how='inner')
news_base.head()

Unnamed: 0,page,date,content_embbeding
0,13db0ab1-eea2-4603-84c4-f40a876c7400,2022-06-18,"[0.020625813, 0.05944126, -0.052852374, -0.062..."
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,2019-06-20,"[0.008854383, 0.008774249, 0.0036499822, -0.09..."
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,2022-07-08,"[-0.009452097, 0.04437582, -0.12068652, -0.081..."
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,2021-09-09,"[-0.034880683, 0.11428494, 0.007874168, -0.049..."
4,9dff71eb-b681-40c7-ac8d-68017ac36675,2021-09-15,"[0.017652063, 0.1163496, -0.0071748984, -0.048..."


In [63]:
# Persist the merged news base (page, date, content_embbeding); this is the
# file that ContentRecomender.recommend reads.
# NOTE(review): `test2` was loaded from this same path earlier in the notebook,
# so this write overwrites the input that `news_base` was derived from.
out_filepath = 'local/content_embedding_cleaned.parquet'
news_base.to_parquet(out_filepath, index=False)

In [79]:
# End-to-end check: recommend similar news for every article in this user's history.
rec3 = ContentRecomender()
result = rec3.recommend(user_id, test3)

In [80]:
# Per-article rankings are concatenated, so the same page can show up more than once.
result.head(10)

Unnamed: 0,page,cosine_similarity
216648,54f6bbce-e5fa-489b-ad5c-22c16cfba652,0.776193
204214,eab4a8a1-8ba0-444b-87cc-494f544d9fe5,0.763773
117974,671771d0-b633-49b8-b962-32cbb4f7df25,0.761972
117769,4f578606-54f5-4585-89be-8f5c44163ed3,0.711044
191303,cf3e4825-4a54-4460-b458-938c973adafc,0.692773
150088,2dc077e2-33ce-4505-8b20-d9c440aa102a,0.684891
117974,671771d0-b633-49b8-b962-32cbb4f7df25,0.850075
72385,d1ae6fc9-e82c-41ec-90d5-793c8f3e2eb7,0.772199
70648,189a3fe6-058d-4415-b550-eeb791d0cdc3,0.725925


In [1]:
import joblib

# NOTE(review): absolute, machine-specific path (it looks like a local MLflow
# run directory); this cell will not run on another machine as written.
model_path = 'C:/Projects/Globo_Recommendation_FIAP/mlruns/453918242483694506/da20ee974e4d403b8bdc4fbdf006db3f/artifacts/colab_filter_model/model.pkl'

# Load the model
# joblib.load unpickles the file, which can execute arbitrary code -- only load
# artifacts from a trusted source.
model = joblib.load(model_path)


In [3]:
# NOTE: needs `colab_filter_df` from the "Parquet Imports" cell. The recorded
# run below raised NameError because that cell had not been executed in this
# kernel session.
colab_filter_df[['userId', 'history', 'flag_read']].head()

NameError: name 'colab_filter_df' is not defined

In [6]:
# Print a full user id to use as input for the prediction cell below.
print(colab_filter_df['userId'][0])

0004e1ddec9a5d67faa56bb734d733628a7841c10c7255c0c507b7d1d4114f06


In [2]:
# Score a single (user, item) pair with the loaded model.
uid= '0004e1ddec9a5d67faa56bb734d733628a7841c10c7255c0c507b7d1d4114f06'
iid='43b8e36b-5a0b-4c76-9adf-fb5366dbc330'
# Use the model for predictions
prediction = model.predict(uid, iid)
print(prediction)

user: 0004e1ddec9a5d67faa56bb734d733628a7841c10c7255c0c507b7d1d4114f06 item: 43b8e36b-5a0b-4c76-9adf-fb5366dbc330 r_ui = None   est = 1.00   {'was_impossible': False}


In [6]:
# Check which class the unpickled model object is.
print(type(model))

<class 'surprise.prediction_algorithms.matrix_factorization.SVD'>


In [3]:
# Check the type of the object returned by `model.predict`.
print(type(prediction))


<class 'surprise.prediction_algorithms.predictions.Prediction'>
