In [1]:
#disabe annoying warnings
import warnings
warnings.filterwarnings('ignore')
#imports
import numpy as np
import pandas as pd
import altair as alt
import textstat
import json

alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

Sources: [Recommender](https://github.com/recommenders-team/recommenders/tree/main/recommenders) - [MIND github](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md) - [Leaderboard MIND](https://paperswithcode.com/sota/news-recommendation-on-mind) - [TECNICAL REPORT OF MIND COMPETITION WINNER](https://msnews.github.io/competition.html) - [RecBole models](https://recbole.io/docs/user_guide/model_intro.html) - [medium: build rs w Bert](http://webcache.googleusercontent.com/search?q=cache:https://medium.com/mlearning-ai/build-news-recommendation-model-using-python-bert-and-faiss-10ea8c65e6c&sca_esv=585540903&strip=1&vwsrc=0) w/ [notebook](https://colab.research.google.com/drive/1uuQaagWNh7gexSQhchpOgGyPKYK2e6SU#scrollTo=ooBElUsT53JO) - [NRMS model](https://github.com/recommenders-team/recommenders/blob/main/examples/00_quick_start/nrms_MIND.ipynb) - [DLRM model](https://nvidia-merlin.github.io/HugeCTR/v3.5/notebooks/news-example.html) - [Kaggle](https://www.kaggle.com/code/accountstatus/mind-microsoft-news-recommendation-v2#Importing-The-Packages)

Need for:
- news **articles** representation, e.g. 
1. concat embedding (via CNN & Attention) of title + categories + subcat [Okura et al., 2017](https://aclanthology.org/P19-1033.pdf) | 
2. concat embedding (via KCNN) of titles + entity embedding [Wang et al., 2018](https://dl.acm.org/doi/pdf/10.1145/3178876.3186175)
- user **interest** representation, e.g. 
1. historical clicked news + GRU [Okura et al., 2017](https://aclanthology.org/P19-1033.pdf)
2. "       " " + KCNN

In [36]:
news_train = 'data/MINDsmall_train/news.tsv'
behavior_train = 'data/MINDsmall_train/behaviors.tsv'
entity_train = 'data/MINDsmall_train/entity_embedding.vec'
relation_train = 'data/MINDsmall_train/relation_embedding.vec'
#-------------------------------------------
news_test = 'data/MINDsmall_dev/news.tsv'
behavior_test = 'data/MINDsmall_dev/behaviors.tsv'
entity_test = 'data/MINDsmall_dev/entity_embedding.vec'
relation_test = 'data/MINDsmall_dev/relation_embedding.vec' 

def load_df(path):
    if 'news' in path:
        columns = ['News ID',
                "Category",
                "SubCategory",
                "Title",
                "Abstract",
                "URL",
                "Title Entities",
                "Abstract Entities"]
    
    elif 'behavior' in path:
        columns = ['Impression ID',
                "User ID",
                "Time",
                "History",
                "Impressions"]
    else:
        return pd.read_csv(path, sep='\t', header=None)
    
    df = pd.read_csv(path, sep='\t', header=None, names=columns)
    return df

news_train, news_test, behavior_train, behavior_test = map(load_df, [news_train, news_test, behavior_train, behavior_test])
entity_train, relation_train, entity_test, relation_test = map(load_df, [entity_train, relation_train, entity_test, relation_test])
print('MIND-small:')
print(f"{'Dataset':<15} {'Train shape':<20} {'Test shape'}")
print(f"{'-'*50}")
print(f"{'news':<15} {str(news_train.shape):<20} {news_test.shape}")
print(f"{'behavior':<15} {str(behavior_train.shape):<20} {behavior_test.shape}")
print(f"{'entity':<15} {str(entity_train.shape):<20} {entity_test.shape}")
print(f"{'relation':<15} {str(relation_train.shape):<20} {relation_test.shape}")

MIND-small:
Dataset         Train shape          Test shape
--------------------------------------------------
news            (51282, 8)           (42416, 8)
behavior        (156965, 5)          (73152, 5)
entity          (26904, 102)         (22893, 102)
relation        (1091, 102)          (1091, 102)


# News

In [3]:
print("news_train: ")
display(news_train.head(3)) 

news_train: 


Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [4]:
#unique values
print(pd.DataFrame({'Train': news_train.nunique(), 'Test': news_test.nunique()}))


                   Train   Test
News ID            51282  42416
Category              17     17
SubCategory          264    257
Title              50434  41823
Abstract           47309  39470
URL                51281  42416
Title Entities     34472  28800
Abstract Entities  36277  29889


In [5]:
#return all unique subcategory with string 'politics' in the name
string_politics = news_train[news_train['SubCategory'].str.contains('politic', case=False)]['SubCategory'].unique()
#print how many datasample in t hose two categories:
print(news_train[news_train['SubCategory'].isin(string_politics)]['SubCategory'].value_counts())


SubCategory
newspolitics         2826
newsworldpolitics       5
Name: count, dtype: int64


In [6]:
#print abstract of 10 samples from subategory x
print(news_train[news_train['SubCategory'] == 'newsworld']['Abstract'].sample(10).values)

['The 63-year-old English teacher was living in the Dominican Republic, and was found tortured in her own apartment after an apparent robbery.'
 'More Russian military police arrive in Syria under peace deal with Turkey'
 'Japan typhoon death toll rises to 66 as hopes for missing fade'
 'North Korea has lodged multiple protests over planned joint U.S.-South Korean war games, and said Wednesday it would respond to force in kind.'
 "WASHINGTON (AP)   Targeting Turkey's economy, President Donald Trump announced sanctions Monday aimed at restraining the Turks' assault against Kurdish fighters and civilians in Syria -- an assault Turkey began after Trump announced he was moving U.S. troops out of the way. Meanwhile, the Americans were scrambling for Syria's exits, a move criticized at home and abroad as opening the door to a resurgence of the Islamic State group whose violent..."
 'The commander of a Kurdish-led alliance urges the US and Russia to "rein in the Turks".'
 "Russia used social 

In [7]:
#missing values
print(pd.DataFrame({'Train': news_train.isna().sum(), 'Test': news_test.isna().sum()}))

                   Train  Test
News ID                0     0
Category               0     0
SubCategory            0     0
Title                  0     0
Abstract            2666  2021
URL                    0     0
Title Entities         3     2
Abstract Entities      4     2


In [8]:
news_train['Category'] = news_train['Category'].astype(str)
news_train['SubCategory'] = news_train['SubCategory'].astype(str)

click = alt.selection_multi(fields=['Category'])

# categories
category_chart = alt.Chart(news_train).mark_bar().encode(
    x=alt.X('Category:N', sort='-y'),
    y=alt.Y('count():Q'),
    color='Category:N',
    tooltip=['Category:N', 'count()'],
    opacity=alt.condition(click, alt.value(1), alt.value(0.2)) 
).add_selection(
    click
).properties(
    width=600,
    height=300,
    title='Category distribution'
)

# subcategories
subcategory_chart = alt.Chart(news_train).transform_filter(
    click 
).mark_bar().encode(
    x=alt.X('count():Q'),
    y=alt.Y('SubCategory:N', sort='-x'),
    color='Category:N',
    tooltip=['SubCategory:N', 'count()']
).properties(
    width=600,
    height=300,
    title='Subcategory distribution'
)

#concatenate charts
alt.vconcat(category_chart, subcategory_chart).configure_concat(spacing=30)


## <p style="text-align: center;"> Titles & Abstracts</p>
[Linguistic devices used in Newspaper headlines](https://www.researchgate.net/publication/364069851_Linguistic_Devices_Used_in_Newspaper_Headlines).

Assumption: "Paying attention to the headlines of the news, the reader may decide whether to read the entire article.(...)they constitute an indicator of the style and values of the news outlet"<br>
classification of reader's perception:  MCD (Membership Categorization Device) and Individualization Strategy (IS):<br> **MCD**  descriptor to categorize ppl into social groups or categories (identities) e.g. politicians, terrorist, victims. simplify complex social dynamic (issue w/ stereotipes, bias, story framing etc..) <br> **IS** descriptor of personal, unique, distinct characterization, towards individualization of the ppl.<br> 

possible label (4) MCD, IS, MCD+IS, None. via entities in title & abstract (rule-based, manually classified if low unique values vs n of entities mcd and is per sample if to be automated>@TO-DO: check n unique entities)

e.g. of MCD headlines:<br>

1.*Tech Giants* Pledge to Combat Climate Change (group affiliation)<br>
2.*Millennials* Are Changing the Workplace (social cat)<br>
3.*Americans* Demand More Renewable Energy" (national identity)<br>
<br>
e.g. of IS headlines:<br>

1.Nobel Laureate *Malala Yousafzai* Advocates for Girls' Education Worldwide (personal attr)<br>
2.CEO John Doe Apologizes for Company's Environmental Misconduct<br>
3.Refugee Turned Mayor Shares His Journey of Hope and Resilience<br>


[Gattani, Akshay. 2005. Maximum Entropy Discriminative Models for Headline Generation](https://summit.sfu.ca/_flysystem/fedora/sfu_migrate/2546/etd2783.pdf) : Types of headlines (and short summaries) can be categorized into INDICATIVE: headlines which indicate what topics are covered by the news story, INFORMATIVE: headlines which convey what particular concept, theme or event is covered in the news story and EYECATCHERS: headlines which do not inform about the content of the story but are designed to attract attention and entice people to read the story.<br>
[On newspaper headlines as relevance optimizers](https://www.researchgate.net/publication/229005694_On_Newspaper_Headlines_as_Relevance_Optimizers)--->kinda motivates 'reading time' as a feature for titles too

In [9]:
#descriptive stats for Title & Abstract(->latter has more missing values)
news_train['Abstract'] = news_train['Abstract'].fillna('')
abstract_len = news_train['Abstract'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
title_len = news_train['Title'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
print(pd.DataFrame({'Title': title_len.describe(), 'Abstract': abstract_len.describe()}))


              Title      Abstract
count  51282.000000  51282.000000
mean      10.754417     34.293319
std        3.265311     26.542819
min        1.000000      0.000000
25%        9.000000     15.000000
50%       10.000000     24.000000
75%       13.000000     62.000000
max       57.000000    474.000000


In [10]:
#add len column
news_train['AbstractLength'] = news_train['Abstract'].apply(lambda x: len(x.split()))
news_train['TitleLength'] = news_train['Title'].apply(lambda x: len(x.split()))
#drop duplicates
unique_abstract_lengths = news_train.drop_duplicates(subset=['Abstract', 'Category'])
unique_title_lengths = news_train.drop_duplicates(subset=['Title', 'Category'])
selection = alt.selection_multi(fields=['Category'], bind='legend')
title_chart = alt.Chart(unique_title_lengths).mark_bar().encode(
    x=alt.X('TitleLength:Q', title='Title Length'),
    y=alt.Y('count()', title='Number of Titles'),
    color='Category:N',
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    tooltip=['Category:N', 'count()', alt.Tooltip('TitleLength:Q', title='Title Length')]
).add_selection(
    selection
).properties(
    title='Histogram of Unique Title Lengths by Category',
    width=400,
    height=400
)
abstract_chart = alt.Chart(unique_abstract_lengths).mark_bar().encode(
    x=alt.X('AbstractLength:Q', title='Abstract Length'),
    y=alt.Y('count()', title='Number of Abstracts'),
    color='Category:N',
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    tooltip=['Category:N', 'count()', alt.Tooltip('AbstractLength:Q', title='Abstract Length')]
).add_selection(
    selection
).properties(
    title='Histogram of Unique Abstract Lengths by Category',
    width=400,
    height=400
)
combined_chart = alt.hconcat(title_chart, abstract_chart).properties(
    title='Histograms of Unique Title and Abstract Lengths by Category'
)
combined_chart.display()


Prob: on whole news body, but most urls are outdated<br>
[textstat](https://pypi.org/project/textstat/)<br>
readability: 
**[The Flesch Reading Ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease)**: higher scores indicate material that is easier to read; lower numbers mark passages that are more difficult to read.<br>
<img src="read_scores.png" alt="Drawing" style="width: 450px;"/><vr>
complexity: find one <br>
aggregate stat: **[Readability Consensus](https://pypi.org/project/textstat/)**: average of different statistical methods to measure readability and complexity

In [11]:
#Ease of Readability, Flesch Reading Ease
news_train['Title_Flesch_Reading_Ease'] = news_train['Title'].apply(textstat.flesch_reading_ease) #-----> does it actually make sense?
print(f"Before: {news_train.shape}")
#delete rows with missing ABSTRACT
news_train = news_train[news_train['Abstract'].notna()]
#delete abstracts with less than 3 words (S-V-O)
news_train = news_train[news_train['Abstract'].apply(lambda x: len(x.split()) > 3)]
print(f"After: {news_train.shape}")

news_train['Abstract_Flesch_Reading_Ease'] = news_train['Abstract'].apply(textstat.flesch_reading_ease) #has range (-inf; 121]
print( 'Flesh Reading stat for Abstracts:' , news_train['Abstract_Flesch_Reading_Ease'].describe(), "\n")
#print abtract of highest and lowest score
print("Easiest: ", news_train.loc[news_train['Abstract_Flesch_Reading_Ease'].idxmax()]['Abstract'], "score: ", news_train['Abstract_Flesch_Reading_Ease'].max())
print("Hardest: ", news_train.loc[news_train['Abstract_Flesch_Reading_Ease'].idxmin()]['Abstract'], "score: ", news_train['Abstract_Flesch_Reading_Ease'].min())
selection = alt.selection_multi(fields=['Category'], bind='legend')
chart = alt.Chart(news_train).mark_bar().encode(
    x=alt.X('Abstract_Flesch_Reading_Ease', bin=alt.Bin(maxbins=30), title='Flesch Reading Ease: 0 or less (difficult) - 100 (easy)'),
    y=alt.Y('count()', title='Number of Articles'),
    color=alt.condition(selection,
                         'Category:N',
                         alt.value('lightgray')),
    tooltip=['Category', 'count()']
).add_selection(
    selection
).properties(
    width=600,
    height=400,
    title='ABSTRACT Distribution of Flesch Reading Ease by Category'
)

chart.display()
print( 'Flesh Reading stat for Title:' , news_train['Title_Flesch_Reading_Ease'].describe(), "\n")
print("Easiest: ", news_train.loc[news_train['Title_Flesch_Reading_Ease'].idxmax()]['Title'], "score: ", news_train['Title_Flesch_Reading_Ease'].max())
print("Hardest: ", news_train.loc[news_train['Title_Flesch_Reading_Ease'].idxmin()]['Title'], "score: ", news_train['Title_Flesch_Reading_Ease'].min())
chart = alt.Chart(news_train).mark_bar().encode(
    x=alt.X('Title_Flesch_Reading_Ease', bin=alt.Bin(maxbins=50), title='Flesch Reading Ease: 0 or less (difficult) - 100 (easy)'),
    y=alt.Y('count()', title='Number of Articles'),
    color=alt.condition(selection,
                        'Category:N',
                        alt.value('lightgray')),
    tooltip=['Category', 'count()']
).add_selection(
    selection
).properties(
    width=600,
    height=400,
    title='TITLE Distribution of Flesch Reading Ease by Category'
)
chart.display()

Before: (51282, 11)
After: (48270, 11)
Flesh Reading stat for Abstracts: count    48270.000000
mean        64.688534
std         18.512011
min       -118.710000
25%         53.550000
50%         65.730000
75%         76.620000
max        119.190000
Name: Abstract_Flesch_Reading_Ease, dtype: float64 

Easiest:  The Kings win! The Kings win! score:  119.19
Hardest:  Giannis Antetokounmpo looked exasperated. score:  -118.71


Flesh Reading stat for Title: count    48270.000000
mean        67.385014
std         22.482539
min       -555.590000
25%         53.880000
50%         68.770000
75%         83.660000
max        120.210000
Name: Title_Flesch_Reading_Ease, dtype: float64 

Easiest:  S.N.O.T.: 11-9-2019 score:  120.21
Hardest:  southern_california_erupts_in_fire score:  -555.59


In [12]:
#Readability Consensus based upon different tests
news_train['Abstract_Readability_Consensus'] = news_train['Abstract'].apply(textstat.text_standard, float_output=False)
news_train['Abstract_Readability_Consensus'].describe()
#"""chart3 = alt.Chart(news_train).mark_bar().encode(
#    x=alt.X('Abstract_Readability_Consensus:O', title='Estimated School Grade Level'),
#    y=alt.Y('count()', title='Number of Articles'),
#    color=alt.condition(selection, 'Category:N', alt.value('lightgray')),
#    tooltip=['Abstract_Readability_Consensus', 'Category', 'count()']
#).add_selection(
#    selection
#).properties(
#    width=600,
#    height=400,
#    title='Distribution of Estimated School Grade Level for Abstracts by Category'
#)
#chart3.display()"""

count                 48270
unique                   38
top       8th and 9th grade
freq                   5702
Name: Abstract_Readability_Consensus, dtype: object

<img src="abstr_read_consensus.png" alt="Drawing" style="width: 600px;"/><vr>

In [13]:
#Reading Time
news_train['Abstract_Reading_Time'] = news_train['Abstract'].apply(lambda x: textstat.reading_time(x, ms_per_char=14.69)) 
print(news_train['Abstract_Reading_Time'].describe())
alt.data_transformers.enable('json')

#plot it
selection = alt.selection_multi(fields=['Category'], bind='legend')
chart = alt.Chart(news_train).mark_bar().encode(
    x=alt.X('Abstract_Reading_Time', bin=alt.Bin(maxbins=60), title='Reading Time'),
    y=alt.Y('count()', title='Number of Articles'),
    color=alt.condition(selection,
                         'Category:N',
                         alt.value('lightgray')),  # Use a neutral color if not selected
    tooltip=['Category', 'count()']
).add_selection(
    selection
).properties(
    width=600,
    height=400,
    title='Distribution of ABSTRACT Reading Times by Category'
)

chart.display()


count    48270.000000
mean         2.673108
std          1.883621
min          0.150000
25%          1.250000
50%          1.850000
75%          4.790000
max         31.230000
Name: Abstract_Reading_Time, dtype: float64


## <p style="text-align: center;"> Entity & Relationship</p>

**Entities (Title & Abstract):<br>**
1. Label:  The entity name in the Wikidata knowledge graph 

2. Type: general category e.g "P" = "Person", "L" = Location, "O" = Organization, "D/T" Date-time, "Pr"= Product, "E"=events etc. ...

3. WikidataId [**also 1st column of embedding file**]: unique identifier for the entity in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page),e.g. "Q80976" is the Wikidata ID for Prince Philip.

4. Confidence: range[0,1], confidence of the entity linking.

5. OccurrenceOffsets: character positions in the text.

6. SurfaceForms: The raw entity names in the original text

In [14]:
entity_df = news_train['Title Entities'].apply(lambda x: json.loads(x) if type(x) == str else []) #pd series
normalized_df = pd.DataFrame()
for index, entities in entity_df.items(): 
    if entities:  
        temp_df = pd.DataFrame(entities)
        temp_df['news_index'] = index
        normalized_df = pd.concat([normalized_df, temp_df], ignore_index=True)
normalized_df

Unnamed: 0,Label,Type,WikidataId,Confidence,OccurrenceOffsets,SurfaceForms,news_index
0,"Prince Philip, Duke of Edinburgh",P,Q80976,1.000,[48],[Prince Philip],0
1,"Charles, Prince of Wales",P,Q43274,1.000,[28],[Prince Charles],0
2,Elizabeth II,P,Q9682,0.970,[11],[Queen Elizabeth],0
3,Adipose tissue,C,Q193583,1.000,[20],[Belly Fat],1
4,Skin tag,C,Q3179593,1.000,[18],[Skin Tags],4
...,...,...,...,...,...,...,...
56402,First Coast,G,Q21855507,1.000,[6],[Northeast Florida],51276
56403,United States,G,Q30,0.965,[38],[U.S.],51276
56404,Woolsey Fire,N,Q58445227,1.000,[53],[Woolsey Fire],51277
56405,MLS Cup,U,Q577698,0.963,[21],[MLS Cup],51280


In [15]:
temp_df = normalized_df.applymap(lambda x: str(x) if isinstance(x, list) else x)
print(temp_df.nunique())
print("\n Distribution of:",temp_df['Type'].value_counts())
print("\n Mean Confidence",temp_df['Confidence'].mean())

Label                14517
Type                    22
WikidataId           14514
Confidence             100
OccurrenceOffsets      443
SurfaceForms         17346
news_index           35244
dtype: int64

 Distribution of: Type
O    14613
P    14430
G    11977
U     3571
C     3075
N     1633
F     1559
W      890
S      808
E      671
M      592
H      498
B      497
L      474
V      456
J      272
K      155
Y      109
R       74
Q       18
I       18
A       17
Name: count, dtype: int64

 Mean Confidence 0.9928903150318223


In [16]:
entity_df = news_train['Abstract Entities'].apply(lambda x: json.loads(x) if type(x) == str else []) #pd series
normalized_df = pd.DataFrame()
for index, entities in entity_df.items(): 
    if entities:  
        temp_df = pd.DataFrame(entities)
        temp_df['news_index'] = index
        normalized_df = pd.concat([normalized_df, temp_df], ignore_index=True)
normalized_df

Unnamed: 0,Label,Type,WikidataId,Confidence,OccurrenceOffsets,SurfaceForms,news_index
0,Adipose tissue,C,Q193583,1.000,[97],[belly fat],1
1,Ukraine,G,Q212,0.946,[87],[Ukraine],2
2,National Basketball Association,O,Q155223,1.000,[40],[NBA],3
3,Skin tag,C,Q3179593,1.000,[105],[Skin Tags],4
4,Dermatology,C,Q171171,1.000,[131],[Dermatologist],4
...,...,...,...,...,...,...,...
95269,United States women's national soccer team,O,Q334526,1.000,"[9, 258]","[U.S. women's national soccer team, U.S. women...",51276
95270,TIAA Bank Field,N,Q635117,1.000,[135],[TIAA Bank Field],51276
95271,"Jacksonville, Florida",G,Q16568,1.000,[54],[Jacksonville],51276
95272,Costa Rica,G,Q800,0.991,"[159, 341]","[Costa Rica, Costa Rica]",51276


In [17]:
temp_df = normalized_df.applymap(lambda x: str(x) if isinstance(x, list) else x)
print(temp_df.nunique())
print("\n Distribution of:",temp_df['Type'].value_counts())
print("\n Mean Confidence",temp_df['Confidence'].mean())

Label                23821
Type                    22
WikidataId           23814
Confidence             100
OccurrenceOffsets    11172
SurfaceForms         33951
news_index           37421
dtype: int64

 Distribution of: Type
G    24560
O    22893
P    19539
U     6245
C     3377
M     3264
F     3188
N     2870
S     2352
W     1400
E     1035
L     1028
B      957
H      624
K      624
V      519
J      369
Y      219
R      112
Q       42
A       34
I       23
Name: count, dtype: int64

 Mean Confidence 0.9931611772361819


In [18]:
print("-"*200, "\n entity_train embedding shape: ", entity_train.shape)
display(entity_train.head())
print("-"*200, "\n relation_train embedding shape: ", relation_train.shape)
display(relation_train.head()) #not fetchable

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
 entity_train embedding shape:  (26904, 102)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,Q41,-0.063388,-0.181451,0.057501,-0.091254,-0.076217,-0.052525,0.0505,-0.224871,-0.018145,...,0.001861,0.124535,-0.151043,-0.263698,-0.103607,0.020007,-0.101157,-0.091567,0.035234,
1,Q1860,0.060958,0.069934,0.015832,0.079471,-0.023362,-0.125007,-0.043618,0.134063,-0.121691,...,-0.014287,0.013578,0.099977,0.012199,-0.141138,0.056129,-0.133727,0.025795,0.051448,
2,Q39631,-0.093106,-0.052002,0.020556,-0.020801,0.04318,-0.072321,0.00091,0.028156,0.176303,...,-0.08684,-0.078992,-0.062712,0.051117,-0.184307,0.127637,-0.144866,0.04469,0.013498,
3,Q30,-0.115737,-0.179113,0.102739,-0.112469,-0.101853,-0.177516,0.01586,-0.092626,0.086708,...,0.080511,-8.5e-05,-0.089968,-0.083486,-0.149992,-0.053031,-0.136071,-0.029001,0.174155,
4,Q60,-0.051036,-0.165637,0.132802,-0.089949,-0.146637,-0.142246,0.103853,-0.129651,0.096265,...,0.078628,0.003711,-0.058953,-0.154067,-0.117159,-0.031614,-0.140451,0.001288,0.14035,


-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
 relation_train embedding shape:  (1091, 102)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,P31,-0.073467,-0.132227,0.034173,-0.032769,0.008289,-0.107088,-0.031712,-0.039581,0.101882,...,-0.050068,-0.053686,-0.045389,-0.037017,0.11719,-0.063597,-0.05691,0.058387,-0.114056,
1,P21,-0.078436,0.108589,-0.049429,-0.131355,0.0493,-0.094605,-0.101469,0.127802,-0.081245,...,0.074341,-0.030571,-0.137183,0.045598,-0.151155,-0.066223,0.057489,0.130188,-0.054801,
2,P106,-0.052137,0.052444,-0.019886,-0.152309,0.014144,-0.180491,-0.132198,0.063082,0.085229,...,-0.058958,-0.032021,-0.147213,0.082776,-0.169705,0.122445,-0.054737,0.055321,0.070961,
3,P735,-0.051398,0.056219,0.068029,-0.137717,-0.03005,0.061566,-0.103184,-0.074124,-0.118975,...,-0.092234,0.05687,0.01364,0.042696,0.013683,-0.021127,-0.189257,0.055315,0.101863,
4,P108,0.091231,0.022526,0.059349,-0.141853,0.035025,-0.11104,-0.127337,0.047645,-0.172328,...,-0.046994,-0.056248,-0.146538,0.121375,-0.211757,0.077591,-0.0022,-0.05388,0.140873,


# Behaviours

In [35]:
behavior_train.head(3)

Unnamed: 0,Impression ID,User ID,Time,History,Impressions
0,1,U13740,11/11/2019 9:05:58 AM,,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...


In [20]:
print(pd.DataFrame({'Train': behavior_train.nunique(), 'Test': behavior_test.nunique()}))
#impression = n rows

                Train   Test
Impression ID  156965  73152
User ID         50000  50000
Time           125501  40320
History         48868  48353
Impressions    143617  67949


In [21]:
#missing values
print(pd.DataFrame({'Train': behavior_train.isna().sum(), 'Test': behavior_test.isna().sum()}))

               Train  Test
Impression ID      0     0
User ID            0     0
Time               0     0
History         3238  2214
Impressions        0     0


In [37]:
behavior_train['History'] = behavior_train['History'].astype(str).str.split()

In [41]:
exploded_df = behavior_train.explode('History', ignore_index=True)[['User ID', 'History']]
exploded_df.rename(columns={'History': 'News ID'}, inplace=True)
merged_df = exploded_df.merge(news_train[['News ID', 'Category', 'SubCategory']], on='News ID', how='left')
merged_df

Unnamed: 0,User ID,News ID,Category,SubCategory
0,U13740,N55189,tv,tvnews
1,U13740,N42782,sports,baseball_mlb
2,U13740,N34694,tv,tvnews
3,U13740,N45794,news,newscrime
4,U13740,N18445,sports,football_ncaa
...,...,...,...,...
5110872,U44625,N43083,lifestyle,lifestylehomeandgarden
5110873,U44625,N9288,news,newsworld
5110874,U44625,N37863,news,newsus
5110875,U64800,N22997,news,newscrime


In [65]:
clicks_per_user = merged_df.groupby('User ID').size().reset_index(name='Number of Clicks')
clicks_distribution_chart = alt.Chart(clicks_per_user).mark_bar().encode(
    x=alt.X('Number of Clicks:Q', bin=alt.Bin(maxbins=500), title='Number of Clicks per User', scale=alt.Scale(domain=[0, 4800])),
    y=alt.Y('count()', title='Number of Users'),
    tooltip=[alt.Tooltip('count()', title='Number of Users'), alt.Tooltip('Number of Clicks', title='Clicks per User')]
).properties(
    height=400,
    width=600,
    title='Distribution of Clicks per User (0-4800 Range)'
)

clicks_distribution_chart.display()

#mean click per users
mean_clicks = clicks_per_user['Number of Clicks'].mean()
print(f"Mean number of clicks per user: {mean_clicks:.2f}")



Mean number of clicks per user: 102.22


In [47]:
click_distribution = merged_df['Category'].value_counts().reset_index()
click_distribution.columns = ['Category', 'Clicks']
chart = alt.Chart(click_distribution).mark_bar().encode(
    x='Category',
    y='Clicks',
    color='Category'
).properties(
    title='Click Distribution by Category'
)
chart

In [49]:
click = alt.selection_multi(fields=['Category'])
category_chart = alt.Chart(merged_df).mark_bar().encode(
    x=alt.X('Category:N', sort='-y'),
    y=alt.Y('count():Q', title='Number of Clicks'),
    color='Category:N',
    tooltip=['Category:N', 'count()'],
    opacity=alt.condition(click, alt.value(1), alt.value(0.2))
).add_selection(
    click
).properties(
    width=600,
    height=300,
    title='Category Distribution'
)
subcategory_chart = alt.Chart(merged_df).transform_filter(
    click
).mark_bar().encode(
    x=alt.X('count():Q', title='Number of Clicks'),
    y=alt.Y('SubCategory:N', sort='-x'),
    color='Category:N',
    tooltip=['SubCategory:N', 'count()']
).properties(
    width=600,
    height=300,
    title='Subcategory Distribution'
)
alt.vconcat(category_chart, subcategory_chart).configure_concat(spacing=30)
