In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
news_header = ["news_id", "category", "sub_category","title","abstract", "url", "title_entities", "abstract_entities"]

news = pd.read_csv("news.tsv", sep="\t", names=news_header)

In [3]:
news_df = pd.DataFrame(news.values,index=news["news_id"].values, columns=news.columns.values).drop(['news_id', 'title_entities','abstract_entities', 'abstract'], axis=1)

In [4]:
behaviors = pd.read_csv("titlesbehaviours.csv")

In [5]:
unique_users = behaviors.drop_duplicates(subset = "user_id").sample(frac = 0.001)
unique_users = unique_users[unique_users['history'].notna()]

In [7]:
joined = unique_users.merge(news, how='cross').drop(["category","sub_category","url","title_entities","abstract_entities", "abstract", "Unnamed: 0"], axis=1)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from ast import literal_eval

vectorizer = TfidfVectorizer()

def score(row):
    if row["news_id"] in row["history"]:
        return 1
    X = cosine_similarity(vectorizer.fit_transform(literal_eval(row["history_titles"])+[row["title"]]).toarray())[-1]
    return sum(X[:-1])/(len(X)-1)

joined["score"] = joined.apply(score, axis=1)

In [21]:
R_df = joined.pivot_table(index = 'user_id', columns ='news_id', values = 'score')

In [23]:
model = NMF(n_components=40, init='nndsvda')

W = model.fit_transform(R_df)
H = model.components_ 



In [24]:
new = np.matmul(W,H)

In [25]:
df = pd.DataFrame(new, index=R_df.index.values, columns = R_df.columns.values)

In [36]:
count = 0
for i in df.iloc[1]:
    if i>0.9:
        count+=1
        print(i)
print(count)

0.985323511627956
0.9858215358451384
1.019123557120798
0.9863329816308655
0.9813040046252405
0.9815411557895473
0.9842928082912322
1.0374768393928888
1.0701464376870435
0.9744646339985353
0.9721259187522645
0.9846980713853494
0.9594499778579965
0.9802402013224325
0.9831042069743203
0.9814746020392123
0.9813291724152713
0.9690092541106888
0.9706756626743713
0.984877972357674
0.9856420158342816
0.987793201054498
0.9835514293143894
0.9813164371278404
0.9862941568637544
0.9847569046765451
0.9031144996167524
0.9830059399832497
0.9814477635943032
0.9830578156803341
0.985698542942566
1.0481912153322048
1.0705355936823966
0.9814081583091219
0.9535780156427793
0.9812902206411931
0.9811857646701238
0.9811605945634572
38


In [34]:
df

Unnamed: 0,N10,N100,N1000,N10000,N10001,N10002,N10003,N10004,N10005,N10007,...,N9989,N999,N9990,N9991,N9992,N9993,N9994,N9997,N9998,N9999
U10297,8e-05,0.001227,0.005738,0.038044,0.018259,0.009623,0.003292,0.006606,0.00108,0.003321,...,0.012481,0.002024,0.00132,0.005303,0.010684,0.017031,0.031,0.007299,0.028685,0.005503
U11354,0.007558,0.013649,0.007945,0.011557,0.00795,0.009593,0.016577,0.008821,0.004081,0.014204,...,0.013545,0.010608,0.039436,0.022455,0.031621,0.02888,0.015298,0.031477,0.017977,0.020167
U14021,0.029828,0.009378,0.009114,0.060257,0.023661,0.015569,0.010441,0.010534,0.001379,0.010802,...,0.022268,0.006695,0.003301,0.007997,0.018687,0.027895,0.047993,0.010423,0.044937,0.010224
U15119,0.462815,0.044377,0.018839,0.056332,0.032392,0.0239,0.001611,0.014932,0.008487,0.001442,...,0.001394,0.003297,0.017988,0.001633,0.010164,0.002993,0.064878,4.5e-05,0.039098,3.9e-05
U19360,0.007851,0.009753,0.003587,0.033011,0.050639,0.011312,0.000438,0.003615,0.014487,0.000724,...,0.000665,0.004407,0.002401,0.010186,0.00188,0.012209,0.032557,0.001145,0.009631,0.001082
U21940,0.009359,0.015049,0.008034,0.020251,0.021629,0.010162,0.014918,0.006931,0.016526,0.009234,...,0.017853,0.01275,0.012933,0.013185,0.015581,0.019577,0.013372,0.006472,0.02051,0.005949
U22899,0.997311,0.99893,0.006756,0.02224,0.018087,0.01921,0.010664,0.007769,0.019563,0.014701,...,0.009105,0.02076,0.011323,0.010638,0.016256,0.022539,0.020336,0.007949,0.017833,0.004908
U25217,0.010502,0.009497,0.005133,0.03211,0.011219,0.010151,0.010049,0.005843,0.006279,0.012731,...,0.012046,0.022435,0.011446,0.01023,0.016242,0.027567,0.016962,0.000633,0.019252,0.00059
U26186,0.999998,0.004642,0.008091,0.050215,0.034501,0.011893,0.005625,0.006466,0.006027,0.00709,...,0.004838,0.012334,0.015356,0.014561,0.009485,0.019283,0.032392,6.5e-05,0.028911,0.000122
U26471,0.009221,0.021727,7.5e-05,0.040974,0.03407,0.016392,0.012601,8.2e-05,0.005771,0.011186,...,0.010813,0.006686,0.012114,0.029388,0.023112,0.042044,0.026406,0.000204,0.02543,0.000242
