# Import libraries

In [67]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
from random import sample
import datetime
import pandasql as ps

# Load in data

In [68]:
# Loading in behaviour data
behaviour_data = pd.read_csv('train/behaviors.tsv', delimiter="\t", header=None)

# Change the names of the columns
behaviour_data.columns = ['Impression_id', 'User_id', 'Time', 'History', 'Impression']

#  Convert string to list for History and Impression column 
behaviour_data.History = behaviour_data.History.map(lambda x: x.split(' ') if isinstance(x, str)  else x)
behaviour_data.Impression = behaviour_data.Impression.map(lambda x: x.split(' ') if isinstance(x, str)  else x)
behaviour_data

Unnamed: 0,Impression_id,User_id,Time,History,Impression
0,1,U13740,11/11/2019 9:05:58 AM,"[N55189, N42782, N34694, N45794, N18445, N6330...","[N55689-1, N35729-0]"
1,2,U91836,11/12/2019 6:11:30 PM,"[N31739, N6072, N63045, N23979, N35656, N43353...","[N20678-0, N39317-0, N58114-0, N20495-0, N4297..."
2,3,U73700,11/14/2019 7:01:48 AM,"[N10732, N25792, N7563, N21087, N41087, N5445,...","[N50014-0, N23877-0, N35389-0, N49712-0, N1684..."
3,4,U34670,11/11/2019 5:28:05 AM,"[N45729, N2203, N871, N53880, N41375, N43142, ...","[N35729-0, N33632-0, N49685-1, N27581-0]"
4,5,U8125,11/12/2019 4:11:21 PM,"[N10078, N56514, N14904, N33740]","[N39985-0, N36050-0, N16096-0, N8400-1, N22407..."
...,...,...,...,...,...
156960,156961,U21593,11/14/2019 10:24:05 PM,"[N7432, N58559, N1954, N43353, N14343, N13008,...","[N2235-0, N22975-0, N64037-0, N47652-0, N11378..."
156961,156962,U10123,11/13/2019 6:57:04 AM,"[N9803, N104, N24462, N57318, N55743, N40526, ...","[N3841-0, N61571-0, N58813-0, N28213-0, N4428-..."
156962,156963,U75630,11/14/2019 10:58:13 AM,"[N29898, N59704, N4408, N9803, N53644, N26103,...","[N55913-0, N62318-0, N53515-0, N10960-0, N9135..."
156963,156964,U44625,11/13/2019 2:57:02 PM,"[N4118, N47297, N3164, N43295, N6056, N38747, ...","[N6219-0, N3663-0, N31147-0, N58363-0, N4107-0..."


In [69]:
# Loading in news data
news_data = pd.read_csv('train/news.tsv', delimiter="\t", header=None)

# Change column names 
news_data.columns = ['News_id', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title_entities', 'Abstract_entities']

#Drop unnecessary columns
news_data = news_data.drop(['URL', 'Title_entities', 'Abstract_entities'], axis = 1)
news_data

Unnamed: 0,News_id,Category,SubCategory,Title,Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."
...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b..."
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ..."


## Subset data

Because the dataset is so big, we need to subset the data to decrease runtime

In [70]:
# Get 1/4 of the data 
sub_behaviour_data = behaviour_data.loc[1:(len(behaviour_data)/4),]
sub_behaviour_data

Unnamed: 0,Impression_id,User_id,Time,History,Impression
1,2,U91836,11/12/2019 6:11:30 PM,"[N31739, N6072, N63045, N23979, N35656, N43353...","[N20678-0, N39317-0, N58114-0, N20495-0, N4297..."
2,3,U73700,11/14/2019 7:01:48 AM,"[N10732, N25792, N7563, N21087, N41087, N5445,...","[N50014-0, N23877-0, N35389-0, N49712-0, N1684..."
3,4,U34670,11/11/2019 5:28:05 AM,"[N45729, N2203, N871, N53880, N41375, N43142, ...","[N35729-0, N33632-0, N49685-1, N27581-0]"
4,5,U8125,11/12/2019 4:11:21 PM,"[N10078, N56514, N14904, N33740]","[N39985-0, N36050-0, N16096-0, N8400-1, N22407..."
5,6,U19739,11/11/2019 6:52:13 PM,"[N39074, N14343, N32607, N32320, N22007, N442,...","[N21119-1, N53696-0, N33619-1, N25722-0, N2869-0]"
...,...,...,...,...,...
39237,39238,U76081,11/10/2019 12:49:55 PM,"[N39074, N18109, N59742, N47685, N41373, N3351...","[N64513-0, N57614-0, N61185-0, N61022-0, N1452..."
39238,39239,U66612,11/13/2019 7:27:30 AM,"[N54496, N51706, N4020, N18285, N4764, N19280,...","[N42143-0, N19444-0, N41122-0, N59673-0, N4870..."
39239,39240,U9402,11/13/2019 7:46:03 AM,"[N50566, N46039, N25792, N36602, N46987, N2600...","[N36659-0, N10812-0, N64632-0, N5364-0, N28213..."
39240,39241,U20770,11/14/2019 3:05:59 AM,"[N55846, N21383, N43369, N33617, N28296, N3399...","[N48017-0, N3167-0, N40109-0, N40559-0, N16148..."


## Get clicked article


In [103]:
# === 
# Function to get articles that are clicked  from the impression column 
# As 0 means not clicked and 1 means clicked, we need to get the articles with '1' at the end of the id 
#We split the id by '-' so we get id and the number, if the number is 1, it means it is clicked anddwe return the id
#Input: list of articles 
#Output: list of clicked articles
# ===

def get_clicked_article(row):
    clicked_list = []
    for article in row: 
        splitted = article.split('-')
        if splitted[1] == '1':
            clicked_list.append(splitted[0])
    return clicked_list

In [73]:
# Apply the function on the impression column 
sub_behaviour_data.loc[:, 'Impression'] = sub_behaviour_data.loc[:, 'Impression'].map(lambda row: get_clicked_article(row))
sub_behaviour_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,Impression_id,User_id,Time,History,Impression
1,2,U91836,11/12/2019 6:11:30 PM,"[N31739, N6072, N63045, N23979, N35656, N43353...",[N17059]
2,3,U73700,11/14/2019 7:01:48 AM,"[N10732, N25792, N7563, N21087, N41087, N5445,...",[N23814]
3,4,U34670,11/11/2019 5:28:05 AM,"[N45729, N2203, N871, N53880, N41375, N43142, ...",[N49685]
4,5,U8125,11/12/2019 4:11:21 PM,"[N10078, N56514, N14904, N33740]",[N8400]
5,6,U19739,11/11/2019 6:52:13 PM,"[N39074, N14343, N32607, N32320, N22007, N442,...","[N21119, N33619]"
...,...,...,...,...,...
39237,39238,U76081,11/10/2019 12:49:55 PM,"[N39074, N18109, N59742, N47685, N41373, N3351...",[N26706]
39238,39239,U66612,11/13/2019 7:27:30 AM,"[N54496, N51706, N4020, N18285, N4764, N19280,...",[N27437]
39239,39240,U9402,11/13/2019 7:46:03 AM,"[N50566, N46039, N25792, N36602, N46987, N2600...","[N31448, N36624]"
39240,39241,U20770,11/14/2019 3:05:59 AM,"[N55846, N21383, N43369, N33617, N28296, N3399...",[N23877]


# 4) Trending videos: Recommendation based on latest most viewed news

## Order data by datetime 

In [74]:
# ===
# We don't need the seconds. Furthermore,  it disrupts the sorting somehow so we delete it
# Input: string time 
# Output string time (cleaned)
# ===
def get_date(time):
    splitted_time = time.split(' ')
    return splitted_time[0]

In [75]:
# Apply function on time column 
sub_behaviour_data.Time = sub_behaviour_data.Time.apply(lambda time: get_date(time))
sub_behaviour_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,Impression_id,User_id,Time,History,Impression
1,2,U91836,11/12/2019,"[N31739, N6072, N63045, N23979, N35656, N43353...",[N17059]
2,3,U73700,11/14/2019,"[N10732, N25792, N7563, N21087, N41087, N5445,...",[N23814]
3,4,U34670,11/11/2019,"[N45729, N2203, N871, N53880, N41375, N43142, ...",[N49685]
4,5,U8125,11/12/2019,"[N10078, N56514, N14904, N33740]",[N8400]
5,6,U19739,11/11/2019,"[N39074, N14343, N32607, N32320, N22007, N442,...","[N21119, N33619]"
...,...,...,...,...,...
39237,39238,U76081,11/10/2019,"[N39074, N18109, N59742, N47685, N41373, N3351...",[N26706]
39238,39239,U66612,11/13/2019,"[N54496, N51706, N4020, N18285, N4764, N19280,...",[N27437]
39239,39240,U9402,11/13/2019,"[N50566, N46039, N25792, N36602, N46987, N2600...","[N31448, N36624]"
39240,39241,U20770,11/14/2019,"[N55846, N21383, N43369, N33617, N28296, N3399...",[N23877]


In [105]:
# Convert time to a datetime type 
sub_behaviour_data.Time = pd.to_datetime(sub_behaviour_data.Time)

# Sort the dataframe based on time, the latest clicked articles are on top
sorted_behaviour_data = sub_behaviour_data.sort_values(by='Time', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [77]:
sorted_behaviour_data

Unnamed: 0,Impression_id,User_id,Time,History,Impression
19621,19622,U15038,2019-11-14,"[N10625, N36751, N9803, N56776, N48532, N36424]",[N40109]
6589,6590,U28449,2019-11-14,"[N13537, N57024, N5310, N49914, N26618, N43945...",[N16311]
14401,14402,U63564,2019-11-14,"[N53074, N56447, N28081, N35022, N30822, N6439...",[N19661]
24822,24823,U53994,2019-11-14,"[N12900, N29499, N26122, N42620, N8073, N8148]",[N45734]
24823,24824,U35169,2019-11-14,"[N26085, N53388, N22161, N38585, N35703, N5917...","[N6837, N34185]"
...,...,...,...,...,...
33235,33236,U55283,2019-11-09,"[N56948, N39556, N55829, N17587, N10128, N3173...","[N41178, N25764]"
2925,2926,U14946,2019-11-09,"[N54896, N28108, N44090, N50306, N61864, N5570...",[N3491]
24816,24817,U55084,2019-11-09,"[N20263, N22279, N62496, N22913, N48233, N1150...",[N41881]
22376,22377,U70938,2019-11-09,"[N47106, N47121, N4434, N28550, N30450, N59984...",[N44679]


## Get the latest news 

In [108]:
# Split data based on latest date (Assuming latest date is present time)
# Latest date = latest news
latest_behaviour_data = sorted_behaviour_data[sorted_behaviour_data.Time > '2019-11-13']
latest_behaviour_data

Unnamed: 0,Impression_id,User_id,Time,History,Impression
19621,19622,U15038,2019-11-14,"[N10625, N36751, N9803, N56776, N48532, N36424]",[N40109]
6589,6590,U28449,2019-11-14,"[N13537, N57024, N5310, N49914, N26618, N43945...",[N16311]
14401,14402,U63564,2019-11-14,"[N53074, N56447, N28081, N35022, N30822, N6439...",[N19661]
24822,24823,U53994,2019-11-14,"[N12900, N29499, N26122, N42620, N8073, N8148]",[N45734]
24823,24824,U35169,2019-11-14,"[N26085, N53388, N22161, N38585, N35703, N5917...","[N6837, N34185]"
...,...,...,...,...,...
35673,35674,U67250,2019-11-14,"[N56253, N4304, N871, N24421, N12907, N35657, ...","[N23446, N47098, N47572]"
22547,22548,U91388,2019-11-14,"[N3500, N56586, N9120, N37811, N19495, N18870]",[N48487]
19276,19277,U15502,2019-11-14,"[N21815, N57886, N7242, N33767, N7242, N14761,...","[N41432, N41934, N30290]"
35641,35642,U61848,2019-11-14,"[N35009, N42989, N55846, N55189, N21623, N2111...",[N64095]


## Get the clicks per news

In [79]:
# Count how many times every news article is clicked
clicked_count = latest_behaviour_data.Impression.apply(lambda x: pd.Series(x).value_counts()).sum()
clicked_count = clicked_count.to_frame()
clicked_count.columns = ['count']
clicked_count['id'] = clicked_count.index
clicked_count

Unnamed: 0,count,id
N40109,143.0,N40109
N16311,2.0,N16311
N19661,222.0,N19661
N45734,18.0,N45734
N34185,193.0,N34185
...,...,...
N21375,1.0,N21375
N38381,1.0,N38381
N32158,1.0,N32158
N10363,1.0,N10363


In [80]:
# Link the clicked count dataframe to the news_data dataframe by inner joining it: 
# This way we are able to get the count column in the news_data
link_count_query = 'select news_data.*, clicked_count.count FROM news_data INNER JOIN clicked_count ON clicked_count.id = news_data.News_id'
counted_news_data = ps.sqldf(link_count_query, locals())
counted_news_data

Unnamed: 0,News_id,Category,SubCategory,Title,Abstract,count
0,N41387,tv,tv-gallery,Can you answer these real Jeopardy questions a...,"Culling data straight from the ""Jeopardy!"" arc...",59.0
1,N30290,foodanddrink,restaurantsandnews,The Real Reason McDonald's Keeps the Filet-O-F...,It's something of an underdog story.,41.0
2,N27063,lifestyle,lifestyledidyouknow,15 Abandoned Places That You Can See Only in P...,Long before the advent of the internet and Goo...,1.0
3,N4021,lifestyle,lifestyledidyouknow,9 Amazing Transgender Women Who Changed History,,90.0
4,N61483,lifestyle,lifestyleparenting,12 Photos Show What It's Really Like To Grow U...,,26.0
...,...,...,...,...,...,...
1233,N32774,lifestyle,shop-holidays,25 Stocking Stuffers For $25 or Less That Anyo...,You can fill up their stockings without breaki...,2.0
1234,N19117,movies,movienews,Clint Eastwood Isn't Going to Stop Working for...,Smoke? Flames? In this deadline-driven economy?,1.0
1235,N17031,tv,tvvideos,Lamar Odom is Engaged to Girlfriend Sabrina Pa...,The former basketball star revealed the news o...,11.0
1236,N7618,autos,autosnews,Ford v Ferrari: the real story,The film about the epic Le Mans rivalry promis...,33.0


## Get top trending news from a variety of categories



In [81]:
# === 
# Function to get recently most clicked news 
# We check the latest trending news based on the corresponding category 
# Example: if you search for top news finance, you will get the most clicked finance news
# Input: String category 
# Output: dataframe news_data 
# ===
def get_top_news(topic):
    counted_cat_data = counted_news_data[counted_news_data.Category == topic]
    counted_cat_data = counted_cat_data.sort_values(by='count', ascending=False)
    counted_cat_data['count_percentage'] = round(counted_cat_data.loc[:, 'count'] / sum(counted_cat_data['count'].values), 2)
    return counted_cat_data.head(10)

### News category

In [82]:
news = get_top_news('news')
news

Unnamed: 0,News_id,Category,SubCategory,Title,Abstract,count,count_percentage
1101,N38779,news,newsus,'One in a million' deer captured on camera in ...,Retired Michigan lawmaker and amateur photogra...,355.0,0.12
1174,N19661,news,newscrime,"Judge calls USC dad a 'thief,' gives longest p...","Toby MacFarlane, a California real estate exec...",222.0,0.07
619,N56211,news,newsus,University of Florida student president faces ...,Trump Jr. and his girlfriend Kimberly Guilfoyl...,158.0,0.05
252,N23877,news,newscrime,Car of Marine linked to Virginia slaying is fo...,A car belonging to a U.S. Marine who allegedly...,149.0,0.05
634,N48017,news,newsscienceandtechnology,Experts crack mystery of ancient Egypt's sacre...,DNA analysis helps work out origin of nearly 6...,114.0,0.04
939,N49712,news,newsgoodnews,Wisconsin bus driver saves children wandering ...,As bitter cold took hold across the Midwest ea...,106.0,0.04
170,N64174,news,newscrime,Georgia executes man convicted of killing conv...,Georgia executes man convicted of killing conv...,87.0,0.03
339,N6578,news,newsworld,Small Canadian island irate that Americans kee...,The population of a tiny Canadian island off t...,80.0,0.03
664,N16439,news,newsworld,American woman apparently strangled in the Dom...,Colleagues say Patricia Anton brought joy and ...,62.0,0.02
541,N32641,news,newsworld,U.S. Vows to Defend South Korea With Full Mili...,North Korea has lodged multiple protests over ...,62.0,0.02


In [83]:
sports = get_top_news('sports')
sports

Unnamed: 0,News_id,Category,SubCategory,Title,Abstract,count,count_percentage
156,N47652,sports,football_nfl,Richard Sherman tells NFL players to save thei...,,108.0,0.11
642,N51570,sports,basketball_nba,Hernandez: Anthony Davis' impact on LeBron Jam...,Never was Anthony Davis' value to the Lakers a...,86.0,0.08
895,N61740,sports,football_nfl,Browns cut WR Antonio Callaway hours before ki...,,35.0,0.03
184,N20676,sports,football_nfl,Six pretenders entering the 2019 NFL home stretch,Thanksgiving is just around the corner and the...,32.0,0.03
530,N60750,sports,football_nfl,"Browns, Steelers brawl at end of Cleveland's 2...",Cleveland defense end Myles Garrett ripped off...,29.0,0.03
270,N36003,sports,baseball_mlb,"Report: Red Sox manager Alex Cora, Mets' Carlo...","On Tuesday, The Athletic reported that the Hou...",27.0,0.03
588,N7494,sports,football_nfl,Insiders predict: NFL Week 11 winners,Power dynamics have been called into question ...,26.0,0.03
309,N57818,sports,baseball_mlb,"Sign-stealing penalties could be ""unlike anyth...",A new report suggests MLB could bring the hamm...,24.0,0.02
595,N59685,sports,football_nfl,Ranking the 11 most disappointing NFL teams of...,,23.0,0.02
225,N61787,sports,basketball_nba,Russell Westbrook takes shot at Patrick Beverley,Wednesday night's Houston Rockets-LA Clippers ...,22.0,0.02


### Weather

In [84]:
weather = get_top_news('weather')
weather

Unnamed: 0,News_id,Category,SubCategory,Title,Abstract,count,count_percentage
797,N61233,weather,weathertopstories,Nor'easter to grind at US East Coast this weekend,"High winds, heavy surf and drenching rain will...",198.0,0.54
656,N40109,weather,weathertopstories,This Arctic blast is in its final day. But the...,The hundreds of millions of people gripped by ...,143.0,0.39
904,N56214,weather,weathertopstories,Deadly Arctic blast breaks records set more th...,It was literally freezing in Florida and Alaba...,10.0,0.03
29,N16311,weather,weathertopstories,U.S. cities with the dirtiest air,Stacker researched and ranked the 50 American ...,2.0,0.01
396,N53386,weather,weathertopstories,"Miami Weather: Cloudy, Wet Weather Will Contin...",South Florida got off to soggy start Thursday ...,2.0,0.01
821,N47938,weather,weathertopstories,Long Range Winter Forecast For Metro Detroit: ...,"Pouring over maps and charts, Accuweather's Pa...",2.0,0.01
213,N3216,weather,weathertopstories,Researchers comb Warren County ranch in search...,"WARREN COUNTY, Mo. (KMOV.com) -- The search is...",1.0,0.0
245,N3116,weather,weathertopstories,"50th anniversary of Apollo 12, the Moon missio...",Nov. 14 marks the 50th anniversary of Apollo 1...,1.0,0.0
263,N9887,weather,weathertopstories,'Worst that can happen is I get killed': Local...,"FRANKLIN, Ind. -- A horrific work accident lef...",1.0,0.0
286,N16806,weather,weathertopstories,White pelicans migrating south for the winter,These white pelicans were spotted near St. Pet...,1.0,0.0


### Health

In [85]:
health = get_top_news('health')
health

Unnamed: 0,News_id,Category,SubCategory,Title,Abstract,count,count_percentage
434,N35815,health,health-news,"LASIK eye surgery should be taken off market, ...",An estimated 20 million Americans have undergo...,94.0,0.17
1054,N37204,health,health-news,"Model, 23, Didn't Know She Was Pregnant Until ...",An Australian model recently had a life-changi...,61.0,0.11
982,N42670,health,health-news,Cannabis Use Disorder is Rising in U.S. States...,Cannabis use disorder is where a person's use ...,53.0,0.1
190,N28072,health,health-news,He was a member of a boy band in the 90s. Now ...,Just when he attained everything he had ever w...,41.0,0.07
386,N55949,health,health-news,Marcia Cross' anal cancer may have been linked...,Marcia Cross said that getting an HPV-related ...,40.0,0.07
1142,N20576,health,ads-lung-health,Surgeon Shocked After Doing Double Lung Transp...,The surgeon who led the team that performed th...,19.0,0.03
46,N12627,health,fitness,6 Reasons to Run On the Treadmill Instead of O...,Two words: heat and humidity.,18.0,0.03
678,N48416,health,voices,"When I Speak Out Against Fat Shaming, I'm Told...",Weight loss isn't a solution to pervasive soci...,15.0,0.03
814,N23089,health,health-news,"Between 1,000 to 2,000 people get the plague e...",The plague is fairly uncommon but not eradicat...,12.0,0.02
393,N30454,health,medical,11 Symptoms of Shingles You Might Be Ignoring,"Symptoms of shingles sometimes, but not always...",12.0,0.02


### Finance

In [86]:
finance = get_top_news('finance')
finance

Unnamed: 0,News_id,Category,SubCategory,Title,Abstract,count,count_percentage
224,N6477,finance,finance-companies,Walmart releases Black Friday ad with $129 App...,Walmart's Black Friday 2019 ad includes a 40-i...,174.0,0.19
762,N36226,finance,finance-retirement,The 1 reason you shouldn't hesitate to claim S...,Seniors are often told to wait on Social Secur...,114.0,0.12
1193,N8015,finance,markets,Powell's Warning to Congress About the Next Re...,The Fed will need help stimulating the economy...,72.0,0.08
657,N47572,finance,markets,A Trump Tax Break To Help The Poor Went To a R...,The Trump tax law gave governors the authority...,42.0,0.05
271,N21077,finance,finance-companies,Sears is laying off hundreds of corporate empl...,Sears gathered employees in a room at a San Fr...,32.0,0.03
766,N56711,finance,finance-companies,"American Outdoor to split into two firms, sepa...",American Outdoor Brands Corp. said it will spl...,27.0,0.03
784,N41717,finance,personalfinance,Olympic swimmer Ryan Lochte went from earning ...,After two suspensions and losing multiple spon...,25.0,0.03
556,N4156,finance,finance-retirement,3 pieces of dated retirement advice you should...,Cookie-cutter rules aren't going to cut it for...,24.0,0.03
1024,N64543,finance,finance-savemoney,9 signs you've shifted from frugal to cheap,Your frugal ways used to be endearing to your ...,24.0,0.03
540,N42698,finance,personalfinance,Most Americans say they are struggling financi...,"Income is not keeping pace with expenses, expe...",22.0,0.02



# Content based: Article similarity 
Given what the user just read, you would like to give the user a 'You may like to read'

In [87]:
news_data['merge'] = np.nan

### Combine all the relevant text of the news

In [107]:
# === 
# Function to merge subcategory title and abstract into one column for cosine similarity 
# Input: string news_id
# Output: string merged text
# ===

def merge_feature(news_id):
    subCategory = news_data['SubCategory'][news_data['News_id']==news_id]
    title = news_data['Title'][news_data['News_id']==news_id]
    abstract = news_data['Abstract'][news_data['News_id']==news_id]
    merged_value = category + subCategory + title + abstract
    merged_value = ''.join(map(str, merged_value))
    return(merged_value)



### We get runtime error because the dataset is too big, so we subset it beforehand based on category.
### For the real recommender system we would not subset the dataset. Instead, we would use the whole data

In [95]:
# ===
# Function to get news title based on index 
# Input: numeric index , dataframe data
# Output string news title 
# ===
def get_title_from_index(index, data):
    return data[data.Index_id == index]["Title"].values[0]

In [98]:
# === 
# Performing cosine similarity based on news 
# FIrst, we create a copy of the dataframe because we don't want to modify the original dataframe
# With the news id, we get the corrsponding category. This is used to subset the dataframe because otherwise it is too large
#  With the modified dataframe we perform cosine similarity
# Then we check which articles are the closest to the current news article
# The top 10 are chosen 
# Input: String news id, dataframe news data 
# Output: print title news article 
# === 

def get_similar_news(index_id, news_data):
    
    # Create a copy of the dataframe
    data = news_data.copy()
    
    # Get category
    category = data.Category[data.News_id==index_id].tolist()[0]
    
    # Subset dataframe
    data = data[data.Category == category]
    
    # Merge the columns to get one column used for cosine similarity
    data['merge'] = data['News_id'].apply(merge_feature)
    
    # Reset index, needed to search for article titles 
    data = data.reset_index()
    data['Index_id'] = data.index

    # Cosine similarity 
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(data["merge"])
    cosine_sim = cosine_similarity(count_matrix)
    
    # Get similar news based on the current article 
    similar_news = list(enumerate(cosine_sim[data['Index_id'][data['News_id']==news_id].tolist()[0]]))
    sorted_similar_news = sorted(similar_news, key=lambda x:x[1], reverse=True)
    i=0
    for news in sorted_similar_news:
        if i ==0:
            print('Current news article: ' + get_title_from_index(news[0], data))
            print(' ')
        else: 
            print('Recommended news article: ' + get_title_from_index(news[0], data))
        i=i+1
        if i>10:
            break

In [104]:
# Arbitrary news id 
news_id = 'N47572'

# Perform article similarity 
get_similar_news(news_id, news_data)

Current news article: A Trump Tax Break To Help The Poor Went To a Rich GOP Donor's Superyacht Marina
 
Recommended news article: Falling investment revives attacks against Trump's tax cuts
Recommended news article: Trump Vowed to Shrink the Trade Gap. It Keeps Growing.
Recommended news article: New Jersey tax credits turned a Camden office complex into a lucrative investment. Now the feds and state AG are investigating.
Recommended news article: 22 Smart Tax Moves to Make Before the End of the Year
Recommended news article: Couple loses $15,000 to Facebook Messenger scam
Recommended news article: Bill Gates: I'm happy to pay $20 billion in taxes, but Warren's plan would make me do a little math
Recommended news article: Pennsylvania Turnpike to be cashless by Fall 2021
Recommended news article: Airlines are scrambling to work out how to tell passengers they're flying on a 737 Max when it finally returns to service, and avoid the 'chaos' it could cause if they get it wrong
Recommended 