# Create User Profile 
This script is to crawl specific websites to create a profile for our test user.  
We will use beautiful soup to parse some websites & store their texts as json.  
We are careful to include specific fields
* title
* url
* html

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
links = [
    'https://towardsdatascience.com/playing-connect-4-with-deep-q-learning-76271ed663ca', 
    'https://towardsdatascience.com/teach-your-ai-how-to-walk-5ad55fce8bca', 
    'https://www.freecodecamp.org/news/an-introduction-to-deep-q-learning-lets-play-doom-54d02d8017d8/', 
    'https://towardsdatascience.com/reinforcement-learning-demystified-markov-decision-processes-part-1-bf00dda41690'
]

In [3]:
page = requests.get(links[0])

In [4]:
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
soup.getText()

"Playing Connect 4 with Deep Q-Learning | by Lee Schmalz | Towards Data ScienceGet startedOpen in appSign inGet startedFollow579K Followers·Editors' PicksFeaturesDeep DivesGrowContributeAboutGet startedOpen in appPlaying Connect 4 with Deep Q-LearningExploring the power of Reinforcement Learning through a well-known game environmentLee SchmalzJun 24, 2020·11 min readCredit: allthefreestock.comDeep Q-Learning may be one of the most important algorithms in all of Reinforcement Learning as it lacks limitation on the observations it can make and the actions it can take within complex environments. This method of Reinforcement Learning incorporates deep neural networks in a way that allows an agent to ‘play’ an environment repeatedly and learn the environment over time through a system of observations, actions, and rewards. This structure has obvious benefits over a standard deep neural network implementation as it allows the agent to interact with its surroundings, receive feedback from it

In [6]:
def parse_links(links):
    texts = []
    for link in links: 
        page = requests.get(link)
        soup = BeautifulSoup(page.content, 'html.parser')
        texts.append(soup.getText())
    return texts

In [7]:
content = parse_links(links)

# Generate large user profile 

In [8]:
def generate_profile(content_list): 
    # Easy method: concatenation 
    return ' '.join(content_list)

In [9]:
user_profile = generate_profile(content)

# Load in TFIDF Model

In [10]:
from gensim.models import TfidfModel,LdaModel
from gensim.utils import simple_preprocess
from gensim import corpora, similarities



In [11]:
tfidf_model = TfidfModel.load('./models/tfidf-sample10000')
dictionary = corpora.Dictionary.load('./models/dictionary/sample10000Dict')
# Load corpus
loaded_corpus = corpora.MmCorpus('./models/sample10000corpus.mm')

## Preprocess User Profile

In [12]:
user_text = simple_preprocess(user_profile)
user_tfidf_vec = tfidf_model[dictionary.doc2bow(user_text)]

In [13]:
# This should be done at start -> Can take a while
index = similarities.Similarity(None, tfidf_model[loaded_corpus], len(dictionary))

In [14]:
# This also takes a second. The consine similarity is done here
res = index[user_tfidf_vec]

In [15]:
# Print top 10 documents 
print(list(sorted(enumerate(res), key=lambda x: x[1], reverse=True))[:10])

[(5321, 0.31066835), (555, 0.29828316), (2193, 0.29372314), (1981, 0.28961307), (3182, 0.25882074), (9372, 0.23132288), (5581, 0.22526635), (9142, 0.22428486), (5046, 0.22016639), (4975, 0.21627207)]


In [16]:
loaded_corpus.num_docs

10000

In [17]:
for article_idx, score in sorted(enumerate(res), key=lambda x: -x[1])[:10]: 
    print(article_idx, score)

5321 0.31066835
555 0.29828316
2193 0.29372314
1981 0.28961307
3182 0.25882074
9372 0.23132288
5581 0.22526635
9142 0.22428486
5046 0.22016639
4975 0.21627207


In [18]:
def get_top_n(n, index, user_tfidf_vec): 
    """Gets the index of the top n documents similar to the user profile"""
    res = index[user_tfidf_vec]
    top_ids = []
    top_scores = []
    for article_idx, score in sorted(enumerate(res), key=lambda x: -x[1])[:n]: 
        top_ids.append(article_idx)
        top_scores.append(score)
    return top_ids, top_scores
    

In [19]:
top_ids, top_scores = get_top_n(5, index, user_tfidf_vec)

In [20]:
top_ids

[5321, 555, 2193, 1981, 3182]

# Get the Top 5 Docs

In [21]:
import itertools
import ijson

In [22]:
# Load in Dataset 
class MyCorpusJSON: 
    def __init__(self, json_link, column): 
        # idx is the index of the row where the text content is
        self.json_link = json_link 
        self.text_column = column
        self.count = 0
    
    def __len__(self): 
        return self.count
    
    def get_nth(self, n): 
        return next(itertools.islice(self.generator(), n, None))
    
    def generator(self): 
        with open(self.json_link) as json_file: 
            parser = ijson.items(json_file, 'item')
            for obj in parser:
                yield obj
                
    def __iter__(self):  
        with open(self.json_link) as json_file: 
            parser = ijson.items(json_file, 'item')
            for obj in parser:
                self.count += 1
                yield obj[self.text_column]

In [23]:
link = './data/samples/sample10000.json'
text_column = 'html_text'
mycorpus = MyCorpusJSON(link, text_column)

In [45]:
mycorpus.get_nth(3622)

{'url': 'https://medium.com/machine-learning-for-humans/reinforcement-learning-6eacf258b265',
 'title': 'Machine Learning for Humans, Part 5: Reinforcement Learning',
 'author': {'name': None,
  'url': 'https://medium.com/@v_maini',
  'twitter': '@v_maini'},
 'image_url': 'https://cdn-images-1.medium.com/max/1200/1*jne9wcY21o_e_ztLyQSPSw.png',
 'html_text': 'Vishal MainiResearch comms @DeepMindAI. Previously @Upstart, @Yale, @TrueVenturesTEC.Aug 19, 2017Machine Learning for Humans, Part 5: Reinforcement LearningExploration and exploitation. Markov decision processes. Q-learning, policy learning, and deep reinforcement learning.[Update 9/1/17] This series is now available as a full-length e-book! Download here.“I just ate some chocolate for finishing the last section.”In supervised learning, training data comes with an answer key from some godlike “supervisor”. If only life worked that way!In reinforcement learning (RL) there’s no answer key, but your reinforcement learning agent still 

In [24]:
def ids2articles(ids, corpus): 
    """Returns article objects, given a list of ids"""
    article_objects = []
    for article_id in ids: 
        article_objects.append(corpus.get_nth(article_id))
    return article_objects

In [25]:
# This takes a pretty long time. Can save time by using a db for sure. 
articles = ids2articles(top_ids, mycorpus)

In [26]:
for article, score in zip(articles, top_scores): 
    print(article['url'])
    print(article['title'])
    print(score)
    print('*'*30)

https://hackernoon.com/rational-agents-for-artificial-intelligence-caf94af2cec5
Rational Agents for Artificial Intelligence – Hacker Noon
0.31066835
******************************
https://hackernoon.com/mit-6-s094-deep-learning-for-self-driving-cars-2018-lecture-3-notes-deep-reinforcement-learning-fe9a8592e14a
MIT 6.S094: Deep Learning for Self-Driving Cars 2018 Lecture 3 Notes: Deep Reinforcement Learning
0.29828316
******************************
https://medium.com/nectec/reinforcement-learning-43ea03c2e00e
Reinforcement Learning – NECTEC – Medium
0.29372314
******************************
https://medium.com/machine-learning-for-humans/neural-networks-deep-learning-cdad8aeae49b
Machine Learning for Humans, Part 4: Neural Networks & Deep Learning
0.28961307
******************************
https://hackernoon.com/nathan-ai-newsletter-issue-21-part-2-2-e6e0e7ab3ce
Nathan.ai newsletter Issue #21 — Part 2/2 – Hacker Noon
0.25882074
******************************


# Recommend With LDA

In [27]:
lda_model = LdaModel.load('./models/lda-sample10000')

In [28]:
# Process User text 
lda_vec = lda_model[dictionary.doc2bow(user_text)]

In [29]:
# Use LDA for similarity index. Also takes forever to load in
lda_index = similarities.Similarity(None, lda_model[loaded_corpus], len(dictionary))

array([0.77258664, 0.8379322 , 0.53435254, ..., 0.29647186, 0.8981663 ,
       0.7700697 ], dtype=float32)

In [30]:
lda_top_ids, lda_top_scores = get_top_n(5, lda_index, lda_vec)

In [31]:
lda_articles = ids2articles(top_ids, mycorpus)

In [32]:
for article, score in zip(lda_articles, lda_top_scores): 
    print(article['url'])
    print(article['title'])
    print(score)
    print('*'*30)

https://hackernoon.com/rational-agents-for-artificial-intelligence-caf94af2cec5
Rational Agents for Artificial Intelligence – Hacker Noon
0.97150904
******************************
https://hackernoon.com/mit-6-s094-deep-learning-for-self-driving-cars-2018-lecture-3-notes-deep-reinforcement-learning-fe9a8592e14a
MIT 6.S094: Deep Learning for Self-Driving Cars 2018 Lecture 3 Notes: Deep Reinforcement Learning
0.960182
******************************
https://medium.com/nectec/reinforcement-learning-43ea03c2e00e
Reinforcement Learning – NECTEC – Medium
0.94829416
******************************
https://medium.com/machine-learning-for-humans/neural-networks-deep-learning-cdad8aeae49b
Machine Learning for Humans, Part 4: Neural Networks & Deep Learning
0.9453678
******************************
https://hackernoon.com/nathan-ai-newsletter-issue-21-part-2-2-e6e0e7ab3ce
Nathan.ai newsletter Issue #21 — Part 2/2 – Hacker Noon
0.9409206
******************************


# Results

Although the scorin metrics may be different, the LDA and TFIDF models both return the same top 5 articles for this dataset. 
This dataset contained 5000 articles sampled randomly from the original dataset of ~65000 medium articles

# Test User Details

email: doe@example.com
password: asdf

classes: 

In [34]:
{'code': 'INSY404', 'title': "Introduction to Object Oriented Design"},
{'code': 'COSC418', 'title': "Introduction to Analog Computing"},
{'code': 'COSC408', 'title': "Modelling and Simulations"},
{'code': 'INSY402', 'title': "Internet Technologies"}
{'code': 'COSC423', 'title': "Artificial Intelligence"}
{'code': 'COSC417', 'title': "Database Design and Management"}
{'code': 'COSC430', 'title': "Hands-on Java Training"},


{'code': 'INSY402', 'title': 'Internet Technologies'}