In [6]:
import pandas as pd, numpy as np
import requests

import bs4

import json

In [12]:
valid_topic_ids = ['891-317', '109-185']

def _scrape_articles_urls_from_topic(topic_id: str, year: int, month: int):
    if year < 2000:
        year = 2000 + year if year < 30 else 1900 + year
    if month < 10:
        month = '0' + str(month)
    url = f'https://www.ynet.co.il/home/0,7340,L-4269-{topic_id}-{year}{month}-1,00.html'
    html = bs4.BeautifulSoup(requests.get(url).content)
    html = html.find_all(attrs={'class': 'ghciArticleIndex1'})[0].find_all('table')[1]

    articles_htmls = html.find_all(attrs={'class': 'smallheader'})
    articles_urls = ['https://www.ynet.co.il' + article.get('href')
                    for article in articles_htmls]
    return articles_urls

def scrape_articles():
    articles_urls = []
    for topic_id in valid_topic_ids:
        for year in range(2000,2021 + 1):
            for month in range(1,12 + 1):
                try:
                    urls = _scrape_articles_urls_from_topic(topic_id, year, month)
                    assert type(urls) == list
                    articles_urls += urls
                except:
                    print(topic_id, year, month, 'failed')
                    pass
    return list(set(articles_urls))

In [13]:
def save_list(l):
    with open('articles_urls.txt', 'w') as f:
        for item in l:
            f.write("%s\n" % item)
def read_list(path = 'articles_urls.txt'):
    with open(path, 'r+') as f:
        lines = f.readlines()
    lines = pd.Series(lines).apply(lambda s: s.strip('\n'))
    return lines

In [14]:
articles_urls = read_list()

In [15]:
articles_urls.head().apply(print)
pass

https://www.ynet.co.il/articles/0,7340,L-5439025,00.html
https://www.ynet.co.il/articles/0,7340,L-1401169,00.html
https://www.ynet.co.il/articles/0,7340,L-4633999,00.html
https://www.ynet.co.il/news/article/r1B28Y00H00
https://www.ynet.co.il/articles/0,7340,L-4292071,00.html


In [19]:
def get_article(article_url: str):
    try:
        print(article_url)
        if article_url.startswith('https'):
            article_url = article_url.replace('https', 'http')
        html = bs4.BeautifulSoup(requests.post(article_url, timeout=(100.05, 1000)).content)
        ld_json = html.find_all(attrs={'type' : 'application/ld+json'})[0]
        ld_json = json.loads([l for l in ld_json.children][0])
        main_title = ld_json['headline']
        sub_title = ld_json['description']
        content = ld_json['articleBody']

        # main_title = html.find_all(attrs={'class' : 'mainTitle'})[0].text.strip('"')
        # sub_title = html.find_all(attrs={'class' : 'subTitle'})[0].text.strip('"')
        # content = html.find_all(attrs={'class' : 'public-DraftEditor-content'})[0].text.strip('"')
        return pd.Series({
            'main_title' : main_title, 
            'sub_title' : sub_title, 
            'content' : content
        })
    except:
        print(f'{article_url} failed')
        return None

In [20]:
from multiprocessing import Pool

def multiprocess(l, f, n_jobs = -1, as_series: bool = False):
    with Pool(n_jobs) as p:
        output = p.map(f, l)
    if as_series:
        return pd.Series(output)
    else:
        return output

In [24]:
N_ARTICLES = 100
data = pd.DataFrame(multiprocess(
    np.random.choice(articles_urls, N_ARTICLES),
    get_article,
    n_jobs = 5
))

https://www.ynet.co.il/articles/0,7340,L-3122873,00.html
https://www.ynet.co.il/articles/0,7340,L-2696876,00.html
https://www.ynet.co.il/articles/0,7340,L-2984417,00.html
https://www.ynet.co.il/articles/0,7340,L-2810080,00.html
https://www.ynet.co.il/articles/0,7340,L-3932531,00.html
http://www.ynet.co.il/articles/0,7340,L-3122873,00.html failed
http://www.ynet.co.il/articles/0,7340,L-2984417,00.html failed
http://www.ynet.co.il/articles/0,7340,L-2696876,00.html failed
https://www.ynet.co.il/articles/0,7340,L-2137893,00.html
https://www.ynet.co.il/articles/0,7340,L-3108554,00.html
https://www.ynet.co.il/articles/0,7340,L-3432197,00.html


KeyboardInterrupt: ignored

In [23]:
data

0                                                  None
1     main_title                         השר ישראל כ...
2     main_title               השר אריאל: נפחית התמי...
3                                                  None
4     main_title       בקואליציה בוחנים: פתיחת התאגי...
                            ...                        
95    main_title                  השר לפיד: "הממשלה ...
96    main_title          יחימוביץ' על איראן: לא צרי...
97    main_title              מכתבי הפיטורים: "ככה פ...
98                                                 None
99    main_title       "הנהג התעסק בקופסה השחורה כשג...
Length: 100, dtype: object

In [None]:
content = data['sub_title']
titles = data['main_title']

In [None]:
X = content.str.split()
y = titles.str.split()
X.sample()

2    [פרסום, ראשון:, הוועדה, למינוי, בכירים, אישרה,...
Name: sub_title, dtype: object

In [None]:
data.apply(lambda r: [(r['content'].split().index(word) if word in r['content'].split() else -1) for word in r['sub_title'].split()], axis = 1)

0    [259, 11, 261, 15, 16, -1, 3, 4, 23, 24, 25, 2...
1    [-1, -1, 169, 19, -1, 40, 42, 43, -1, 16, -1, ...
2    [-1, -1, 195, 9, 19, 22, 23, 26, 27, 28, 29, 3...
3    [-1, 375, 376, 16, 264, -1, -1, 61, 0, 1, -1, ...
4    [5, 6, -1, 127, 128, -1, 12, -1, -1, 162, -1, ...
dtype: object