# Data collection activities
All data collection activities are automated using user-defined functions, which can be found in the `scripts` folder.

In [21]:
from bs4 import BeautifulSoup
import requests as requests
from scripts import user, item, discussion, link, poll
import csv
from tqdm import tqdm 

# Item scraping
The following code saves item data into CSV files (one per item type), given a set of item codes chosen by the operator.

First of all, we need to initialize all the files used for data collection.

## Initialization of csv files

In [22]:
# Discussion items
# Destination file and header row for scraped discussion items.
#
# The header lists exactly the six fields that the scraping loop writes for a
# discussion, in the same order. Discussion rows carry no "Main link" value
# (only link items do), so that column is not part of this header, and every
# column name is its own list element (a missing comma previously fused
# "Body links" with "Sats received by comments").
file_path_discussion = "../data/discussion.csv"
row_head_discussion = ["Title",
                       "Item code",
                       "Banner data",
                       "Body links",
                       "Sats received by comments",
                       "Comments item code",
                       ]

# Mode 'w' (re)creates the file so it contains only the header row;
# newline="" lets the csv module control line endings itself.
with open(file_path_discussion, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_discussion)

In [23]:
# Link items
# Destination file and column names for scraped link items.
file_path_link = "../data/link.csv"
row_head_link = [
    "Title",
    "Item code",
    "Banner data",
    "Main link",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# (Re)create the file so that it holds nothing but the header row.
with open(file_path_link, mode='w', encoding='utf_8_sig', newline="") as link_csv:
    csv.writer(link_csv).writerow(row_head_link)


In [24]:
# Poll items
# Where poll items are stored, and the names of the stored columns.
file_path_poll = "../data/poll.csv"
row_head_poll = [
    "Title", "Item code", "Banner data",
    "Body links", "Sats received by comments", "Comments item code",
]

# Opening in write mode starts the file from scratch; the header is its first row.
with open(file_path_poll, mode='w', encoding='utf_8_sig', newline="") as poll_csv:
    header_writer = csv.writer(poll_csv)
    header_writer.writerow(row_head_poll)


In [25]:
# Scrape a random sample of item codes rather than a contiguous
# ('progressive') run of them.
from random import sample, seed

ITEM_CODE_RANGE = range(1, 200000)  # candidate item codes: 1 .. 199999
ITEM_SAMPLE_SIZE = 200              # number of distinct item codes to scrape
ITEM_SAMPLE_SEED = 42               # set to None to draw a different sample on every run

# Seeding makes "Restart & Run All" scrape the same items again.
seed(ITEM_SAMPLE_SEED)

# sample() draws without replacement and accepts a range directly, so there is
# no need to materialize all candidate codes in a list first.
sampled_items = sample(ITEM_CODE_RANGE, ITEM_SAMPLE_SIZE)


In [27]:
# Print the list of item codes that the scraping loop will iterate over.
print(sampled_items)

[3323, 3922, 40741, 78461, 63606, 34365, 44780, 132684, 46573, 77624, 130170, 31163, 166775, 90743, 80430, 67451, 25163, 42903, 79966, 122948, 50889, 174903, 44410, 60950, 74737, 111386, 8655, 105190, 102749, 65171, 141540, 196137, 186027, 181669, 78123, 157280, 156053, 60103, 56452, 130046, 170498, 192510, 86090, 145460, 23479, 187038, 126282, 33828, 82363, 87959, 156753, 32679, 70911, 94635, 45587, 141806, 41771, 75153, 26866, 181866, 178167, 123932, 134729, 186077, 21381, 174042, 73868, 35957, 158461, 165152, 28422, 13920, 33206, 35664, 1820, 158940, 139557, 141973, 60715, 55352, 142898, 42482, 199530, 15369, 136309, 188265, 77057, 98456, 93262, 187357, 144555, 25451, 143357, 101862, 96320, 152721, 48785, 12235, 125130, 27288, 61873, 117003, 198338, 48191, 77362, 139033, 44729, 98135, 130035, 60578, 125489, 29299, 174540, 90391, 99674, 13430, 37859, 175721, 15049, 140446, 113276, 12066, 47999, 111370, 178301, 99918, 32648, 132993, 113088, 73064, 153262, 57362, 6802, 172771, 175071, 

In [28]:
# Scrape every sampled item and append it as one row to the csv file that
# matches its item type (discussion, link or poll).

REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get() may wait forever

# item type -> (module with the extract_* functions,
#               destination csv file,
#               whether the row contains a "Main link" field)
ITEM_TARGETS = {
    'discussion': (discussion, file_path_discussion, False),
    'link': (link, file_path_link, True),
    'poll': (poll, file_path_poll, False),
}


def _build_entry(extractor, item_code, soup, with_main_link):
    """Build the csv row for one item.

    Parameters
    ----------
    extractor : module
        One of the `discussion`, `link` or `poll` modules; its `extract_*`
        functions are called on `soup`.
    item_code : int
        Item code of the page; stored in the row as a string.
    soup : bs4.BeautifulSoup
        Parsed item page.
    with_main_link : bool
        When true, `extractor.extract_link(soup)` is inserted after the
        banner data (used for link items).

    Returns
    -------
    list
        Title, item code, banner data, [main link], body links,
        sats received by comments, comments item code.
    """
    entry = [extractor.extract_title(soup),
             str(item_code),
             extractor.extract_banner(soup)]
    if with_main_link:
        entry.append(extractor.extract_link(soup))
    entry.extend([extractor.extract_body_links(soup),
                  extractor.extract_comment_stacked(soup),
                  extractor.extract_comment_item_code(soup)])
    return entry


# (item code, reason) for every item that did not end up in a csv file
skipped_items = []

for i in tqdm(sampled_items):
    try:
        # Download the item page and parse it into a bs4.BeautifulSoup object
        url_posts = f'https://stacker.news/items/{i}'
        response = requests.get(url_posts, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Detect the item type once and look up how to handle it
        item_type = item.detect_item_type(i, soup)
        target = ITEM_TARGETS.get(item_type)
        if target is None:
            skipped_items.append((i, f'unhandled item type: {item_type!r}'))
            continue

        extractor, csv_path, with_main_link = target
        entry = _build_entry(extractor, i, soup, with_main_link)
    except Exception as exc:
        # Best effort: a failing item must not stop the whole run. Catching
        # Exception (not a bare except) keeps the loop interruptible with
        # Ctrl-C / "Interrupt kernel", and the reason is recorded instead of
        # being silently dropped.
        skipped_items.append((i, f'{type(exc).__name__}: {exc}'))
        continue

    # Append the new item to the csv file of its type
    try:
        with open(csv_path, 'a', encoding='utf_8_sig', newline="") as csvfile:
            csv.writer(csvfile).writerow(entry)
    except Exception as exc:
        # tqdm.write prints without breaking the progress bar
        tqdm.write(f'Error while processing data for item {i}: {exc!r}')
        skipped_items.append((i, f'{type(exc).__name__}: {exc}'))

print(f'{len(skipped_items)} of {len(sampled_items)} items were not saved '
      '(details in skipped_items)')


100%|██████████| 200/200 [02:24<00:00,  1.38it/s]
